xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/morton_msb_radix_bitonic_sort_shared.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7 
8 //
9 //   This file contains structure definitions shared by GRL OCL kernels and host code
10 //
11 
12 #pragma once
13 
14 #include "GRLGen12.h"
15 
16 // NOTE:
17 // MSB(Most significant byte) - here I refer to it as a part of sorting that does MSB Radix sort, which can spawn additional work
18 // BLS(Bottom level sort) - here I refer to it as a last part of sorting a particular range(currently Bitonic), which cannot spawn additional work
19 //
20 
21 #define MSB_RADIX_NUM_BINS    256
22 #define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration
23 #define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here
24 
25 #define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? More means more MSB processed in parallel but more memory used
26 
27 #define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here,
28 // since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these
29 
30 #define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context
31 
32 #define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS,
33 // so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS
34 
35 #define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup.
36                                            // If a single MSB entry needs more, then it will spawn more WGs
37                                            // after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num
38 
39 #define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 best value? ON skl gives best performance
40 // Right now we use 256 workitems in simd16 which give us 16 hw threads, assuming 2KB per thread, we have 32KB SLM to play with.
41 // Since we use ulong(8bytes) we can store 4096 elements
42 // This also tells us that if number of elements to sort is less than this, we don't need to allocate scheduler
43 // Need to keep in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD
44 
45 #define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount till which we'll merge small BLS'es produced by MSB into a single bigger BLS
46 
47 GRL_NAMESPACE_BEGIN(GRL)
48 
49 
50 
51 
52 GRL_NAMESPACE_BEGIN(RTAS)
53 GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT)
54 
55 struct MSBStackEntry
56 {
57     uint start_offset;
58     uint count;
59     uint iteration;
60 };
61 
62 struct MSBStack
63 {
64     dword num_entries;
65     struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM];
66 };
67 
68 struct MSBRadixContext
69 {
70     uint start[MSB_RADIX_NUM_BINS];
71     uint count[MSB_RADIX_NUM_BINS];
72     uint num_wgs_in_flight; // this is used to identify which msb wg is last
73     uint num_keys; // number of keys to process
74     uint iteration;
75     ulong* keys_in;
76     ulong* keys_out;
77 
78     uint start_offset; //offset from the beginning of the buffer
79 };
80 
81 struct MSBDispatchRecord
82 {
83     uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record
84 };
85 
86 struct MSBDispatchQueue
87 {
88     dword num_records;
89     struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record
90 };
91 
92 // BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks
93 struct BLSDispatchRecord
94 {
95     uint start_offset; // offset from the beginning of the buffer
96     uint count;
97     ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer
98 };
99 
100 struct BLSDispatchQueue
101 {
102     dword num_records;
103     struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS];
104 };
105 
106 struct VContextScheduler
107 {
108     /////////////////////////////////////////////////////////////
109     //  State data used for communication with command streamer
110     //  NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
111     /////////////////////////////////////////////////////////////
112 
113     dword num_wgs_msb; // number of MSB workgroups being processed by current iteration
114     dword num_wgs_bls; // number of BLS workgroups being processed by current iteration
115 
116     dword scheduler_postsync;
117     dword _pad1;
118 
119     /////////////////////////////////////////////////////////////
120 
121     struct MSBDispatchQueue msb_queue;
122     struct BLSDispatchQueue bls_queue0;
123     struct BLSDispatchQueue bls_queue1;
124 
125     struct BLSDispatchQueue* curr_bls_queue;
126     struct BLSDispatchQueue* next_bls_queue;
127 
128     struct MSBStack msb_stack;
129 
130     struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS];
131 };
132 
133 GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT)
134 GRL_NAMESPACE_END(RTAS)
135 GRL_NAMESPACE_END(GRL)
136