xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/grl_structs.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /**
25  * This file contains a redefinition of structures defined in the GRL library.
26  * We need to have those structures defined to allocate & prepare data for
27  * the OpenCL kernels building acceleration structures. Unfortunately because
28  * of C++ & OpenCL assumptions in GRL, it's not possible to just include GRL
29  * header files directly so we have to redefine stuff here.
30  */
31 
32 #ifndef GRL_STRUCTS_H
33 #define GRL_STRUCTS_H
34 
35 #include "GRLStructs.h"
36 #include "GRLRTASCommon.h"
37 
38 struct MKBuilderState {
39    qword geomDesc_buffer;
40    qword build_primref_buffer;
41    qword build_globals;
42    qword bvh_buffer;
43    dword leaf_type;
44    dword leaf_size;
45 };
46 
/*
 * Expands to a `struct <prefix>_MKBuilderState` compound literal whose
 * members are copied one-for-one from `obj`.
 *
 * `obj` is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_STATE(prefix, obj)                                 \
   (struct prefix##_MKBuilderState) {                                \
      .geomDesc_buffer      = (obj).geomDesc_buffer,                 \
      .build_primref_buffer = (obj).build_primref_buffer,            \
      .build_globals        = (obj).build_globals,                   \
      .bvh_buffer           = (obj).bvh_buffer,                      \
      .leaf_type            = (obj).leaf_type,                       \
      .leaf_size            = (obj).leaf_size,                       \
   }
56 
57 struct MKSizeEstimate {
58    dword numTriangles;
59    dword numProcedurals;
60    dword numPrimitives;
61    dword numMeshes;
62    dword numBuildPrimitives;
63    dword numPrimitivesToSplit;
64    dword instance_descs_start;
65    dword geo_meta_data_start;
66    dword node_data_start;
67    dword leaf_data_start;
68    dword procedural_data_start;
69    dword back_pointer_start;
70    dword sizeTotal;
71    dword updateScratchSizeTotal;
72    dword fatleaf_table_start;
73    dword innernode_table_start;
74    dword max_fatleaves;
75 
76    size_t max_instance_leafs;
77    size_t max_inner_nodes;
78    size_t leaf_data_size;
79    size_t min_primitives;
80    size_t max_primitives;
81 };
82 
/*
 * Expands to a `struct <prefix>_MKSizeEstimate` compound literal filled from
 * the same-named dword members of `obj`.  Only the seventeen members listed
 * here are copied.
 *
 * `obj` is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_SIZE(prefix, obj)                                  \
   (struct prefix##_MKSizeEstimate) {                                \
      .numTriangles           = (obj).numTriangles,                  \
      .numProcedurals         = (obj).numProcedurals,                \
      .numPrimitives          = (obj).numPrimitives,                 \
      .numMeshes              = (obj).numMeshes,                     \
      .numBuildPrimitives     = (obj).numBuildPrimitives,            \
      .numPrimitivesToSplit   = (obj).numPrimitivesToSplit,          \
      .instance_descs_start   = (obj).instance_descs_start,          \
      .geo_meta_data_start    = (obj).geo_meta_data_start,           \
      .node_data_start        = (obj).node_data_start,               \
      .leaf_data_start        = (obj).leaf_data_start,               \
      .procedural_data_start  = (obj).procedural_data_start,         \
      .back_pointer_start     = (obj).back_pointer_start,            \
      .sizeTotal              = (obj).sizeTotal,                     \
      .updateScratchSizeTotal = (obj).updateScratchSizeTotal,        \
      .fatleaf_table_start    = (obj).fatleaf_table_start,           \
      .innernode_table_start  = (obj).innernode_table_start,         \
      .max_fatleaves          = (obj).max_fatleaves,                 \
   }
103 
/* Bounding box stored as two 4-float arrays: `lower` first, then `upper`. */
typedef struct AABB {
   float lower[4], upper[4];
} AABB;
108 
109 struct Globals
110 {
111    struct AABB centroidBounds;
112 
113    unsigned int build_record_start;
114    unsigned int numPrimitives;
115    unsigned int leafPrimType;
116    unsigned int leafSize;
117 
118    unsigned int numSplittedPrimitives;
119    unsigned int numBuildRecords;
120 
121    // spatial split sate
122    unsigned int numOriginalPrimitives;
123    float presplitPrioritySum;
124    float probThreshold;
125 
126    // binned-sah bfs state
127    unsigned int counter;
128    unsigned int numBuildRecords_extended;
129 
130    // sync variable used for global-sync on work groups
131    unsigned int sync;
132 
133 
134    /* morton code builder state */
135    unsigned int shift;      // used by adaptive mc-builder
136    unsigned int shift_mask; // used by adaptive mc-builder
137    unsigned int binary_hierarchy_root;
138    unsigned int p0_allocated_num;
139    unsigned int p0_created_num;
140    unsigned int morton_sort_in_flight;
141    unsigned int sort_iterations;
142 
143    gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy.  Stashed here as a debug aid
144 };
145 
146 typedef struct BVHBase
147 {
148    // TODO:  Implement the "copy-first-node" trick... duplicate root node here
149 
150    uint64_t rootNodeOffset;
151 
152    uint32_t reserved;
153 
154    uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
155    uint32_t quadLeafStart;
156    uint32_t quadLeafCur;
157    uint32_t proceduralDataStart;
158    uint32_t proceduralDataCur;
159    uint32_t instanceLeafStart;
160    uint32_t instanceLeafEnd;
161    uint32_t backPointerDataStart;     //
162    uint32_t refitTreeletsDataStart;   // refit structs
163    uint32_t refitStartPointDataStart; //
164    uint32_t BVHDataEnd;
165 
166    // number of bottom treelets
167    // if 1, then the bottom treelet is also tip treelet
168    uint32_t refitTreeletCnt;
169    uint32_t refitTreeletCnt2; // always 0, used for atomic updates
170    // data layout:
171    // @backPointerDataStart
172    //  'backpointer' - a dword per inner node.
173    //  The bits are used as follows:
174    //     2:0  --> Used as a refit counter during BVH refitting.  MBZ
175    //     5:3  --> Number of children
176    //     31:6 --> Index of the parent node in the internal node array
177    //    The root node has a parent index of all ones
178    // @refitTreeletsDataStart
179    //  RefitTreelet[], the last treelet is for top treelet all previous are for bottom
180    // @refitStartPointDataStart
181    //  for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
182    // @backPointerDataEnd
183 
184    uint32_t fatLeafCount;  // number of internal nodes which are "fat-leaves"
185    uint32_t innerCount;    // number of internal nodes which are true inner nodes (all internalNode children)
186    uint32_t fatLeafTableStart;
187    uint32_t innerTableStart;
188 
189    uint32_t _pad[12];
190 
191    struct RTASMetaData Meta;
192 } BVHBase;
193 
194 
195 struct BatchedInitGlobalsData
196 {
197    qword p_build_globals;
198    qword p_bvh_buffer;
199    dword numPrimitives;
200    dword numGeometries;
201    dword numInstances;
202    dword instance_descs_start;
203    dword geo_meta_data_start;
204    dword node_data_start;
205    dword leaf_data_start;
206    dword procedural_data_start;
207    dword back_pointer_start;
208    dword sizeTotal;
209    dword leafType;
210    dword leafSize;
211    dword fatleaf_table_start;
212    dword innernode_table_start;
213 };
214 
215 
/* Sizing constants used as array extents by the BFS builder structures. */
#define BFS_NUM_BINS      16
#define BFS_NUM_VCONTEXTS 256
#define BFS_MAX_DEPTH     32

/* Half the entry count of QNodeGlobalRootBuffer::entries. */
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384
221 
/*
 * Split record: one float `sah` value followed by the integer `dim` and
 * `pos` members.
 */
struct BFS_Split {
   float sah;
   int dim, pos;
};
228 
229 struct BFS_BinInfo
230 {
231    float min_max[18 * BFS_NUM_BINS]; //  layout: bins[axis][num_bins][6]
232    //          The 6 are lower(xyz) and -upper(xyz)
233    // bins use negated-max so that we can use vectorized mins instead of min/max pairs
234    uint counts[3 * BFS_NUM_BINS];
235 };
236 
237 struct SAHBuildGlobals
238 {
239    qword   p_primref_index_buffers;
240    qword   p_primrefs_buffer;
241    qword   p_bvh2;
242    qword   p_globals;     // TODO: deprecate this
243    qword   p_bvh_base;
244    gpuva_t p_qnode_root_buffer;
245 
246    dword flags; // bit 1 is 'alloc_backpointers'.  bit 2 is 'need_masks'
247    dword num_primrefs;
248    dword leaf_size;
249    dword leaf_type;
250 
251    dword root_buffer_num_produced;
252    dword root_buffer_num_produced_hi;
253    dword root_buffer_num_consumed;
254    dword root_buffer_num_consumed_hi;
255    dword root_buffer_num_to_consume;
256    dword root_buffer_num_to_consume_hi;
257 };
258 
259 typedef union LRBounds
260 {
261    struct
262    {
263       struct AABB3f left_centroid_bounds;
264       struct AABB3f left_geom_bounds;
265       struct AABB3f right_centroid_bounds;
266       struct AABB3f right_geom_bounds;
267    } boxes;
268    struct
269    {
270       float Array[24];
271    } scalars;
272 } LRBounds;
273 
274 
275 struct VContext
276 {
277    uint dispatch_primref_begin;    // range of primrefs for this task
278    uint dispatch_primref_end;
279    uint bvh2_root;                 // BVH2 root node for this task
280    uint tree_depth;                // depth of this node in the tree
281    uint num_left;          // primref counts
282    uint num_right;
283    uint lr_mask;      // lower 8b : left mask.  upper 8b : right mask
284    uint batch_index;
285 
286    // pass1 global working state and output
287    struct BFS_Split split;
288    struct BFS_BinInfo global_bin_info;
289 
290    // pass2 global working state and output
291    LRBounds lr_bounds;
292 };
293 
294 
295 
296 struct BFSDispatchRecord
297 {
298    ushort batch_index;
299    ushort context_id;
300 };
301 
302 
303 struct BFSDispatchQueue
304 {
305    uint num_dispatches;
306    uint wg_count[BFS_NUM_VCONTEXTS];
307    struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
308 };
309 
310 struct BFS1SpillStackEntry
311 {
312    uint primref_begin;
313    uint primref_end;
314    uint bvh2_root;
315    ushort tree_depth;
316    ushort batch_index;
317 };
318 
319 struct BFS1SpillStack
320 {
321    uint size;
322    struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
323 };
324 
325 struct QNodeGlobalRootBufferEntry
326 {
327    uint bvh2_node;
328    uint qnode;
329    uint build_idx;
330    uint _pad;
331 };
332 
333 struct QNodeGlobalRootBuffer
334 {
335    uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
336    struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
337 };
338 
339 struct DFSDispatchRecord
340 {
341    uint primref_base;
342    uint bvh2_base;
343    uint batch_index;
344    ushort num_primrefs;
345    ushort tree_depth;
346 };
347 
348 
349 struct DFSDispatchQueue
350 {
351    struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
352 };
353 
/*
 * VCONTEXT_STATE_* values.
 * NOTE(review): presumably the values stored in
 * VContextScheduler::vcontext_state[] -- confirm against the GRL kernels.
 */
#define VCONTEXT_STATE_EXECUTING    0
#define VCONTEXT_STATE_UNALLOCATED  1
356 
357 union SchedulerUnion
358 {
359    struct VContextScheduler
360    {
361       /////////////////////////////////////////////////////////////
362       //  State data used for communication with command streamer
363       //   NOTE: This part must match definition in 'new_sah_builder.grl'
364       /////////////////////////////////////////////////////////////
365 
366       dword num_bfs_wgs;
367       dword num_dfs_wgs;
368 
369       dword scheduler_postsync;
370       dword _pad1;
371 
372       dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
373       dword num_single_builds;  // number of single-wg builds (#primrefs < threshold)
374 
375       dword batched_build_wg_count;  // number of wgs to dispatch for initial BFS pass
376       dword batched_build_loop_mask; // value is 0 if  #builds <= #contexts.  else 1  command streamer uses this as a loop condition
377 
378       /////////////////////////////////////////////////////////////
379 
380       dword batched_build_count;  // number of batched builds in the SAHBuildGlobals buffer
381       dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
382 
383       dword vcontext_state[BFS_NUM_VCONTEXTS];
384 
385       struct BFSDispatchQueue bfs_queue;
386       struct DFSDispatchQueue dfs_queue;
387 
388       struct VContext contexts[BFS_NUM_VCONTEXTS];
389 
390       struct BFS1SpillStack bfs2_spill_stack;
391    } vContextScheduler;
392 
393    struct QnodeScheduler
394    {
395       dword num_qnode_grb_curr_entries;
396       dword num_qnode_grb_new_entries;
397 
398       dword scheduler_postsync;
399       dword _pad1;
400 
401       dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
402       dword num_single_builds;  // number of single-wg builds (#primrefs < threshold)
403 
404       dword batched_builds_to_process;
405       dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer
406 
407       /////////////////////////////////////////////////////////////
408 
409       dword batched_build_count;  // number of batched builds in the SAHBuildGlobals buffer
410       dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer
411 
412       struct QNodeGlobalRootBuffer qnode_global_root_buffer;
413    } qnodeScheduler;
414 };
415 
416 
417 struct BVH2Node
418 {
419    struct AABB3f box;
420    uint  meta_u;   // leaf:  primref start.  inner: offset from node to its first child
421    uint  meta_ss;
422    //ushort meta_s;   // leaf: primref count.  inner: offset from first to second child, in nodes
423    //uchar is_inner; //  1 if inner, 0 if leaf
424    //uchar mask;
425 };
426 
427 struct BVH2
428 {
429    uint num_nodes;
430    uint _pad[7];  // align to 32B
431 };
432 
433 struct BatchedBLSDispatchEntry
434 {
435    /////////////////////////////////////////////////////////////
436    //  State data used for communication with command streamer
437    //  NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
438    /////////////////////////////////////////////////////////////
439    qword p_data_buffer;
440    qword num_elements; // number of elements in p_data_buffer
441 };
442 
443 struct SAHBuildArgsBatchable
444 {
445    qword p_globals_ptrs;
446    qword p_scheduler;
447    qword p_buffers_info;
448    qword p_sah_globals;
449 
450    dword num_max_qnode_global_root_buffer_entries;
451    dword num_builds;
452 };
453 
/*
 * Expands to a `struct <prefix>_SAHBuildArgsBatchable` compound literal
 * whose members are copied one-for-one from `obj`.
 *
 * `obj` is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj)              \
   (struct prefix##_SAHBuildArgsBatchable) {                         \
      .p_globals_ptrs = (obj).p_globals_ptrs,                        \
      .p_scheduler    = (obj).p_scheduler,                           \
      .p_buffers_info = (obj).p_buffers_info,                        \
      .p_sah_globals  = (obj).p_sah_globals,                         \
      .num_max_qnode_global_root_buffer_entries =                    \
         (obj).num_max_qnode_global_root_buffer_entries,             \
      .num_builds     = (obj).num_builds,                            \
   }
464 
465 
466 struct SAHBuildBuffersInfo
467 {
468    gpuva_t p_globals;
469    gpuva_t p_primref_index_buffers;
470    gpuva_t p_primrefs_buffer;
471    gpuva_t p_bvh2;
472    gpuva_t p_bvh_base;
473    gpuva_t p_qnode_root_buffer;
474    dword   sah_globals_flags;
475    dword   _pad;
476    gpuva_t _pad2;
477 };
478 
479 #endif /* GRL_STRUCTS_H */
480