/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * This file contains a redefinition of structures defined in the GRL library.
 * We need to have those structures defined to allocate & prepare data for
 * the OpenCL kernels building acceleration structures. Unfortunately, because
 * of C++ & OpenCL assumptions in GRL, it's not possible to just include GRL
 * header files directly, so we have to redefine stuff here.
 *
 * NOTE(review): since these declarations mirror GRL definitions, member
 * order, member types and explicit padding presumably have to stay in sync
 * with the GRL side -- confirm before changing any layout in this file.
 */

#ifndef GRL_STRUCTS_H
#define GRL_STRUCTS_H

#include "GRLStructs.h"
#include "GRLRTASCommon.h"

/* Builder state: four qword buffer/global fields plus the leaf type and size. */
struct MKBuilderState {
   qword geomDesc_buffer;
   qword build_primref_buffer;
   qword build_globals;
   qword bvh_buffer;
   dword leaf_type;
   dword leaf_size;
};

/*
 * Expands to a compound literal of type struct <prefix>_MKBuilderState whose
 * members are copied one by one from the same-named members of obj.
 *
 * obj is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_STATE(prefix, obj)                            \
   (struct prefix##_MKBuilderState) {                           \
      .geomDesc_buffer = (obj).geomDesc_buffer,                 \
      .build_primref_buffer = (obj).build_primref_buffer,       \
      .build_globals = (obj).build_globals,                     \
      .bvh_buffer = (obj).bvh_buffer,                           \
      .leaf_type = (obj).leaf_type,                             \
      .leaf_size = (obj).leaf_size,                             \
   }

/*
 * Size estimate for a build: primitive counts, "*_start" values for the
 * individual data sections, and total sizes.
 */
struct MKSizeEstimate {
   dword numTriangles;
   dword numProcedurals;
   dword numPrimitives;
   dword numMeshes;
   dword numBuildPrimitives;
   dword numPrimitivesToSplit;
   dword instance_descs_start;
   dword geo_meta_data_start;
   dword node_data_start;
   dword leaf_data_start;
   dword procedural_data_start;
   dword back_pointer_start;
   dword sizeTotal;
   dword updateScratchSizeTotal;
   dword fatleaf_table_start;
   dword innernode_table_start;
   dword max_fatleaves;

   /* The size_t members below are not copied by PREFIX_MK_SIZE(). */
   size_t max_instance_leafs;
   size_t max_inner_nodes;
   size_t leaf_data_size;
   size_t min_primitives;
   size_t max_primitives;
};

/*
 * Expands to a compound literal of type struct <prefix>_MKSizeEstimate.
 *
 * Only the dword members of struct MKSizeEstimate (numTriangles through
 * max_fatleaves) are copied; its trailing size_t members are left out.
 *
 * obj is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_SIZE(prefix, obj)                             \
   (struct prefix##_MKSizeEstimate) {                           \
      .numTriangles = (obj).numTriangles,                       \
      .numProcedurals = (obj).numProcedurals,                   \
      .numPrimitives = (obj).numPrimitives,                     \
      .numMeshes = (obj).numMeshes,                             \
      .numBuildPrimitives = (obj).numBuildPrimitives,           \
      .numPrimitivesToSplit = (obj).numPrimitivesToSplit,       \
      .instance_descs_start = (obj).instance_descs_start,       \
      .geo_meta_data_start = (obj).geo_meta_data_start,         \
      .node_data_start = (obj).node_data_start,                 \
      .leaf_data_start = (obj).leaf_data_start,                 \
      .procedural_data_start = (obj).procedural_data_start,     \
      .back_pointer_start = (obj).back_pointer_start,           \
      .sizeTotal = (obj).sizeTotal,                             \
      .updateScratchSizeTotal = (obj).updateScratchSizeTotal,   \
      .fatleaf_table_start = (obj).fatleaf_table_start,         \
      .innernode_table_start = (obj).innernode_table_start,     \
      .max_fatleaves = (obj).max_fatleaves,                     \
   }

/*
 * Bounding box stored as two float[4] arrays.
 * NOTE(review): presumably xyz plus one unused/padding component -- confirm.
 */
typedef struct AABB {
   float lower[4];
   float upper[4];
} AABB;

/* Global build state; grouped by the builder stage that uses each part. */
struct Globals
{
   struct AABB centroidBounds;

   unsigned int build_record_start;
   unsigned int numPrimitives;
   unsigned int leafPrimType;
   unsigned int leafSize;

   unsigned int numSplittedPrimitives;
   unsigned int numBuildRecords;

   // spatial split state
   unsigned int numOriginalPrimitives;
   float presplitPrioritySum;
   float probThreshold;

   // binned-sah bfs state
   unsigned int counter;
   unsigned int numBuildRecords_extended;

   // sync variable used for global-sync on work groups
   unsigned int sync;


   /* morton code builder state */
   unsigned int shift; // used by adaptive mc-builder
   unsigned int shift_mask; // used by adaptive mc-builder
   unsigned int binary_hierarchy_root;
   unsigned int p0_allocated_num;
   unsigned int p0_created_num;
   unsigned int morton_sort_in_flight;
   unsigned int sort_iterations;

   gpuva_t binary_hierarchy_buffer; // pointer to the binary morton code hierarchy. Stashed here as a debug aid
};

/*
 * BVH header: root node offset, start/current/end markers for the data
 * sections, refit bookkeeping and node counts, followed by the RTAS metadata.
 */
typedef struct BVHBase
{
   // TODO: Implement the "copy-first-node" trick... duplicate root node here

   uint64_t rootNodeOffset;

   uint32_t reserved;

   uint32_t nodeDataCur; // nodeDataStart is sizeof(BVHBase) / 64 = BVH_ROOT_NODE_OFFSET / 64
   uint32_t quadLeafStart;
   uint32_t quadLeafCur;
   uint32_t proceduralDataStart;
   uint32_t proceduralDataCur;
   uint32_t instanceLeafStart;
   uint32_t instanceLeafEnd;
   uint32_t backPointerDataStart; //
   uint32_t refitTreeletsDataStart; // refit structs
   uint32_t refitStartPointDataStart; //
   uint32_t BVHDataEnd;

   // number of bottom treelets
   // if 1, then the bottom treelet is also tip treelet
   uint32_t refitTreeletCnt;
   uint32_t refitTreeletCnt2; // always 0, used for atomic updates
   // data layout:
   // @backPointerDataStart
   //   'backpointer' - a dword per inner node.
   //   The bits are used as follows:
   //     2:0 --> Used as a refit counter during BVH refitting. MBZ
   //     5:3 --> Number of children
   //     31:6 --> Index of the parent node in the internal node array
   //   The root node has a parent index of all ones
   // @refitTreeletsDataStart
   //   RefitTreelet[], the last treelet is for top treelet all previous are for bottom
   // @refitStartPointDataStart
   //   for each treelet T there is [T.startpoint_offset, T.numStartpoints) interval of startpoints here in that space
   // @backPointerDataEnd

   uint32_t fatLeafCount; // number of internal nodes which are "fat-leaves"
   uint32_t innerCount; // number of internal nodes which are true inner nodes (all internalNode children)
   uint32_t fatLeafTableStart;
   uint32_t innerTableStart;

   // With this padding the members above add up to 128 bytes
   // (one uint64_t + 18 uint32_t + 12 uint32_t of padding).
   uint32_t _pad[12];

   struct RTASMetaData Meta;
} BVHBase;


/*
 * Per-build arguments for batched globals initialisation: pointers to the
 * build globals and the BVH buffer, counts, section start values and leaf
 * type/size.
 */
struct BatchedInitGlobalsData
{
   qword p_build_globals;
   qword p_bvh_buffer;
   dword numPrimitives;
   dword numGeometries;
   dword numInstances;
   dword instance_descs_start;
   dword geo_meta_data_start;
   dword node_data_start;
   dword leaf_data_start;
   dword procedural_data_start;
   dword back_pointer_start;
   dword sizeTotal;
   dword leafType;
   dword leafSize;
   dword fatleaf_table_start;
   dword innernode_table_start;
};


/* Array dimensions used by the BFS structures below. */
#define BFS_NUM_BINS        16
#define BFS_NUM_VCONTEXTS   256
#define BFS_MAX_DEPTH       32

/* struct QNodeGlobalRootBuffer holds twice this many entries. */
#define QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM 16384

/*
 * A split candidate.
 * NOTE(review): presumably SAH cost, split axis and bin position -- confirm.
 */
struct BFS_Split
{
   float sah;
   int dim;
   int pos;
};

/* Binning accumulators: bounds and primitive counts for 3 axes x BFS_NUM_BINS. */
struct BFS_BinInfo
{
   float min_max[18 * BFS_NUM_BINS]; // layout: bins[axis][num_bins][6]
                                     // The 6 are lower(xyz) and -upper(xyz)
                                     // bins use negated-max so that we can use vectorized mins instead of min/max pairs
   uint counts[3 * BFS_NUM_BINS];
};

/*
 * Per-build globals for the SAH builder: buffer pointers, flags, leaf
 * parameters and the produced/consumed counters of the root buffer.
 * Each root-buffer counter is followed by a matching "_hi" dword.
 */
struct SAHBuildGlobals
{
   qword p_primref_index_buffers;
   qword p_primrefs_buffer;
   qword p_bvh2;
   qword p_globals; // TODO: deprecate this
   qword p_bvh_base;
   gpuva_t p_qnode_root_buffer;

   dword flags; // bit 1 is 'alloc_backpointers'. bit 2 is 'need_masks'
   dword num_primrefs;
   dword leaf_size;
   dword leaf_type;

   dword root_buffer_num_produced;
   dword root_buffer_num_produced_hi;
   dword root_buffer_num_consumed;
   dword root_buffer_num_consumed_hi;
   dword root_buffer_num_to_consume;
   dword root_buffer_num_to_consume_hi;
};

/*
 * Left/right centroid and geometry bounds, accessible either as four named
 * boxes or as a flat array of 24 floats.
 * NOTE(review): the two views only line up if struct AABB3f (declared
 * elsewhere) is exactly 6 floats -- confirm.
 */
typedef union LRBounds
{
   struct
   {
      struct AABB3f left_centroid_bounds;
      struct AABB3f left_geom_bounds;
      struct AABB3f right_centroid_bounds;
      struct AABB3f right_geom_bounds;
   } boxes;
   struct
   {
      float Array[24];
   } scalars;
} LRBounds;


/* State of one BFS task context; struct VContextScheduler holds BFS_NUM_VCONTEXTS of these. */
struct VContext
{
   uint dispatch_primref_begin; // range of primrefs for this task
   uint dispatch_primref_end;
   uint bvh2_root; // BVH2 root node for this task
   uint tree_depth; // depth of this node in the tree
   uint num_left; // primref counts
   uint num_right;
   uint lr_mask; // lower 8b : left mask. upper 8b : right mask
   uint batch_index;

   // pass1 global working state and output
   struct BFS_Split split;
   struct BFS_BinInfo global_bin_info;

   // pass2 global working state and output
   LRBounds lr_bounds;
};



/* One BFS dispatch: the batch index and the context id, 16 bits each. */
struct BFSDispatchRecord
{
   ushort batch_index;
   ushort context_id;
};


/* BFS dispatch queue with one work-group count and one record per context. */
struct BFSDispatchQueue
{
   uint num_dispatches;
   uint wg_count[BFS_NUM_VCONTEXTS];
   struct BFSDispatchRecord records[BFS_NUM_VCONTEXTS];
};

/* One spilled BFS task: primref range, BVH2 root, tree depth and batch index. */
struct BFS1SpillStackEntry
{
   uint primref_begin;
   uint primref_end;
   uint bvh2_root;
   ushort tree_depth;
   ushort batch_index;
};

/* Spill stack with room for BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH entries. */
struct BFS1SpillStack
{
   uint size;
   struct BFS1SpillStackEntry entries[BFS_NUM_VCONTEXTS * BFS_MAX_DEPTH];
};

/* One global root buffer entry: four uints, the last one being padding. */
struct QNodeGlobalRootBufferEntry
{
   uint bvh2_node;
   uint qnode;
   uint build_idx;
   uint _pad;
};

/* Global root buffer: a current-offset field plus a double-sized entries array. */
struct QNodeGlobalRootBuffer
{
   uint curr_entries_offset; // we use "entries" as two buffers, so offset is either 0 or QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM
   struct QNodeGlobalRootBufferEntry entries[QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM * 2];
};

/* One DFS dispatch: primref base, BVH2 base, batch index, primref count and tree depth. */
struct DFSDispatchRecord
{
   uint primref_base;
   uint bvh2_base;
   uint batch_index;
   ushort num_primrefs;
   ushort tree_depth;
};


/* DFS dispatch queue with room for two records per context. */
struct DFSDispatchQueue
{
   struct DFSDispatchRecord records[BFS_NUM_VCONTEXTS * 2];
};

/*
 * Context state values.
 * NOTE(review): presumably the values stored in
 * VContextScheduler::vcontext_state[] -- confirm.
 */
#define VCONTEXT_STATE_EXECUTING   0
#define VCONTEXT_STATE_UNALLOCATED 1

/*
 * Two scheduler layouts overlaid on the same storage.
 *
 * Both layouts begin with ten dwords. The first two and the seventh/eighth
 * have different names in each layout; scheduler_postsync, _pad1,
 * num_trivial_builds, num_single_builds, batched_build_count and
 * batched_build_offset sit at the same positions in both.
 */
union SchedulerUnion
{
   struct VContextScheduler
   {
      /////////////////////////////////////////////////////////////
      //  State data used for communication with command streamer
      //  NOTE: This part must match definition in 'new_sah_builder.grl'
      /////////////////////////////////////////////////////////////

      dword num_bfs_wgs;
      dword num_dfs_wgs;

      dword scheduler_postsync;
      dword _pad1;

      dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
      dword num_single_builds; // number of single-wg builds (#primrefs < threshold)

      dword batched_build_wg_count; // number of wgs to dispatch for initial BFS pass
      dword batched_build_loop_mask; // value is 0 if #builds <= #contexts. else 1 command streamer uses this as a loop condition

      /////////////////////////////////////////////////////////////

      dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
      dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer

      dword vcontext_state[BFS_NUM_VCONTEXTS];

      struct BFSDispatchQueue bfs_queue;
      struct DFSDispatchQueue dfs_queue;

      struct VContext contexts[BFS_NUM_VCONTEXTS];

      // NOTE(review): the member is named "bfs2" but its type is BFS1SpillStack -- confirm this is intended
      struct BFS1SpillStack bfs2_spill_stack;
   } vContextScheduler;

   struct QnodeScheduler
   {
      dword num_qnode_grb_curr_entries;
      dword num_qnode_grb_new_entries;

      dword scheduler_postsync;
      dword _pad1;

      dword num_trivial_builds; // number of trivial builds (#primrefs < leaf_size).
      dword num_single_builds; // number of single-wg builds (#primrefs < threshold)

      dword batched_builds_to_process;
      dword num_max_qnode_global_root_buffer_entries; // number of maximum entries to global root buffer

      /////////////////////////////////////////////////////////////

      dword batched_build_count; // number of batched builds in the SAHBuildGlobals buffer
      dword batched_build_offset; // location of the first batched-build in the SAHBuildGlobals buffer

      struct QNodeGlobalRootBuffer qnode_global_root_buffer;
   } qnodeScheduler;
};


/*
 * BVH2 node: a bounding box plus two uint metadata words.
 * NOTE(review): the commented-out members below appear to describe what is
 * packed into meta_ss (a 16-bit and two 8-bit values) -- confirm.
 */
struct BVH2Node
{
   struct AABB3f box;
   uint meta_u; // leaf: primref start. inner: offset from node to its first child
   uint meta_ss;
   //ushort meta_s; // leaf: primref count. inner: offset from first to second child, in nodes
   //uchar is_inner; // 1 if inner, 0 if leaf
   //uchar mask;
};

/* BVH2 header: the node count followed by seven uints of padding. */
struct BVH2
{
   uint num_nodes;
   uint _pad[7]; // align to 32B
};

/* One sort dispatch entry: the data buffer and its element count. */
struct BatchedBLSDispatchEntry
{
   /////////////////////////////////////////////////////////////
   //  State data used for communication with command streamer
   //  NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl'
   /////////////////////////////////////////////////////////////
   qword p_data_buffer;
   qword num_elements; // number of elements in p_data_buffer
};

/* Arguments of a batchable SAH build: four qword pointers and two dword counts. */
struct SAHBuildArgsBatchable
{
   qword p_globals_ptrs;
   qword p_scheduler;
   qword p_buffers_info;
   qword p_sah_globals;

   dword num_max_qnode_global_root_buffer_entries;
   dword num_builds;
};

/*
 * Expands to a compound literal of type struct <prefix>_SAHBuildArgsBatchable
 * with every member copied from the same-named member of obj.
 *
 * obj is expanded once per member, so pass an expression without side
 * effects.
 */
#define PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(prefix, obj)         \
   (struct prefix##_SAHBuildArgsBatchable) {                    \
      .p_globals_ptrs = (obj).p_globals_ptrs,                   \
      .p_scheduler = (obj).p_scheduler,                         \
      .p_buffers_info = (obj).p_buffers_info,                   \
      .p_sah_globals = (obj).p_sah_globals,                     \
      .num_max_qnode_global_root_buffer_entries =               \
         (obj).num_max_qnode_global_root_buffer_entries,        \
      .num_builds = (obj).num_builds,                           \
   }


/*
 * Per-build buffer addresses and flags for the SAH builder, followed by
 * explicit padding members (_pad, _pad2).
 */
struct SAHBuildBuffersInfo
{
   gpuva_t p_globals;
   gpuva_t p_primref_index_buffers;
   gpuva_t p_primrefs_buffer;
   gpuva_t p_bvh2;
   gpuva_t p_bvh_base;
   gpuva_t p_qnode_root_buffer;
   dword sah_globals_flags;
   dword _pad;
   gpuva_t _pad2;
};

#endif /* GRL_STRUCTS_H */