//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

GRL_INLINE void SUBGROUP_create_node_phase0(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform global uint *global_refit_startpoints,
    uniform uint rID,
    uniform local uint* local_numRecords,
    uniform local uint* local_QNodeOffset,
    uniform global struct BuildRecordMorton* records,
    uniform struct BuildRecordMorton current,
    uniform local uint* local_startpoints_num)
{
    uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordMorton sg_children;
    sg_children.items = 0;
    sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;

    if ( lane < numChildren )
        sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );

    /* fill QBVH6 node with up to 6 children */
    while ( numChildren < BVH_NODE_N6 )
    {
        varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
        if ( sub_group_all( sg_is_leaf ) )
            break;

        uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
        uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
        uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );

        varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;

        if ( lane == numChildren || lane == bestChild )
        {
            sg_children.nodeID = nodeID;
            sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
        }

        numChildren++;
    }
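
    /* At this point lanes [0, numChildren) of sg_children hold the binary-hierarchy
       nodes chosen as children of the QBVH6 node being built: the loop above
       repeatedly replaced the child with the most primitives by its two binary
       children until BVH_NODE_N6 children were reached or every child was small
       enough (items <= cfg_minLeafSize) to become a leaf. */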

    const uint current_index = current.current_index;
    struct QBVHNodeN* qnode = nodeData + current_index;
    SUBGROUP_QBVHNodeN_setChildIncr1( qnode );

    uniform uint global_offset;
    uniform uint child_node_offset;

    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1.
    varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
    uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);

    if ( lane == 0 )
    {
        child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);

        /* create the node, but do not set its bounds yet as these get calculated during refit */
        QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
        QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
        /* set back pointers */
        uint backpointer = (current.parent_index << 6) | (numChildren << 3);

        global_offset = atomic_add_local( local_numRecords, numChildren - 1 );

#if MORTON_VERBOSE_LOG
        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
               rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
#endif

        if(children_roots_num == numChildren)
        {
            uint startpoints_offset = atomic_inc_local( local_startpoints_num );
            global_refit_startpoints[startpoints_offset] = current_index;
        }
        else
        {
            backpointer += children_roots_num;
        }

        *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
    }

    child_node_offset = sub_group_broadcast( child_node_offset, 0 );
    global_offset = sub_group_broadcast( global_offset, 0 );

    uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);

    sg_children.current_index = childNodes - nodeData + lane;
    sg_children.parent_index = current_index;

    if ( lane < numChildren )
    {
        uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
        records[write_position] = sg_children;
    }
}

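
/*
   Back-pointer encoding used by both phase0 node-creation helpers in this file
   (layout inferred from the packing code here, not from a separate specification):

     bits [31:6]  index of the parent QBVH6 node
     bits [ 5:3]  number of children of this node
     bits [ 2:0]  refit counter; when not all children are phase1 subtree roots it is
                  pre-seeded with the number of children that are, presumably so the
                  bottom-up refit only has to wait for the remaining children.
*/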

GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform uint rID,
    uniform local uint* local_numRecords,
    uniform local uint* local_QNodeOffset,
    uniform global struct BuildRecordMorton* records,
    uniform struct BuildRecordMorton current,
    uniform local uint* local_p0_total,
    uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
    uniform uint nodeDataStart)
{
    uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    uniform const uint rootNodeOffset = bvh->rootNodeOffset;
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordMorton sg_children;
    sg_children.items = 0;
    sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;

    if ( lane < numChildren )
        sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );

    /* fill QBVH6 node with up to 6 children */
    while ( numChildren < BVH_NODE_N6 )
    {
        varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
        if ( sub_group_all( sg_is_leaf ) )
            break;

        uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
        uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
        uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );

        varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;

        if ( lane == numChildren || lane == bestChild )
        {
            sg_children.nodeID = nodeID;
            sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
        }

        numChildren++;
    }

    const uint current_index = current.current_index;
    uniform uint global_offset;
    uniform uint child_node_offset;

    // Check if all children will be roots of the local subtrees in phase1. If so, we keep the node ids to be
    // used later in the global refit after phase1.
    varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
    uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
    uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);

    if ( lane == 0 )
    {
        child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);

        /* Do not create qnodes here */
        uint backpointer = (current.parent_index << 6) | (numChildren << 3);

        global_offset = atomic_add_local( local_numRecords, numChildren - 1 );

#if MORTON_VERBOSE_LOG
        uint first_child_id = (child_node_offset - nodeDataStart * 64) / 64;
        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
               rID, current_index, first_child_id, first_child_id - current_index, current.parent_index, numChildren, nodeDataStart);
#endif

        MortonFlattenedBoxlessNode flattened_node;

        if(children_roots_num != numChildren)
            backpointer += children_roots_num;

        flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;

        uint loc_id = atomic_inc_local( local_p0_total );

        flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
        flattened_node.backPointer = backpointer;

        //TODO: change these writes to L1WB or streaming
        boxless_nodes[loc_id] = flattened_node;

        *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
    }

    child_node_offset = sub_group_broadcast( child_node_offset, 0 );
    global_offset = sub_group_broadcast( global_offset, 0 );

    uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);

    sg_children.current_index = childNodes - nodeData + lane;
    sg_children.parent_index = current_index;

    if ( lane < numChildren )
    {
        uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
        records[write_position] = sg_children;
    }
}

/*

  In this phase a single large work group performs the construction of
  the top of the BVH and creates a build record array.

  Two variants of this kernel:
  1. Refit with global synchronization - Used for a big BVH, where the number of allocated nodes will not fit
     in SLM in phase2. Phase0 creates qnodes in the BVH and provides startpoints for the bottom-up phase
     that is executed after phase1. This refit uses global synchronization and mem_fence_gpu_invalidate,
     which is not efficient.
  2. Refit with local synchronization - Flattened boxless nodes are passed via global memory, along with
     the number of created nodes. Phase0 does not create qnodes in the BVH; that is done in phase2 during refit.
     In phase2, the flattened boxless nodes are moved to SLM, along with the bounding boxes from phase1.
     Refit is performed with local synchronization only.

*/
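
/*
   Illustrative sketch only (not part of the original builder): how a phase2
   consumer might unpack the MortonFlattenedBoxlessNode fields packed by
   SUBGROUP_create_node_phase0_local_sync above. The helper names are
   hypothetical; the bit layout is inferred from the packing code in this file.
*/
GRL_INLINE uint FlattenedBoxlessNode_nodeIndex_sketch( struct MortonFlattenedBoxlessNode n )
{
    return n.binary_hierarchy_index >> 6;   // index of this node within nodeData
}

GRL_INLINE uint FlattenedBoxlessNode_phase1RootMask_sketch( struct MortonFlattenedBoxlessNode n )
{
    return n.binary_hierarchy_index & 0x3F; // bit i set: child i is a phase1 subtree root
}

GRL_INLINE uint FlattenedBoxlessNode_firstChildOffset_sketch( struct MortonFlattenedBoxlessNode n )
{
    return n.childOffset_type >> 6;         // index of the first child, relative to this node
}

GRL_INLINE uint FlattenedBoxlessNode_nodeType_sketch( struct MortonFlattenedBoxlessNode n )
{
    return n.childOffset_type & 0x3F;       // BVH_INTERNAL_NODE for nodes created in phase0
}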

__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0(global struct Globals *globals,
                      global struct BinaryMortonCodeHierarchy *bnodes,
                      global char *bvh_mem,
                      global uint *global_refit_startpoints)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);

    /* a queue of build records in global memory */
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
    local uint local_numRecords;
    local uint local_QNodeOffset;
    local uint local_startpoints_num;

    /* initialize first build record */
    if (get_local_id(0) == 0)
    {
        /* allocate root node */
        uint root_node_offset = 64*bvh->nodeDataCur;
        global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);

        //assert(root_node_offset == 0);
        records[0].nodeID = globals->binary_hierarchy_root;
        records[0].items = globals->numPrimitives;
        records[0].current_index = rootNode - nodeData;
        records[0].parent_index = -1;

        local_numRecords = 1;
        local_QNodeOffset = root_node_offset + 64;
        local_startpoints_num = 0;

        mem_fence_workgroup_default();
    }

    uint num_records = 1;

    /* terminate when all subtrees are under size threshold */
    while(true)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        /* all work items in the work group pick a subtree to build */
        for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
        {
            /* small subtrees will get built in next phase */
            if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
                continue;

            /* create QBVH node */
            SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
                                        records, records[ID], &local_startpoints_num);
        }

        work_group_barrier( CLK_LOCAL_MEM_FENCE );
        mem_fence_workgroup_default();
        uint old_num_records = num_records;
        num_records = local_numRecords;
        if( old_num_records == num_records )
            break;

    }

    /* remember number of build records for next phase */
    if (get_local_id( 0 ) == 0)
    {
        globals->numBuildRecords = local_numRecords;
        globals->p0_created_num = local_startpoints_num;
        bvh->nodeDataCur = local_QNodeOffset / 64;

#if MORTON_VERBOSE_LOG
        printf("PHASE_0: allocated %d nodes. globals->global_refit_startpoints: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
#endif
    }
}
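
/*
   Phase2 (not part of this file) is expected to consume
   boxless_nodes[0 .. globals->p0_created_num) written by the kernel below,
   together with the subtree bounds produced in phase1, and only then create
   and refit the actual qnodes; see the variant description above.
*/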

__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0_local_sync(global struct Globals *globals,
                                 global struct BinaryMortonCodeHierarchy *bnodes,
                                 global char *bvh_mem,
                                 global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
    uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;

    /* a queue of build records in global memory */
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
    local uint local_numRecords;
    local uint local_QNodeOffset;
    local uint local_p0_total;

    /* initialize first build record */
    if (get_local_id(0) == 0)
    {
        /* allocate root node */
        uint root_node_offset = 64*bvh->nodeDataCur;
        global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);

        //assert(root_node_offset == 0);
        records[0].nodeID = globals->binary_hierarchy_root;
        records[0].items = globals->numPrimitives;
        records[0].current_index = rootNode - nodeData;
        records[0].parent_index = -1;

        local_numRecords = 1;
        local_QNodeOffset = root_node_offset + 64;
        local_p0_total = 0;

        mem_fence_workgroup_default();
    }

    uint num_records = 1;

    /* terminate when all subtrees are under size threshold */
    while(true)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        /* all work items in the work group pick a subtree to build */
        for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
        {
            /* small subtrees will get built in next phase */
            if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
                continue;

            /* create QBVH node */
            SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
                                                   records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
        }

        mem_fence_workgroup_default();
        work_group_barrier( CLK_LOCAL_MEM_FENCE );

        uint old_num_records = num_records;
        num_records = local_numRecords;
        if( old_num_records == num_records )
            break;

    }

    /* remember number of build records for next phase */
    if (get_local_id( 0 ) == 0)
    {
        globals->numBuildRecords = local_numRecords;
        bvh->nodeDataCur = local_QNodeOffset / 64;

        globals->p0_allocated_num = BVHBase_numNodes(bvh);
        globals->p0_created_num = local_p0_total;

#if MORTON_VERBOSE_LOG
        printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->p0_created_num: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
#endif
    }
}