//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#pragma once

#include "common.h"
#include "api_interface.h"
#include "instance.h"
#include "GRLGen12.h"
#include "libs/lsc_intrinsics.h"

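/*
    Updates one HW instance leaf: re-reads its GRL_RAYTRACING_INSTANCE_DESC
    (from a descriptor array or an array of pointers), recomputes the
    world-space AABB (from the given procedural box if non-null, otherwise
    from the instanced BVH's bounds), and writes the box to scratch for the
    bottom-up refit pass. (Summary of the code below, for orientation.)
*/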
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
DO_update_instance_leaves(global struct BVHBase* bvh,
                          uint64_t dxrInstancesArray,
                          uint64_t dxrInstancesPtr,
                          global struct AABB3f* instance_aabb_scratch,
                          uint id,
                          global struct GRL_RAYTRACING_AABB* procedural_box)
{

    global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
        (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
    global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
        (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;

    global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);

    /* fetch the instance descriptor for this leaf */
    uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
    global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
    if (dxrInstancesArray != NULL)
        instance = &instancesArray[instanceIdx];
    else
        instance = instancesPtrArray[instanceIdx];

    uint mask = GRL_get_InstanceMask(instance);
    uint offset = NO_NODE_OFFSET;

    struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
    struct AABB3f bbox;

    if (procedural_box != 0)
    {
        bbox.lower[0] = procedural_box->MinX;
        bbox.lower[1] = procedural_box->MinY;
        bbox.lower[2] = procedural_box->MinZ;
        bbox.upper[0] = procedural_box->MaxX;
        bbox.upper[1] = procedural_box->MaxY;
        bbox.upper[2] = procedural_box->MaxZ;
    }
    else
    {
        global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
        bbox = instanceBvh->Meta.bounds;
        offset = BVH_ROOT_NODE_OFFSET;
    }

    /* non-finite bounds deactivate the leaf: collapse it to the instance origin and zero its mask */
    const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
    const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);

    if (!valid_min || !valid_max)
    {
        bbox.lower[0] = xfm.p.x;
        bbox.lower[1] = xfm.p.y;
        bbox.lower[2] = xfm.p.z;
        bbox.upper[0] = xfm.p.x;
        bbox.upper[1] = xfm.p.y;
        bbox.upper[2] = xfm.p.z;
        offset = NO_NODE_OFFSET;
        mask = 0;
    }
    else
    {
        bbox = AABB3f_transform(xfm, bbox); // JDB TODO: Use faster abs-matrix method
    }

    instance_aabb_scratch[id] = bbox;

    HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
}

/*
    This function starts at some BVH node and refits all nodes on the
    path up to the root. The ascent continues past a node only once all
    of that node's children have been processed: each arrival at a node
    increments an atomic counter, and the last child to arrive (counter
    equals the node's child count) carries on upwards.
*/

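/*
    Backpointer encoding, as implied by the shifts and masks used below
    (a reading of the code, not an authoritative spec):
      bits [2:0]  - refit counter (children refitted so far)
      bits [5:3]  - total child count
      bits [31:6] - parent node index (0x03FFFFFF marks the root)
*/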
GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
                                global struct BVHBase *bvh,           // pointer to BVH
                                struct AABB *childrenAABB,            // temporary data to use
                                uint numChildrenTotal)
{
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

    /* compute the index of the start node */
    uint curNodeIndex = qnode_start - nodeData;

    /* the start node was already processed, so go to its parent node */
    curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;

    /* end at root node */
    while (curNodeIndex != 0x03FFFFFF)
    {
        /* increment refit counter that counts refitted children of current node */
        const uint parentPointer = 1 + atomic_inc_global((__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));

        /* continue only if all children have been refitted */
        const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
        numChildrenTotal = (parentPointer >> 3) & 0x7;
        if (numChildrenRefitted != numChildrenTotal)
            return;

        /* reset refit counter for next refit */
        *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;

        /* get bounds of all children from child nodes directly */
        global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
        global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
        for (uint k = 0; k < numChildrenTotal; k++)
            childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);

        /* update node bounds of all children */
        QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);

        write_mem_fence(CLK_GLOBAL_MEM_FENCE);

        /* make parent node the current node */
        curNodeIndex = parentPointer >> 6;
    }

    /* update QBVH6 bounds */
    struct AABB bounds;
    AABB_init(&bounds);

    for (uint i = 0; i < numChildrenTotal; i++)
        AABB_extend(&bounds, &childrenAABB[i]);

    setBVHBaseBounds(bvh, &bounds);
}

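/*
    SUBGROUP_refit_bottom_up below performs the same bottom-up ascent as
    refit_bottom_up, but SIMD-style: one subgroup walks the path, one child
    per lane, with subgroup reductions replacing the scalar child loop.
*/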
GRL_INLINE void SUBGROUP_refit_bottom_up(
    uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
    uniform global struct BVHBase* bvh,           // pointer to BVH
    varying struct AABB reduce_bounds,
    uniform uint numChildrenTotal,
    varying ushort lane,
    varying ushort head_lane)
{
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    /* compute the index of the start node */
    uniform uint curNodeIndex = qnode_start - nodeData;

    /* the start node was already processed, so go to its parent node */
    curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;

    varying struct AABB childrenAABB;

    /* end at root node */
    while ( curNodeIndex != 0x03FFFFFF )
    {
        mem_fence_gpu_invalidate();

        /* increment refit counter that counts refitted children of current node */
        uniform uint parentPointer = 1;
        if (lane == 0)
        {
            // acquire fence ensures that all previous writes complete before the atomic starts
            parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
        }

        parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );

        /* continue only if all children have been refitted */
        uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
        numChildrenTotal = (parentPointer >> 3) & 0x7;
        if ( numChildrenRefitted != numChildrenTotal )
            return;

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
        }

        /* get bounds of all children from child nodes directly; one child per lane */
        global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );

        varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
        childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

        /* update node bounds of all children */
        reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
        reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );

        subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);

        /* update node mask */
        uchar childrenMask = qnode_child[child_idx].instMask;

        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        /* make parent node the current node */
        curNodeIndex = parentPointer >> 6;
    }

    /* update QBVH6 bounds */

    if( lane == 0 )
        setBVHBaseBounds( bvh, &reduce_bounds );
}

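/* Copies the quad leaf's 48 bytes of vertex data (4 vertices x 3 floats)
   as one uint4 plus one uint8 vector copy. */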
GRL_INLINE void quadCopyVertices(
    const struct QuadLeaf* pQuad,
    struct QuadLeaf* newQuad)
{
    const uint4* s = (const uint4*) &(pQuad->v[0][0]);
    uint4* d = (uint4*) &(newQuad->v[0][0]);
    const uint8* s2 = (const uint8*)(s + 1);
    uint8* d2 = (uint8*)(d + 1);
    *d = *s;
    *d2 = *s2;
}

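/*
    Rebuilds a quad leaf with refreshed vertex positions: copies the
    non-vertex part of the leaf, decodes geomID/primIDs from it, then
    re-reads the current vertices from the geometry descriptor.
*/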
GRL_INLINE void get_updated_quad(
    global const struct QuadLeaf* pQuad,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
    struct QuadLeaf* newQuad)
{
    struct QuadLeaf tempQuad;

    /* fetch the non-vertex part of the leaf */
    {
        uint4* tempQuad4U = (uint4*)&tempQuad;
        global const uint4* pQuad4U = (global const uint4*)pQuad;
        *tempQuad4U = *pQuad4U;
    }

    /* get the geomID and primID0/1 for both quad triangles */
    const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
    const uint primID0 = tempQuad.primIndex0;
    const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
    ushort fourth_vert = 0;

    /* determine which corner of the second triangle references the quad's fourth vertex */
    if (primID1 != primID0)
    {
        ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
        fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
        fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
    }

    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;

    /* read the indices of the 4 verts we want */
    uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);

    /* read the positions of the 4 verts we want */
    float3 vtx0, vtx1, vtx2, vtx3;
    GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);

    QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);

    *newQuad = tempQuad;
}

// This calculates child bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the passing-through bottom-up thread.
GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
                              global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
                              global struct AABB3f* instance_leaf_aabbs,
                              global struct QBVHNodeN* curNode,
                              struct AABB *childrenAABB,
                              uint backPointer)
{
    uint numChildren = 0;

    /* we start refit at leaf nodes, this case is for quad nodes */
    if (curNode->type == BVH_QUAD_NODE)
    {
        global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all quads of the quad node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            struct QuadLeaf Q;
            get_updated_quad(&quads[k], geomDesc, &Q);
            quadCopyVertices(&Q, &quads[k]);
            childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
        }
    }

    /* we start refit at leaf nodes, this case is for procedural nodes */
    else if (curNode->type == BVH_PROCEDURAL_NODE)
    {
        global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all children of the procedural node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            /* extract geomID and primID from leaf */
            const uint startPrim = QBVHNodeN_startPrim(curNode, k);
            const uint geomID = ProceduralLeaf_geomIndex(leaf);
            const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!

            /* read bounds from geometry descriptor */
            struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
            childrenAABB[k].lower.x = aabb.MinX;
            childrenAABB[k].lower.y = aabb.MinY;
            childrenAABB[k].lower.z = aabb.MinZ;
            childrenAABB[k].upper.x = aabb.MaxX;
            childrenAABB[k].upper.y = aabb.MaxY;
            childrenAABB[k].upper.z = aabb.MaxZ;

            /* advance leaf pointer to next child */
            leaf += QBVHNodeN_blockIncr(curNode, k);
        }
    }

    /* we start refit at leaf nodes, this case is for instance nodes */
    else if (curNode->type == BVH_INSTANCE_NODE)
    {
        global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
        global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );

        /* iterate over all children of the instance node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            uint leafindex = (instancesLeaves + k) - leafBase;
            childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
            childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
        }
    }

    return numChildren;
}


// This calculates child bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the passing-through bottom-up thread.
GRL_INLINE uint SUBGROUP_refit_bottom(
    uniform global struct BVHBase* bvh,
    uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    uniform global struct AABB3f* instance_leaf_aabbs,
    uniform global struct QBVHNodeN* curNode,
    uniform uint backPointer,
    varying struct AABB* childrenAABB,
    varying uchar* childrenMask,
    varying ushort lane,
    global uchar* is_procedural_instance
    )
{
    uniform uint numChildren = 0;
    bool enable_procedural_instance = (is_procedural_instance != 0);

    /* we start refit at leaf nodes, this case is for quad nodes */
    if (curNode->type == BVH_QUAD_NODE)
    {
        /* iterate over all quads of the quad node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;

        uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);

        struct QuadLeaf Q;
        if (lane < numChildren)
        {
            get_updated_quad(&quads[lane], geomDesc, &Q);

            *childrenAABB = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad

            quadCopyVertices(&Q, &quads[lane]);
            *childrenMask = 0xff;
        }
        // FIXME: support leaves with more than one quad
    }

    /* we start refit at leaf nodes, this case is for procedural nodes */
    else if (curNode->type == BVH_PROCEDURAL_NODE)
    {
        uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all children of the procedural node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;

        /* prefix-sum the per-child block increments to find each lane's leaf */
        varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
        incr = sub_group_scan_exclusive_add(incr);

        if( lane < numChildren )
        {
            /* extract geomID and primID from leaf */
            varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
            varying global struct ProceduralLeaf* my_leaf = leaf + incr;
            const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
            const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);

            /* read bounds from geometry descriptor */
            struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
            childrenAABB->lower.x = aabb.MinX;
            childrenAABB->lower.y = aabb.MinY;
            childrenAABB->lower.z = aabb.MinZ;
            childrenAABB->upper.x = aabb.MaxX;
            childrenAABB->upper.y = aabb.MaxY;
            childrenAABB->upper.z = aabb.MaxZ;
            *childrenMask = 0xff;
        }
    }

    /* we start refit at leaf nodes, this case is for instance nodes */
    else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
    {
        uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
        uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);

        /* iterate over all children of the instance node and get their bounds and masks */
        numChildren = (backPointer >> 3) & 0x7;
        if( lane < numChildren )
        {
            uint leafindex = (instancesLeaves + lane) - leafBase;
            childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
            childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
            *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
        }
    }
    else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
    {
        // Handle procedural-instance leaves
        // TODO: Generalize this! Should re-write the kernel to work with arbitrary mixed-mode leaves

        numChildren = (backPointer >> 3) & 0x7;
        uint childType = BVH_INTERNAL_NODE;
        if ( lane < numChildren )
        {
            childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
            if (childType != BVH_INTERNAL_NODE)
            {
                uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
                uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
                uint leafindex = (instancesLeaves + lane) - leafBase;
                childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
                childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
                *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );

                // see if the child has flipped from procedural to non-procedural and update the child type field as needed
                uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
                uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
                if (newChildType != childType)
                {
                    InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
                }
            }
        }

        // if all children are true internal nodes, this is not a bottom node; don't start the ascent here
        if (sub_group_all(childType == BVH_INTERNAL_NODE))
            numChildren = 0;
    }

    return numChildren;
}

#define SG_REFIT_WG_SIZE 8

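/*
    One subgroup refits one inner node, one child per lane; the *_N6
    subgroup reductions assume at most 6 children per QBVH6 node.
    (A reading of the code below, not an authoritative spec.)
*/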
void DO_Refit_per_one_startpoint_sg(
    global struct BVHBase* bvh,
    global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
    global struct AABB3f* instance_leaf_aabbs,
    global uchar* is_procedural_instance )
{
    /* get pointer to inner nodes and back pointers */
    global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

    /* get the inner node that we will consider as a bottom startpoint */
    const uint numInnerNodes = BVHBase_numNodes(bvh);
    const uint innerNodeIdx = get_sub_group_global_id();

    varying ushort lane = get_sub_group_local_id();

    if (innerNodeIdx >= numInnerNodes) return;

    varying struct AABB childrenAABB; // one child AABB per lane
    AABB_init(&childrenAABB);

    varying uchar childrenMask = 0; // one child mask per lane

    global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
    uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
    uint numChildren = SUBGROUP_refit_bottom(
        bvh,
        geosArray,
        instance_leaf_aabbs,
        curNode,
        backPointer,
        &childrenAABB,
        &childrenMask,
        lane,
        is_procedural_instance
    );

    if (numChildren != 0)
    {
        /* update bounds of node */
        struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
        reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
        subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);

        /* update mask of node */
        uchar mask = sub_group_reduce_or_N6(childrenMask);
        curNode->instMask = mask;

        /* Leave this fence in place for all threads: if the WG size is increased
           (tried 128) and the fence is issued only by the first thread (similar
           to morton phase1), the machine hangs. */
        mem_fence_gpu_invalidate();

        /* refit upper parts of the BVH */
        /* TODO: this is not going to work for mixed nodes */
        SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
    }
}