//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#pragma once

#include "common.h"
#include "api_interface.h"
#include "instance.h"
#include "GRLGen12.h"
#include "libs/lsc_intrinsics.h"

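/*
   Updates one HW instance leaf (selected by workitem id) from its
   instance descriptor: it rebuilds the leaf and writes the instance's
   transformed AABB to instance_aabb_scratch for the refit passes below.
 */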
__attribute__((reqd_work_group_size(16, 1, 1)))
void kernel
DO_update_instance_leaves(global struct BVHBase* bvh,
    uint64_t dxrInstancesArray,
    uint64_t dxrInstancesPtr,
    global struct AABB3f* instance_aabb_scratch,
    uint id,
    global struct GRL_RAYTRACING_AABB* procedural_box
)
{

    global struct GRL_RAYTRACING_INSTANCE_DESC* instancesArray =
        (global struct GRL_RAYTRACING_INSTANCE_DESC*)dxrInstancesArray;
    global struct GRL_RAYTRACING_INSTANCE_DESC** instancesPtrArray =
        (global struct GRL_RAYTRACING_INSTANCE_DESC**)dxrInstancesPtr;

    global struct HwInstanceLeaf* leafs = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);

    /* fetch the instance descriptor for this leaf */
    uint32_t instanceIdx = HwInstanceLeafPart1_getInstanceIndex(&leafs[id]);
    global struct GRL_RAYTRACING_INSTANCE_DESC* instance = NULL;
    if (dxrInstancesArray != 0)
        instance = &instancesArray[instanceIdx];
    else
        instance = instancesPtrArray[instanceIdx];

    uint mask = GRL_get_InstanceMask(instance);
    uint offset = NO_NODE_OFFSET;

    struct AffineSpace3f xfm = AffineSpace3f_load_row_major(instance->Transform);
    struct AABB3f bbox;

    if (procedural_box != 0)
    {
        bbox.lower[0] = procedural_box->MinX;
        bbox.lower[1] = procedural_box->MinY;
        bbox.lower[2] = procedural_box->MinZ;
        bbox.upper[0] = procedural_box->MaxX;
        bbox.upper[1] = procedural_box->MaxY;
        bbox.upper[2] = procedural_box->MaxZ;
    }
    else
    {
        global struct BVHBase* instanceBvh = (global struct BVHBase*)instance->AccelerationStructure;
        bbox = instanceBvh->Meta.bounds;
        offset = BVH_ROOT_NODE_OFFSET;
    }

    const bool valid_min = isfinite(bbox.lower[0]) && isfinite(bbox.lower[1]) && isfinite(bbox.lower[2]);
    const bool valid_max = isfinite(bbox.upper[0]) && isfinite(bbox.upper[1]) && isfinite(bbox.upper[2]);

    if (!valid_min || !valid_max)
    {
        bbox.lower[0] = xfm.p.x;
        bbox.lower[1] = xfm.p.y;
        bbox.lower[2] = xfm.p.z;
        bbox.upper[0] = xfm.p.x;
        bbox.upper[1] = xfm.p.y;
        bbox.upper[2] = xfm.p.z;
        offset = NO_NODE_OFFSET;
        mask = 0;
    }
    else
    {
        bbox = AABB3f_transform(xfm, bbox); // JDB TODO:  Use faster abs-matrix method
    }

    instance_aabb_scratch[id] = bbox;

    HwInstanceLeaf_Constructor(&leafs[id], instance, instanceIdx, offset, mask); // TODO: No instance opening for refittable BVH
}

/*
   This function starts at some BVH node and refits all nodes upwards
   to the root. At each node the algorithm proceeds upwards only once
   all children of that node have been processed: every time a node is
   reached, an atomic counter in its backpointer is incremented, and
   only the thread that raises the counter to the node's child count
   continues to the parent.
 */
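/*
   Backpointer layout, inferred from the shifts and masks used below
   (a sketch, not authoritative):

     bits  0..2  : refit counter -- number of children refitted so far
     bits  3..5  : total number of children of this node
     bits  6..31 : index of the parent node (0x03FFFFFF marks the root)

   Example: a backpointer of (parent << 6) | (3 << 3) | 2 describes a
   node with 3 children of which 2 are already refitted; the thread that
   increments the counter to 3 is the one that proceeds to the parent.
 */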

GRL_INLINE void refit_bottom_up(global struct QBVHNodeN *qnode_start, // start node to refit (already processed)
                            global struct BVHBase *bvh,           // pointer to BVH
                            struct AABB *childrenAABB,            // temporary data to use
                            uint numChildrenTotal)
{
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

    /* compute the index of the start node */
    uint curNodeIndex = qnode_start - nodeData;

    /* the start node has already been processed, so go to its parent node */
    curNodeIndex = *InnerNode_GetBackPointer(backPointers,curNodeIndex) >> 6;

    /* end at root node */
    while (curNodeIndex != 0x03FFFFFF)
    {
        /* increment the refit counter that counts refitted children of the current node */
        const uint parentPointer = 1 + atomic_inc_global( (__global uint *) InnerNode_GetBackPointer(backPointers, curNodeIndex));

        /* only continue upwards once all children have been refitted */
        const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
        numChildrenTotal = (parentPointer >> 3) & 0x7;
        if (numChildrenRefitted != numChildrenTotal)
            return;

        /* reset refit counter for the next refit */
        *InnerNode_GetBackPointer(backPointers, curNodeIndex) &= 0xfffffff8;

        /* get bounds of all children directly from the child nodes */
        global struct QBVHNodeN *qnode = nodeData + curNodeIndex;
        global struct QBVHNodeN *qnode_child = (global struct QBVHNodeN *)QBVHNodeN_childrenPointer(qnode);
        for (uint k = 0; k < numChildrenTotal; k++)
            childrenAABB[k] = getAABB_QBVHNodeN(qnode_child + k);

        /* update node bounds from all children */
        QBVHNodeN_setBounds(qnode, childrenAABB, numChildrenTotal);

        write_mem_fence(CLK_GLOBAL_MEM_FENCE);

        /* make parent node the current node */
        curNodeIndex = parentPointer >> 6;
    }

    /* update QBVH6 bounds */
    struct AABB bounds;
    AABB_init(&bounds);

    for (uint i = 0; i < numChildrenTotal; i++)
        AABB_extend(&bounds, &childrenAABB[i]);

    setBVHBaseBounds(bvh, &bounds);
}

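/*
   Subgroup variant of refit_bottom_up: the whole subgroup walks up the
   tree together. One SIMD lane fetches one child's AABB and the node
   bounds are produced by a cross-lane reduction, instead of a single
   thread looping over all children as in the scalar version above.
 */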
GRL_INLINE void SUBGROUP_refit_bottom_up(
    uniform global struct QBVHNodeN* qnode_start, // start node to refit (already processed)
    uniform global struct BVHBase* bvh,           // pointer to BVH
    varying struct AABB reduce_bounds,
    uniform uint numChildrenTotal,
    varying ushort lane,
    varying ushort head_lane)
{
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    /* compute the index of the start node */
    uniform uint curNodeIndex = qnode_start - nodeData;

    /* the start node has already been processed, so go to its parent node */
    curNodeIndex = *InnerNode_GetBackPointer(backPointers, curNodeIndex) >> 6;

    varying struct AABB childrenAABB;

    /* end at root node */
    while ( curNodeIndex != 0x03FFFFFF )
    {
        mem_fence_gpu_invalidate();

        /* increment the refit counter that counts refitted children of the current node */
        uniform uint parentPointer = 1;
        if (lane == 0)
        {
            // acquire fence ensures that all previous writes complete before the atomic starts
            parentPointer += atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, curNodeIndex));
        }

        parentPointer = intel_sub_group_shuffle( parentPointer, head_lane );

        /* only continue upwards once all children have been refitted */
        uniform uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
        numChildrenTotal = (parentPointer >> 3) & 0x7;
        if ( numChildrenRefitted != numChildrenTotal )
            return;

        /* reset refit counter for the next refit */
        if (lane == 0)
        {
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = (parentPointer & 0xfffffff8);
        }

        /* get bounds of all children directly from the child nodes */
        global struct QBVHNodeN* qnode = nodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );

        varying ushort child_idx = (lane < numChildrenTotal) ? lane : 0;
        childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

        /* update node bounds from all children */
        reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
        reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, head_lane );

        subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildrenTotal, lane);

        /* update node mask */
        uchar childrenMask = qnode_child[child_idx].instMask;

        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        /* make parent node the current node */
        curNodeIndex = parentPointer >> 6;
    }

    /* update QBVH6 bounds */

    if( lane == 0 )
        setBVHBaseBounds( bvh, &reduce_bounds );
}

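/* Copies the quad's four vertices (4 x 3 floats = 48 bytes) with one
   uint4 and one uint8 vector load/store pair. */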
GRL_INLINE void quadCopyVertices(
    const struct QuadLeaf* pQuad,
    struct QuadLeaf* newQuad)
{
    const uint4* s = (const uint4*) & (pQuad->v[0][0]);
    uint4* d = (uint4*) & (newQuad->v[0][0]);
    const uint8* s2 = (const uint8*)(s+1);
    uint8* d2 = (uint8*)(d+1);
    *d = *s;
    *d2 = *s2;
}

GRL_INLINE void get_updated_quad(
    global const struct QuadLeaf* pQuad,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDescs,
    struct QuadLeaf* newQuad)
{
    struct QuadLeaf tempQuad;

    // fetch the non-vertex data
    {
        uint4* tempQuad4U = (uint4*)&tempQuad;
        global const uint4* pQuad4U = (global const uint4*)pQuad;
        *tempQuad4U = *pQuad4U;
    }

    /* get the geomID and primID0/1 for both quad triangles */
    const uint geomID = PrimLeaf_GetGeoIndex(&tempQuad.leafDesc);
    const uint primID0 = tempQuad.primIndex0;
    const uint primID1 = tempQuad.primIndex0 + QuadLeaf_GetPrimIndexDelta(&tempQuad);
    ushort fourth_vert = 0;

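    /* The masks below suggest that packed_indices holds 2-bit vertex
       selectors for the second triangle; a selector of 3 (both bits set)
       in the second or third slot picks the quad's fourth vertex. This
       reading is inferred from the masks, not authoritative. */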
    if (primID1 != primID0)
    {
        ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(&tempQuad);
        fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
        fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
    }

    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDescs + geomID;

    // read the indices of the 4 verts we want
    uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);

    float3 vtx0, vtx1, vtx2, vtx3;
    GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);

    QuadLeaf_SetVertices(&tempQuad, vtx0, vtx1, vtx2, vtx3);

    *newQuad = tempQuad;
}

// This computes the children's bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the bottom-up thread as it passes through them.
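// numChildren below is decoded from the node's backpointer (bits 3..5);
// see the backpointer layout sketch above refit_bottom_up.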
GRL_INLINE uint refit_bottom( global struct BVHBase* bvh,
                          global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
                          global struct AABB3f* instance_leaf_aabbs,
                          global struct QBVHNodeN* curNode,
                          struct AABB *childrenAABB,
                          uint backPointer)
{
    uint numChildren = 0;

    /* we start refit at leaf nodes, this case is for quad nodes */
    if (curNode->type == BVH_QUAD_NODE)
    {
        global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all quads of the quad node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            struct QuadLeaf Q;
            get_updated_quad(&quads[k], geomDesc, &Q);
            quadCopyVertices(&Q, &quads[k]);
            childrenAABB[k] = getAABB_Quad((struct Quad*)&Q); // FIXME: support leaves with more than one quad
        }
    }

    /* we start refit at leaf nodes, this case is for procedural nodes */
    else if (curNode->type == BVH_PROCEDURAL_NODE)
    {
        global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all children of the procedural node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            /* extract geomID and primID from leaf */
            const uint startPrim = QBVHNodeN_startPrim(curNode, k);
            const uint geomID = ProceduralLeaf_geomIndex(leaf);
            const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!

            /* read bounds from geometry descriptor */
            struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
            childrenAABB[k].lower.x = aabb.MinX;
            childrenAABB[k].lower.y = aabb.MinY;
            childrenAABB[k].lower.z = aabb.MinZ;
            childrenAABB[k].upper.x = aabb.MaxX;
            childrenAABB[k].upper.y = aabb.MaxY;
            childrenAABB[k].upper.z = aabb.MaxZ;

            /* advance leaf pointer to next child */
            leaf += QBVHNodeN_blockIncr(curNode, k);
        }
    }

    /* we start refit at leaf nodes, this case is for instance nodes */
    else if (curNode->type == BVH_INSTANCE_NODE)
    {
        global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
        global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );

        /* iterate over all children of the instance node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;
        for (uint k = 0; k < numChildren; k++)
        {
            uint leafindex = (instancesLeaves + k) - leafBase;
            childrenAABB[k].lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
            childrenAABB[k].upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
        }
    }

    return numChildren;
}


// This computes the children's bounding boxes for an inner node whose children are *all* leaves.
// Mixed nodes are updated by the bottom-up thread as it passes through them.
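/* Subgroup variant: one SIMD lane handles one child. A return value of 0
   tells the caller not to ascend the tree from this node. */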
GRL_INLINE uint SUBGROUP_refit_bottom(
    uniform global struct BVHBase* bvh,
    uniform global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    uniform global struct AABB3f* instance_leaf_aabbs,
    uniform global struct QBVHNodeN* curNode,
    uniform uint backPointer,
    varying struct AABB* childrenAABB,
    varying uchar* childrenMask,
    varying ushort lane,
    global uchar* is_procedural_instance
    )
{
    uniform uint numChildren = 0;
    bool enable_procedural_instance = (is_procedural_instance != 0);

    /* we start refit at leaf nodes, this case is for quad nodes */
    if (curNode->type == BVH_QUAD_NODE)
    {
        /* iterate over all quads of the quad node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;

        uniform global struct QuadLeaf* quads = (global struct QuadLeaf*)QBVHNodeN_childrenPointer(curNode);

        struct QuadLeaf Q;
        if (lane < numChildren)
        {
            get_updated_quad(&quads[lane], geomDesc, &Q);

            *childrenAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad

            quadCopyVertices(&Q, &quads[lane]);
            *childrenMask = 0xff;
        }
    }

    /* we start refit at leaf nodes, this case is for procedural nodes */
    else if (curNode->type == BVH_PROCEDURAL_NODE)
    {
        uniform global struct ProceduralLeaf* leaf = (global struct ProceduralLeaf*)QBVHNodeN_childrenPointer(curNode);

        /* iterate over all children of the procedural node and get their bounds */
        numChildren = (backPointer >> 3) & 0x7;

        varying uint incr = (lane < numChildren) ? InternalNode_GetChildBlockIncr((struct InternalNode*)curNode, lane) : 0;
        incr = sub_group_scan_exclusive_add(incr);

        if( lane < numChildren )
        {
            /* extract geomID and primID from leaf */
            varying uint start_prim = InternalNode_GetChildStartPrim((struct InternalNode*)curNode, lane );
            varying global struct ProceduralLeaf* my_leaf = leaf + incr;
            const uint geomID = ProceduralLeaf_geomIndex(my_leaf);
            const uint primID = ProceduralLeaf_primIndex(my_leaf, start_prim);

            /* read bounds from geometry descriptor */
            struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
            childrenAABB->lower.x = aabb.MinX;
            childrenAABB->lower.y = aabb.MinY;
            childrenAABB->lower.z = aabb.MinZ;
            childrenAABB->upper.x = aabb.MaxX;
            childrenAABB->upper.y = aabb.MaxY;
            childrenAABB->upper.z = aabb.MaxZ;
            *childrenMask = 0xff;
        }
    }

    /* we start refit at leaf nodes, this case is for instance nodes */
    else if ( !enable_procedural_instance && curNode->type == BVH_INSTANCE_NODE)
    {
        uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer(curNode);
        uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves(bvh);

        /* iterate over all children of the instance node and get their bounds and masks */
        numChildren = (backPointer >> 3) & 0x7;
        if( lane < numChildren )
        {
            uint leafindex = (instancesLeaves + lane) - leafBase;
            childrenAABB->lower.xyz = AABB3f_load_lower(&instance_leaf_aabbs[leafindex]);
            childrenAABB->upper.xyz = AABB3f_load_upper(&instance_leaf_aabbs[leafindex]);
            *childrenMask = HwInstanceLeaf_GetInstanceMask(&leafBase[leafindex]);
        }
    }
    else if (enable_procedural_instance && curNode->type == BVH_INTERNAL_NODE)
    {
        // Handle procedural-instance leaves
        //   TODO:  Generalize this!   Should re-write the kernel to work with arbitrary mixed-mode leaves

        numChildren = (backPointer >> 3) & 0x7;
        uint childType = BVH_INTERNAL_NODE;
        if ( lane < numChildren )
        {
            childType = InternalNode_GetChildType( (struct InternalNode*)curNode, lane );
            if (childType != BVH_INTERNAL_NODE)
            {
                uniform global struct HwInstanceLeaf* instancesLeaves = (global struct HwInstanceLeaf*)QBVHNodeN_childrenPointer( curNode );
                uniform global struct HwInstanceLeaf* leafBase = (global struct HwInstanceLeaf*) BVHBase_GetHWInstanceLeaves( bvh );
                uint leafindex = (instancesLeaves + lane) - leafBase;
                childrenAABB->lower.xyz = AABB3f_load_lower( &instance_leaf_aabbs[leafindex] );
                childrenAABB->upper.xyz = AABB3f_load_upper( &instance_leaf_aabbs[leafindex] );
                *childrenMask = HwInstanceLeaf_GetInstanceMask( &leafBase[leafindex] );

                // see if the child has flipped from procedural to non-procedural and update the child type field as needed
                uint instanceIndex = HwInstanceLeaf_GetInstanceIndex( &leafBase[leafindex] );
                uint newChildType = is_procedural_instance[instanceIndex] ? BVH_PROCEDURAL_NODE : BVH_INSTANCE_NODE;
                if (newChildType != childType)
                {
                    InternalNode_SetChildType( (struct InternalNode*)curNode, lane, newChildType );
                }
            }
        }

        // don't ascend the tree if every child is a true internal node
        if (sub_group_all(childType == BVH_INTERNAL_NODE))
            numChildren = 0;
    }

    return numChildren;
}

#define SG_REFIT_WG_SIZE 8

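/*
   Refit entry point: each subgroup takes one inner node as its bottom
   startpoint, refits it from the updated leaves via SUBGROUP_refit_bottom,
   and then walks toward the root via SUBGROUP_refit_bottom_up.
 */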
void DO_Refit_per_one_startpoint_sg(
    global struct BVHBase* bvh,
    global GRL_RAYTRACING_GEOMETRY_DESC* geosArray,
    global struct AABB3f* instance_leaf_aabbs,
    global uchar* is_procedural_instance )
{
    /* get pointer to inner nodes and back pointers */
    global struct QBVHNodeN* inner_nodes = BVHBase_rootNode(bvh);
    BackPointers* backPointers = BVHBase_GetBackPointers(bvh);

    /* get the inner node that we will consider as a bottom startpoint */
    const uint numInnerNodes = BVHBase_numNodes(bvh);
    const uint innerNodeIdx = get_sub_group_global_id();

    varying ushort lane = get_sub_group_local_id();

    if (innerNodeIdx >= numInnerNodes) return;

    varying struct AABB childrenAABB; // one child AABB per lane
    AABB_init(&childrenAABB);

    varying uchar childrenMask = 0; // one child mask per lane

    global struct QBVHNodeN* curNode = &inner_nodes[innerNodeIdx];
    uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
    uint numChildren = SUBGROUP_refit_bottom(
        bvh,
        geosArray,
        instance_leaf_aabbs,
        curNode,
        backPointer,
        &childrenAABB,
        &childrenMask,
        lane,
        is_procedural_instance
        );

    if (numChildren != 0)
    {
        /* update bounds of node */
        struct AABB reduce_bounds = AABB_sub_group_reduce_N6(&childrenAABB);
        reduce_bounds = AABB_sub_group_shuffle(&reduce_bounds, 0);
        subgroup_QBVHNodeN_setBounds(curNode, reduce_bounds, childrenAABB, numChildren, lane);

        /* update mask of node */
        uchar mask = sub_group_reduce_or_N6(childrenMask);
        curNode->instMask = mask;

        /* Leave this fence in place for all threads: if the WG size is
           increased (tried 128) and the fence is issued only by the first
           thread (similar to morton phase1), the machine hangs. */
        mem_fence_gpu_invalidate();

        /* refit the upper parts of the BVH */
        /* TODO: this is not going to work for mixed nodes */
        SUBGROUP_refit_bottom_up(curNode, bvh, reduce_bounds, numChildren, lane, 0);
    }
}