xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/atomic_update.cl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1//
2// Copyright (C) 2009-2021 Intel Corporation
3//
4// SPDX-License-Identifier: MIT
5//
6//
7
8#include "GRLGen12.h"
9
10#include "bvh_build_refit.h"
11#include "bvh_build_treelet_refit.h"
12
13
14struct RefitScratch
15{
16    float lower[3];
17    uint mask;
18    float upper[3];
19    uint _pad;
20
21};
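// Note (a sketch of the convention, inferred from how the kernels below use this struct):
// RefitScratch is 32 bytes, i.e. two uint4 blocks, so init_refit_scratch can clear it with two
// 16-byte streaming stores, and its float layout [ lower.xyz ][-][ upper.xyz ][-] lines up with the
// 8-lane vectorized atomic_min used in the bottom-up passes. 'upper' is stored negated, so a float
// atomic_min grows the upper bound as well.
#if 0 // illustrative only -- hypothetical helper, not used by the kernels in this file
GRL_INLINE void RefitScratch_atomic_merge( global struct RefitScratch* s, struct AABB* box )
{
    atomic_min( &s->lower[0],  box->lower.x );   // grow the lower bound
    atomic_min( &s->lower[1],  box->lower.y );
    atomic_min( &s->lower[2],  box->lower.z );
    atomic_min( &s->upper[0], -box->upper.x );   // 'upper' is negated: min acts as max
    atomic_min( &s->upper[1], -box->upper.y );
    atomic_min( &s->upper[2], -box->upper.z );
}
#endif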
22
23GRL_ANNOTATE_IGC_DO_NOT_SPILL
24__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
25init_refit_scratch(
26    global struct BVHBase* bvh,
27    global struct RefitScratch* scratch )
28{
29    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
30
31    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
32    {
33        float4 v = (float4) (FLT_MAX,FLT_MAX,FLT_MAX,0);
34        store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) );
35        store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) );
36    }
37}
38
39bool is_fat_leaf( InternalNode* curNode )
40{
41    return curNode->nodeType != BVH_INTERNAL_NODE; // TODO:  Not sufficient for traversal shaders!  If traversal shaders are enabled, the child types need to be checked as well
42}
43
44GRL_ANNOTATE_IGC_DO_NOT_SPILL
45__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
46build_fatleaf_table(
47    global struct BVHBase* bvh )
48{
49    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
50
51    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
52    {
53        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
54
55        if ( is_fat_leaf(curNode) )
56        {
57            uint offs = atomic_inc_global( &bvh->fatLeafCount );
58
59            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
60            uint bp = *InnerNode_GetBackPointer(backPointers, tid);
61
62            LeafTableEntry* leaf   = BVHBase_GetFatLeafTable(bvh)+offs;
63            leaf->backpointer      = bp;
64            leaf->inner_node_index = tid;
65            leaf->leaf_index       = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
66        }
67    }
68}
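// Note (hedged, assuming internal nodes and quad leaves are both 64-byte blocks, which is how the
// rest of this file addresses them): (BVH_ROOT_NODE_OFFSET/64) + tid is the fat leaf's own block
// index measured from the BVH base, adding curNode->childOffset (a relative block count) gives the
// block index of its first quad child, and subtracting bvh->quadLeafStart (the block index of the
// first quad leaf) converts that into an index into the quad-leaf array.
#if 0 // illustrative only -- hypothetical standalone form of the same computation
GRL_INLINE uint fatleaf_first_quad_index( global struct BVHBase* bvh, uint node_index, InternalNode* node )
{
    uint node_block  = (BVH_ROOT_NODE_OFFSET / 64) + node_index;  // block index of the fat leaf itself
    uint child_block = node_block + node->childOffset;            // block index of its first child
    return child_block - bvh->quadLeafStart;                      // index into the quad-leaf array
}
#endif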
69
70GRL_ANNOTATE_IGC_DO_NOT_SPILL
71__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
72build_fatleaf_table_new_update(
73    global struct Globals *globals,
74    global struct BVHBase* bvh )
75{
76    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
77
78    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
79    {
80        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
81
82        if ( is_fat_leaf(curNode) )
83        {
84            // This implementation reuses the fatleaf table structure, but it is actually a quad table.
85            // An implementation that processes 2 fatleaves per SIMD lane while iterating over the children
86            // was also tested, but its performance was worse.
87            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
88            uint bp = *InnerNode_GetBackPointer(backPointers, tid);
89            uint fatLeafTableStart = bvh->fatLeafTableStart;
90
91            uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
92            uint numChildren = (bp >> 3) & 0x7;
93
94            uint quad_leaf_table_index = leaf_index;
95
96            // Check whether the children cross a 256-aligned work group boundary.
97            // If so, move these cases to the offset after numQuads and push them into the leftovers part,
98            // where fatleaves are stored at every 8th position with additional padding.
99            // This way the leftovers table never contains a single fatleaf whose children land in 2 separate work groups.
100
101            uint prev_group = leaf_index & 255;
102            uint next_group = (leaf_index + (numChildren - 1)) & 255;
103            uint slm_pos = prev_group;
104            bool is_leftover = prev_group > next_group;
105
106            if(is_leftover)
107            {
108                LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
109                uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255;
110
111                uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 );
112
113                for(uint i = 0; i < BVH_NODE_N6; i++)
114                {
115                    uint pos = (i < numChildren) ? i : 0;
116                    LeafTableEntry* leaf_null = &leafBase[pos];
117                    leaf_null->leaf_index = -1 << 3;
118                }
119
120                quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset;
121                slm_pos = leftovers_offset & 255;
122            }
123
124            LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
125
126            for(uint i = 0; i < BVH_NODE_N6; i++)
127            {
128                uint pos = (i < numChildren) ? i : 0;
129                LeafTableEntry* leafCur = &leaf[pos];
130                leafCur->backpointer = bp;
131                leafCur->inner_node_index = (tid << 8) | slm_pos;
132                leafCur->leaf_index = (leaf_index << 3) | pos;
133            }
134
135            // Need to clean the unused area where we pad to 8 for leftovers
136            if(is_leftover)
137            {
138                for(uint i = 1; i < 8; i++)
139                {
140                    uint pos = (i >= numChildren) ? i : 7;
141                    LeafTableEntry* leafCur = &leaf[pos];
142                    leafCur->leaf_index = -1 << 3;
143                }
144            }
145        }
146    }
147}
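// Note (hedged summary of the packed entry written above, matching how traverse_aabbs_new_update_func
// later unpacks it):
//   inner_node_index = (internal node index << 8) | (SLM slot of the fat leaf within its 256-wide group)
//   leaf_index       = (index of the first quad child << 3) | (child slot within the fat leaf),
//                      with (-1 << 3) marking padded/unused entries.
#if 0 // illustrative decode only -- hypothetical helper
void decode_quad_table_entry( LeafTableEntry e,
                              uint* innerNodeIdx, uint* slm_pos,
                              uint* leaf_index, uint* child_slot, bool* unused )
{
    *innerNodeIdx = e.inner_node_index >> 8;
    *slm_pos      = e.inner_node_index & 0xFF;
    *leaf_index   = e.leaf_index >> 3;
    *child_slot   = e.leaf_index & 0x7;
    *unused       = ( *leaf_index == 0x1FFFFFFF );  // the (-1 << 3) sentinel after the shift
}
#endif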
148
149GRL_ANNOTATE_IGC_DO_NOT_SPILL
150__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
151build_innernode_table(
152    global struct BVHBase* bvh )
153{
154    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);
155
156    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
157    {
158        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;
159
160        if ( !is_fat_leaf( curNode ) )
161        {
162            uint offs = atomic_inc_global( &bvh->innerCount );
163
164            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
165            uint bp = *InnerNode_GetBackPointer(backPointers, tid);
166
167            InnerNodeTableEntry* inner   = BVHBase_GetInnerNodeTable(bvh)+offs;
168            inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) &7);
169            inner->first_child = tid + curNode->childOffset;
170        }
171    }
172}
173
174GRL_ANNOTATE_IGC_DO_NOT_SPILL
175__attribute__((reqd_work_group_size(256, 1, 1))) void kernel
176fixup_quad_table(
177    global struct BVHBase* bvh )
178{
179    // This kernel is launched with 2 work groups that set the magic number for unused entries in the
180    // fatleaf table. One work group handles the last group of the first part, where quads are packed;
181    // the second one handles the last group of the part where quads are stored padded.
182
183    uint numQuads = BVHBase_GetNumQuads(bvh);
184    uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate;
185    uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255;
186
187    uint numQuads_aligned_256 = (numQuads + 255) & ~255;
188    uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256;
189    uint quadOffsetStart = quadOffsetEnd - 256;
190
191    uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers;
192
193    uint leftovers = quadOffsetEnd - quads_number_last_group;
194
195    uint tid = get_local_id(0) > (255 - leftovers) ? get_local_id(0) : 256 - leftovers;
196
197    if(leftovers != 0)
198    {
199        LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh);
200
201        LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid];
202        leaf->leaf_index = -1 << 3;
203    }
204
205    if(get_group_id(0) == 1 && get_local_id(0) == 0)
206        bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd;
207}
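// Worked example (hedged, numbers chosen for illustration only): with numQuads = 1000 and
// numQuadLeftovers = 40, numQuads_aligned_256 = 1024 and numQuadLeftovers_aligned_256 = 256.
// Work group 0 gets quadOffsetEnd = 1024 and marks the 1024 - 1000 = 24 trailing packed slots as
// unused; work group 1 gets quadOffsetEnd = 1280 and marks the 1280 - (1024 + 40) = 216 trailing
// leftover slots, then publishes quadTableSizeNewAtomicUpdate = 1280. Threads whose local id falls
// below the leftover range all write the first unused slot, which is harmless.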
208
209
210// updates one quad leaf and computes the bounding box containing it
211GRL_INLINE void refit_bottom_child_quad_WB(
212    global struct QuadLeaf* quad,
213    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
214    struct AABB* childAABB)
215{
216    /* get the geomID and primID0/1 for both quad triangles */
217    const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc);
218    const uint primID0 = quad->primIndex0;
219    const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad);
220    ushort fourth_vert = 0;
221
222    if (primID1 != primID0)
223    {
224        ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
225        fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
226        fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
227    }
228
229    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID;
230
231    uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);
232
233    // load the 4 vertices referenced by those indices
234    float3 vtx0, vtx1, vtx2, vtx3;
235    GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);
236
237    childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2,vtx3) );
238    childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2,vtx3) );
239
240    float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
241    float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
242    float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
243
244    global uint4* dst_verts = (global uint4*) &(quad->v[0][0]);
245    store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) );
246    store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) );
247    store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) );
248}
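// Note (hedged): the three packed stores above rewrite the QuadLeaf's four vertices
// (12 consecutive floats starting at quad->v[0][0]) as three aligned 16-byte blocks,
//   pack0 = { v0.x v0.y v0.z v1.x },  pack1 = { v1.y v1.z v2.x v2.y },  pack2 = { v2.z v3.x v3.y v3.z },
// so the leaf is refreshed with streaming L1WB/L3WB stores and no partial writes.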
249
250inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs )
251{
252    uint vi = as_uint(v);
253    return as_uchar4(intel_sub_group_shuffle_down(vi,vi,offs));
254}
255inline uchar4 uchar4_broadcast( uchar4 v, uint offs )
256{
257    uint vi = as_uint(v);
258    return as_uchar4(sub_group_broadcast(vi,offs));
259}
260
261GRL_INLINE void sg_InternalNode_setFields(
262    struct InternalNode* node,
263    struct AABB reduced_aabb,
264    const int offset, const uint nodeType, struct AABB* input_aabb,
265    const uint numChildren, const uchar nodeMask )
266{
267    const float up = 1.0f + ulp;
268    const float down = 1.0f - ulp;
269
270    struct AABB conservative_aabb = conservativeAABB(&reduced_aabb);
271    const float3 org = conservative_aabb.lower.xyz;
272
273    const float3 len = AABB_size(&conservative_aabb).xyz * up;
274    int3 exp;
275    const float3 mant = frexp_vec3(len, &exp);
276    exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
277
278    uchar4 lower_uchar = 0x80;
279    uchar4 upper_uchar = 0;
280
281    ushort lane = get_sub_group_local_id();
282    ushort simd8_id     = lane/8;
283    ushort logical_lane = lane%8;
284
285    if( logical_lane < numChildren )
286    {
287        struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ???
288
289        float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
290        lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
291        float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
292        upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
293        lower_uchar.xyz = convert_uchar3_rtn( lower );
294        upper_uchar.xyz = convert_uchar3_rtp( upper );
295    }
296
297    uchar4 lo0 = lower_uchar;
298    uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 );
299    uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 );
300    uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 );
301    uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 );
302    uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 );
303
304    uchar4 hi0 = upper_uchar;
305    uchar4 hi1 = uchar4_shuffle_down( upper_uchar,1 );
306    uchar4 hi2 = uchar4_shuffle_down( upper_uchar,2 );
307    uchar4 hi3 = uchar4_shuffle_down( upper_uchar,3 );
308    uchar4 hi4 = uchar4_shuffle_down( upper_uchar,4 );
309    uchar4 hi5 = uchar4_shuffle_down( upper_uchar,5 );
310
311    if( logical_lane == 0 )
312    {
313        uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE);
314
315        uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset);
316
317        char3 exp_char = (char3)(exp.x,exp.y,exp.z);
318
319        uint4 block1 = (uint4)(
320            as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)),
321            as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)) ,
322            as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)) ,
323            as_uint((uchar4)(lo0.x,lo1.x,lo2.x,lo3.x))
324        );
325
326        uint4 block2 = (uint4)(
327            as_uint((uchar4)(lo4.x,lo5.x,hi0.x,hi1.x)) ,
328            as_uint((uchar4)(hi2.x,hi3.x,hi4.x,hi5.x)) ,
329            as_uint((uchar4)(lo0.y,lo1.y,lo2.y,lo3.y)) ,
330            as_uint((uchar4)(lo4.y,lo5.y,hi0.y,hi1.y))
331            );
332
333        uint4 block3 = (uint4)(
334            as_uint((uchar4)(hi2.y,hi3.y,hi4.y,hi5.y)),
335            as_uint((uchar4)(lo0.z,lo1.z,lo2.z,lo3.z)),
336            as_uint((uchar4)(lo4.z,lo5.z,hi0.z,hi1.z)),
337            as_uint((uchar4)(hi2.z,hi3.z,hi4.z,hi5.z))
338            );
339
340        global uint4* pNode = (global uint4*)node;
341
342#if 0
343        printf(
344            "block0 = %08x,%08x,%08x,%08x    %08x,%08x,%08x,%08x \n"
345            "block1 = %08x,%08x,%08x,%08x    %08x,%08x,%08x,%08x \n"
346            "block2 = %08x,%08x,%08x,%08x    %08x,%08x,%08x,%08x \n"
347            "block3 = %08x,%08x,%08x,%08x    %08x,%08x,%08x,%08x \n" ,
348            block0.x,block0.y,block0.z,block0.w,
349            pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w,
350            block1.x,block1.y,block1.z,block1.w,
351            pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w,
352            block2.x,block2.y,block2.z,block2.w,
353            pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w ,
354            block3.x,block3.y,block3.z,block3.w,
355            pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w );
356#endif
357
358         store_uint4_L1WB_L3WB( pNode, 0, block0 );
359         store_uint4_L1WB_L3WB( pNode, 1, block1 );
360         store_uint4_L1WB_L3WB( pNode, 2, block2 );
361         store_uint4_L1WB_L3WB( pNode, 3, block3 );
362    }
363
364}
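// Note (hedged): sg_InternalNode_setFields stores each child box as 8-bit offsets from 'org' with a
// per-axis power-of-two scale, so a conservative child AABB is reconstructed roughly as
// org + quantized_value * 2^(exp - 8) per axis. A minimal sketch of the inverse mapping:
#if 0 // illustrative only -- hypothetical reference dequantization mirroring the encode above
GRL_INLINE float3 dequantize_child_lower( float3 org, int3 exp, float3 lower_as_float )
{
    // inverse of: lower = floor( bitShiftLdexp3( (child_lower - org) * down, -exp + 8 ) )
    return org + bitShiftLdexp3( lower_as_float, exp - 8 );
}
#endif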
365
366
367
368GRL_ANNOTATE_IGC_DO_NOT_SPILL
369__attribute__((reqd_work_group_size(256, 1, 1)))
370void kernel
371traverse_aabbs_quad(
372        global struct BVHBase* bvh,
373        global struct RefitScratch* scratch,
374        global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc
375    )
376{
377
378    uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
379    varying ushort lane = get_sub_group_local_id();
380
381    uniform uint num_leaves = bvh->fatLeafCount;
382
383    local struct RefitScratch local_scratch[256];
384    if( get_local_id(0) < min(num_nodes,256u) )
385    {
386        for( uint i=0; i<3; i++ ){
387            local_scratch[get_local_id(0)].lower[i] = FLT_MAX;
388            local_scratch[get_local_id(0)].upper[i] = FLT_MAX;
389        }
390    }
391
392    barrier( CLK_LOCAL_MEM_FENCE );
393
394
395    ushort SIMD8_PER_SG   = get_sub_group_size()/8;
396    ushort SIMD8_PER_WG   = get_num_sub_groups()*SIMD8_PER_SG;
397    ushort simd8_local_id = get_sub_group_local_id()/8;
398    ushort simd8_id       = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
399    ushort logical_lane   = lane%8;
400
401    uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG;
402
403
404    if ( fatleaf_index < num_leaves )
405    {
406        LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index;
407        uint innerNodeIdx = leaf->inner_node_index;
408        uint bp           = leaf->backpointer;
409        uint leaf_index   = leaf->leaf_index;
410
411        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
412        varying QuadLeaf* quad =  BVHBase_GetQuadLeaves(bvh) + leaf_index;
413
414        uint childOffs = (((char*)quad) - ((char*)curNode))/64;
415
416        varying struct AABB childrenBox;
417        AABB_init(&childrenBox);
418
419        uint numChildren = (bp >> 3) & 0x7;
420        if (logical_lane < numChildren)
421        {
422            refit_bottom_child_quad_WB(
423                (global struct QuadLeaf*) &quad[logical_lane],
424                geomDesc,
425                &childrenBox );
426        }
427
428        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
429        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
430        for (uint i = 1; i < SIMD8_PER_SG; i++)
431        {
432            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
433            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
434            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
435            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
436        }
437
438        sg_InternalNode_setFields(
439            curNode,
440            reduce_bounds,
441            childOffs,
442            NODE_TYPE_QUAD,
443            &childrenBox,
444            numChildren,
445            0xff );
446
447        // atomic min operation vectorized across 6 lanes
448        //    [ lower.xyz ][-][upper.xyz][-]
449        //
450        // Lanes 3 and 7 are inactive.   'upper' is negated
451        bool atomic_mask = (1<<logical_lane) & 0x77;
452
453        uint lmod = logical_lane % 4;
454        uint ldiv = logical_lane / 4;
455        float vlo = reduce_bounds.lower.x;
456        float vhi = reduce_bounds.upper.x;
457        vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
458        vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
459        vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
460        vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
461        float v = (ldiv == 0) ? vlo : -vhi;
462
463
464        global float* pv = (global float*) &scratch[innerNodeIdx];
465
466        store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
467
468        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
469        uint parent = (bp >> 6);
470
471        // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256
472        if(atomic_mask && parent != 0x03FFFFFF)
473        {
474            while( parent >= 256 )
475            {
476                innerNodeIdx = parent;
477                bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
478                atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v );
479                parent = bp >> 6;
480            }
481            while( parent != 0x03FFFFFF )
482            {
483                innerNodeIdx = parent;
484                bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
485                atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v );
486                parent = bp >> 6;
487            }
488        }
489
490    }
491
492
493    barrier( CLK_LOCAL_MEM_FENCE );
494    num_nodes = min(num_nodes,256u);
495
496    local float* in = (local float*)&local_scratch[0];
497    global float* out = (global float*)&scratch[0];
498
499    for (uint i = get_local_id(0); i < num_nodes*6; i += 256 )
500    {
501        // Since we want to save [ lower.xyz ][-][ upper.xyz ][-], i.e. components 0,1,2 and 4,5,6, we need to offset by +1 for every triplet
502        uint idx = i + (i/3);
503
504        float v = in[idx];
505        if( v != FLT_MAX )
506            atomic_min( out + idx , v );
507    }
508}
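// Notes (hedged) on the kernel above: parents with node index >= 256 are reduced directly into the
// global scratch, while parents in the first 256 nodes accumulate in local_scratch and are flushed
// to global scratch by the final loop. In that flush loop, idx = i + (i/3) maps the 6 packed
// components of node n onto its 8-float RefitScratch slot, skipping the padding floats:
// i = 0,1,2 -> idx 0,1,2 (lower.xyz of node 0), i = 3,4,5 -> idx 4,5,6 (negated upper.xyz of node 0),
// i = 6,7,8 -> idx 8,9,10 (node 1), and so on.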
509
510GRL_ANNOTATE_IGC_DO_NOT_SPILL
511__attribute__((reqd_work_group_size(64, 1, 1)))
512void kernel
513write_inner_nodes(
514    global struct BVHBase* bvh,
515    global struct RefitScratch* scratch
516    )
517{
518    uint SIMD8_PER_SG = get_sub_group_size()/8;
519    uniform uint node_id    = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8);
520    varying ushort lane = get_sub_group_local_id() % 8;
521    varying uint num_inners = bvh->innerCount;
522
523    if ( node_id < num_inners )
524    {
525        InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id;
526        uint node_index  = entry->node_index_and_numchildren>>3;
527        uint numChildren = entry->node_index_and_numchildren & 7;
528        uint first_child = entry->first_child;
529
530        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index;
531
532        varying struct AABB childAABB;
533        AABB_init(&childAABB);
534
535        if( lane < numChildren )
536        {
537            uint child = first_child + lane;
538            childAABB.lower.x = scratch[child].lower[0];
539            childAABB.lower.y = scratch[child].lower[1];
540            childAABB.lower.z = scratch[child].lower[2];
541            childAABB.upper.x = -scratch[child].upper[0];
542            childAABB.upper.y = -scratch[child].upper[1];
543            childAABB.upper.z = -scratch[child].upper[2];
544        }
545
546        varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
547        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
548        for (uint i = 1; i < SIMD8_PER_SG; i++)
549        {
550            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
551            int3 is_upper_lane = ((uint3)(i)) ==  (get_sub_group_local_id()/8);
552            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
553            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
554        }
555
556        sg_InternalNode_setFields(
557            curNode,
558            reduce_bounds,
559            first_child - node_index,
560            NODE_TYPE_INTERNAL,
561            &childAABB,
562            numChildren,
563            0xff );
564
565    }
566
567    if (node_id == 0 && lane == 0 )
568    {
569        bvh->Meta.bounds.lower[0] = scratch[0].lower[0];
570        bvh->Meta.bounds.lower[1] = scratch[0].lower[1];
571        bvh->Meta.bounds.lower[2] = scratch[0].lower[2];
572        bvh->Meta.bounds.upper[0] = -scratch[0].upper[0];
573        bvh->Meta.bounds.upper[1] = -scratch[0].upper[1];
574        bvh->Meta.bounds.upper[2] = -scratch[0].upper[2];
575    }
576
577}
578
579
580
581#if 1
582#define SLM_BOX_COUNT 1024
583
584struct AABB load_box( uint place,  local struct AABB* local_boxes, global struct AABB* extra_boxes )
585{
586    if( place < SLM_BOX_COUNT )
587        return local_boxes[place];
588    else
589        return extra_boxes[place-SLM_BOX_COUNT];
590}
591
592void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
593{
594    if (place < SLM_BOX_COUNT)
595    {
596        local_boxes[place] = box;
597    }
598    else
599    {
600        global uint4* ptr = (global uint4*)&extra_boxes[place-SLM_BOX_COUNT];
601        store_uint4_L1WB_L3WB( ptr,   0, as_uint4(box.lower) );
602        store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) );
603    }
604}
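// Note (hedged): load_box/store_box keep the boxes of the first SLM_BOX_COUNT node indices in
// shared local memory and spill anything beyond that to the caller-provided extra_boxes buffer, e.g.
//   store_box( box, nodeIdx, local_boxes, extra_boxes );
//   struct AABB b = load_box( nodeIdx, local_boxes, extra_boxes );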
605
606
607GRL_ANNOTATE_IGC_DO_NOT_SPILL
608__attribute__((reqd_work_group_size(512, 1, 1)))
609__attribute__((intel_reqd_sub_group_size(16)))
610void kernel
611update_single_group_quads(
612    global struct BVHBase* bvh,
613    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
614    global struct AABB* extra_boxes
615)
616{
617    uniform uint tid = get_sub_group_global_id();
618    uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
619    uniform uint num_leaves = bvh->fatLeafCount;
620    uniform uint num_inners = bvh->innerCount;
621
622    varying ushort lane = get_sub_group_local_id();
623
624    local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB
625
626    // initialize nodes
627    for (uint i = get_local_id( 0 ); i < num_nodes; i+= get_local_size(0))
628    {
629        struct AABB tmp;
630        AABB_init(&tmp);
631        tmp.upper = -tmp.upper;
632        store_box( tmp, i, local_boxes, extra_boxes );
633    }
634
635
636    if( num_nodes > SLM_BOX_COUNT )
637        mem_fence_workgroup_default();
638
639    barrier( CLK_LOCAL_MEM_FENCE );
640
641
642    ushort SIMD8_PER_SG   = get_sub_group_size()/8;
643    ushort NUM_SIMD8      = get_num_sub_groups()*SIMD8_PER_SG;
644    ushort simd8_local_id = get_sub_group_local_id()/8;
645    ushort simd8_id       = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
646    ushort logical_lane = lane%8;
647
648
649    for ( uint i = simd8_id; i < num_leaves; i+= NUM_SIMD8 )
650    {
651        LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i;
652        uint innerNodeIdx = leaf->inner_node_index;
653        uint bp           = leaf->backpointer;
654        uint leaf_index   = leaf->leaf_index;
655
656        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
657        QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;
658
659        uint childOffs = (((char*)quad) - ((char*)curNode))/64;
660
661        varying struct AABB childrenBox;
662        AABB_init(&childrenBox);
663
664        uint numChildren = (bp >> 3) & 0x7;
665        if (logical_lane < numChildren)
666        {
667
668            refit_bottom_child_quad_WB(
669                (global struct QuadLeaf*) &quad[logical_lane],
670                geomDesc,
671                &childrenBox );
672        }
673
674        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
675        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
676        for (uint i = 1; i < SIMD8_PER_SG; i++)
677        {
678            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
679            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
680            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
681            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
682        }
683
684
685        if( logical_lane == 0 )
686        {
687            struct AABB negated = reduce_bounds;
688            negated.upper = -negated.upper;
689            store_box( negated, innerNodeIdx, local_boxes, extra_boxes );
690        }
691
692        sg_InternalNode_setFields(
693            curNode,
694            reduce_bounds,
695            childOffs,
696            NODE_TYPE_QUAD,
697            &childrenBox,
698            numChildren,
699            0xff );
700
701
702        // atomic min operation vectorized across 6 lanes
703        //    [ lower.xyz ][-][upper.xyz][-]
704        //
705        // Lanes 3 and 7 are inactive.   'upper' is negated
706        uint lmod = logical_lane % 4;
707        uint ldiv = logical_lane / 4;
708        float vlo = reduce_bounds.lower.x;
709        float vhi = reduce_bounds.upper.x;
710        vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
711        vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
712        vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
713        vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
714        float v = (ldiv == 0) ? vlo : -vhi;
715        bool atomic_mask = (1<<logical_lane) & 0x77;
716
717        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
718        uint parent = (bp >> 6);
719
720        // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT
721        if(atomic_mask && parent != 0x03FFFFFF)
722        {
723            while( parent >= SLM_BOX_COUNT )
724            {
725                innerNodeIdx = parent;
726                bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
727                atomic_min( ((global float*) &(extra_boxes[innerNodeIdx-SLM_BOX_COUNT]))+logical_lane, v );
728                parent = bp >> 6;
729            }
730            while( parent != 0x03FFFFFF )
731            {
732                innerNodeIdx = parent;
733                bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
734                atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v );
735                parent = bp >> 6;
736            }
737        }
738
739    }
740
741    if( num_nodes > SLM_BOX_COUNT )
742        mem_fence_workgroup_default();
743
744    barrier( CLK_LOCAL_MEM_FENCE );
745
746    for ( uint i = simd8_id; i < num_inners; i+= NUM_SIMD8 )
747    {
748        InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i;
749        uint node_index  = inner->node_index_and_numchildren>>3;
750        uint numChildren = inner->node_index_and_numchildren & 7;
751        uint first_child = inner->first_child;
752
753        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+ node_index;
754
755        //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders
756        {                                           // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads
757            uint child = first_child + logical_lane;
758
759            bool child_valid = (logical_lane < numChildren);
760
761            struct AABB childAABB;
762            AABB_init(&childAABB);
763            if (child_valid)
764            {
765                childAABB = load_box( child, local_boxes, extra_boxes );
766                childAABB.upper = -childAABB.upper;
767            }
768
769            varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
770            struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
771            for (uint i = 1; i < SIMD8_PER_SG; i++)
772            {
773                struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
774                int3 is_upper_lane = ((uint3)(i)) ==  (get_sub_group_local_id()/8);
775                reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
776                reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
777            }
778
779            sg_InternalNode_setFields(
780                curNode,
781                reduce_bounds,
782                first_child - node_index,
783                NODE_TYPE_INTERNAL,
784                &childAABB,
785                numChildren,
786                0xff );
787        }
788    }
789
790
791    if (get_sub_group_id() == 0 && lane == 0 )
792    {
793        bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x;
794        bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y;
795        bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z;
796        bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x;
797        bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y;
798        bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z;
799    }
800
801}
802#endif
803
804GRL_INLINE void traverse_aabbs_new_update_func(
805        global struct BVHBase* bvh,
806        global char* vertices,
807        global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
808        global struct RefitScratch* scratch,
809        uint vertex_format,
810        local struct AABB3f* children_AABBs,
811        local uint* num_fat_leaves,
812        local struct LeafTableEntry* leafTable_local,
813        const bool single_geo
814    )
815{
816    // The first part of the kernel, the vertex loads/stores, is executed with one quad per work item,
817    // using the previously prepared QuadDataIndices to get the quad data and vertex indices.
818    // The second part of the kernel, which does the reduction, updates the fatleaves in the bvh and runs the bottom-up pass,
819    // is executed per SIMD.
820    // The bottom-up pass was also tested with a local stage (using local scratch), but since there is not enough SLM, additional
821    // barriers were needed to clean and reuse the SLM, which currently kills performance. Could be worth revisiting
822    // on future gens.
823
824    varying uint lid = get_local_id(0);
825    varying uint tid = lid + get_group_id(0)*get_local_size(0);
826
827    num_fat_leaves[0] = 0;
828    leafTable_local[lid].leaf_index = -1 << 3;
829
830    LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid));
831    uint innerNodeIdx_mem = leaf->inner_node_index;
832    uint bp           = leaf->backpointer;
833    uint leaf_index_mem = leaf->leaf_index;
834
835    uint numChildren = (bp >> 3) & 0x7;
836
837    uint leaf_index = leaf_index_mem >> 3;
838    uint slm_child_offset = leaf_index_mem & 0x7;
839
840    uint innerNodeIdx = innerNodeIdx_mem >> 8;
841    uint slm_pos_main = innerNodeIdx_mem & 0xFF;
842
843    uint first_el_of_group = get_group_id(0)*get_local_size(0);
844    uint quadsNum = BVHBase_GetNumQuads(bvh);
845    uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1;
846
847    // Skip writes when not all children of a single fatleaf are present in this work group
848    bool skip_tid = leaf_index == 0x1FFFFFFF;
849    leaf_index = skip_tid ? expected_tid : leaf_index;
850
851    // Compute bounding box for quads
852    varying struct AABB3f childrenBox;
853
854    tid = leaf_index + slm_child_offset;
855
856    // Read vertex indices and quad header from separate buffer
857    uint quadIndicesStart = bvh->quadIndicesDataStart;
858    varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid));
859    QuadDataIndices vertexMap = vertex_indice_ptr[0];
860
861    varying global uint4* bounds =  (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) );
862    uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]);
863    uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]);
864
865    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc;
866
867    if(!single_geo)
868    {
869        uint geomID = vertexMap.header_data[0] & 0xFFFFFF;
870        desc += geomID;
871        vertices = (global char*)desc->Desc.Triangles.pVertexBuffer;
872        vertex_format = desc->Desc.Triangles.VertexFormat;
873    }
874
875    float3 vtx0, vtx1, vtx2, vtx3;
876    GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices);
877
878    for(uint i = 0; i < 3; i++)
879        childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i],vtx3[i]) );
880
881    for(uint i = 0; i < 3; i++)
882        childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i],vtx3[i]) );
883
884    float4 pack0 = (float4) ( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
885    float4 pack1 = (float4) ( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
886    float4 pack2 = (float4) ( vtx2.z, vtx3.x, vtx3.y, vtx3.z );
887
888    // Store quad data in bvh
889    // Make sure these go out as full, non-partial writes to get the best performance
890    store_uint4_L1WB_L3WB( bounds, 0, quad_data );
891    store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) );
892    store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) );
893    store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) );
894
895    barrier( CLK_LOCAL_MEM_FENCE );
896
897    struct AABB reduce_bounds;
898
899    if(!skip_tid)
900    {
901        // Store AABB in SLM, to be used later for children quantization in fatleaf
902        children_AABBs[slm_pos_main + slm_child_offset] = childrenBox;
903
904        if(slm_child_offset == 0)
905        {
906           uint offset = atomic_inc_local(&num_fat_leaves[0]);
907           leafTable_local[offset].inner_node_index = innerNodeIdx_mem;
908           leafTable_local[offset].backpointer = bp;
909           leafTable_local[offset].leaf_index = leaf_index_mem;
910        }
911    }
912
913    barrier( CLK_LOCAL_MEM_FENCE );
914
915    varying ushort lane   = get_sub_group_local_id();
916    ushort SIMD8_PER_SG   = get_sub_group_size()/8;
917    ushort SIMD8_PER_WG   = get_num_sub_groups()*SIMD8_PER_SG;
918    ushort simd8_local_id = get_sub_group_local_id()/8;
919    ushort simd8_id       = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
920    ushort logical_lane   = lane%8;
921
922    uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31;
923
924    for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32)
925    {
926        uniform uint fatleaf_index = simd8_id + offset;
927        uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index;
928        uint bp           = leafTable_local[fatleaf_index].backpointer;
929        uint leaf_index_mem   = leafTable_local[fatleaf_index].leaf_index;
930
931        uint numChildren = (bp >> 3) & 0x7;
932
933        uint leaf_index = leaf_index_mem >> 3;
934        uint slm_child_offset = leaf_index_mem & 0x7;
935
936        uint innerNodeIdx = innerNodeIdx_mem >> 8;
937        uint slm_pos_main = innerNodeIdx_mem & 0xFF;
938
939        bool skip_tid = leaf_index == 0x1FFFFFFF;
940        bool active_lane = (logical_lane < numChildren);
941        uint lane_children = active_lane ? logical_lane : 0;
942
943        fatleaf_index = leaf_index;
944
945        varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx));
946
947        global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart );
948
949        varying struct AABB childrenBox_bu;
950        AABB_init(&childrenBox_bu);
951
952        if(!skip_tid)
953            childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]);
954
955        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu);
956        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0,0);
957
958        for (uint i = 1; i < SIMD8_PER_SG; i++)
959        {
960            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
961            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
962            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
963            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
964        }
965
966        if(!skip_tid)
967        {
968            uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index;
969            varying QuadLeaf* quad =  (QuadLeaf*)(((char*)bvh) + quad_offset);
970            uint childOffs = (((char*)quad) - ((char*)curNode))/64;
971
972            sg_InternalNode_setFields(
973            curNode,
974            reduce_bounds,
975            childOffs,
976            NODE_TYPE_QUAD,
977            &childrenBox_bu,
978            numChildren,
979            0xff );
980
981            bool atomic_mask = (1<<logical_lane) & 0x77;
982
983            uint lmod = logical_lane % 4;
984            uint ldiv = logical_lane / 4;
985            float vlo = reduce_bounds.lower.x;
986            float vhi = reduce_bounds.upper.x;
987            vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
988            vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
989            vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
990            vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
991            float v = (ldiv == 0) ? vlo : -vhi;
992
993            global float* pv = (global float*) &scratch[innerNodeIdx];
994
995            store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
996
997            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
998            uint parent = (bp >> 6);
999
1000            global float* parent_v = (global float*) &(scratch[parent]) + logical_lane;
1001
1002            if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF))
1003            {
1004                innerNodeIdx = parent;
1005                bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
1006                atomic_min( parent_v, v );
1007                parent = bp >> 6;
1008
1009                if(parent != 0x03FFFFFF)
1010                {
1011                    while( parent != 0x03FFFFFF )
1012                    {
1013                        innerNodeIdx = parent;
1014                        bp =  *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
1015
1016                        global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane;
1017                        if(*parent_v_global >= v)
1018                            atomic_min( parent_v_global, v );
1019                        else
1020                            break;
1021
1022                        parent = bp >> 6;
1023                    }
1024                }
1025            }
1026        }
1027    }
1028}
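// Note (hedged): the non-atomic read of the parent's scratch slot in the loop above is an early-out
// heuristic. If the parent already holds a value <= v, the lane that wrote it appears to be
// responsible for continuing the propagation with its smaller value, so this lane stops; otherwise
// atomic_min publishes v and the walk follows the backpointers until the root sentinel 0x03FFFFFF
// is reached.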
1029
1030GRL_ANNOTATE_IGC_DO_NOT_SPILL
1031__attribute__((reqd_work_group_size(256, 1, 1)))
1032__attribute__( (intel_reqd_sub_group_size( 16 )) )
1033void kernel
1034traverse_aabbs_new_update(
1035        global struct BVHBase* bvh,
1036        global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
1037        global struct RefitScratch* scratch
1038    )
1039{
1040    varying uint lid = get_local_id(0);
1041    varying uint tid = lid + get_group_id(0)*get_local_size(0);
1042
1043    local struct AABB3f children_AABBs[256];
1044    local struct LeafTableEntry leafTable_local[256];
1045    local uint num_fat_leaves;
1046
1047    traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */,
1048        &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false);
1049}
1050
1051GRL_ANNOTATE_IGC_DO_NOT_SPILL
1052__attribute__((reqd_work_group_size(256, 1, 1)))
1053__attribute__( (intel_reqd_sub_group_size( 16 )) )
1054void kernel
1055traverse_aabbs_new_update_single_geo(
1056        global struct BVHBase* bvh,
1057        global char* vertices,
1058        global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
1059        global struct RefitScratch* scratch,
1060        const uint vertex_format
1061    )
1062{
1063    varying uint lid = get_local_id(0);
1064    varying uint tid = lid + get_group_id(0)*get_local_size(0);
1065
1066    local struct AABB3f children_AABBs[256];
1067    local struct LeafTableEntry leafTable_local[256];
1068    local uint num_fat_leaves;
1069
1070    if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
1071      traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT,
1072          &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1073    else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
1074        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT,
1075            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1076    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
1077        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT,
1078            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1079    else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
1080        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT,
1081            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1082    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
1083        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM,
1084            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1085    else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM)
1086        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM,
1087            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1088    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
1089        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM,
1090            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1091    else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM)
1092        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM,
1093            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1094    else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
1095        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM,
1096            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1097    else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
1098        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM,
1099            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1100    else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM)
1101        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM,
1102            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1103    else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
1104        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_SNORM,
1105            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1106    else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM)
1107        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM,
1108            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1109    else
1110        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1,
1111            &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
1112}
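// Note (hedged): the if/else ladder above passes each supported VERTEX_FORMAT_* constant as a
// literal, presumably so the vertex decode in GRL_load_quad_vertices_no_stride can be specialized
// per format at compile time; unrecognized formats fall through to the (uint)-1 path.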
1113