//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

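/*
  Collapses one node of the binary Morton hierarchy into a single QBVH6 node.
  Executed cooperatively by one subgroup: each lane tracks one prospective
  child of the node being built.
*/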
GRL_INLINE void SUBGROUP_create_node_phase0(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform global uint *global_refit_startpoints,
    uniform uint rID,
    uniform local uint* local_numRecords,
    uniform local uint* local_QNodeOffset,
    uniform global struct BuildRecordMorton* records,
    uniform struct BuildRecordMorton current,
    uniform local uint* local_startpoints_num)
{
    uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    uniform const uint rootNodeOffset = BVH_ROOT_NODE_OFFSET;
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordMorton sg_children;
    sg_children.items = 0;
    sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;

    if ( lane < numChildren )
        sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );

    /* fill QBVH6 node with up to 6 children */
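    /* greedy expansion: repeatedly take the child with the most primitives and
       split it into its two binary children, until the node has 6 children or
       every remaining child is small enough to become a leaf */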
    while ( numChildren < BVH_NODE_N6 )
    {
        varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
        if ( sub_group_all( sg_is_leaf ) )
            break;

        uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
        uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
        uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );

        varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;

        if ( lane == numChildren || lane == bestChild )
        {
            sg_children.nodeID = nodeID;
            sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
        }

        numChildren++;
    }

    const uint current_index = current.current_index;
    struct QBVHNodeN* qnode = nodeData + current_index;
    SUBGROUP_QBVHNodeN_setChildIncr1( qnode );

    uniform uint global_offset;
    uniform uint child_node_offset;

    // Check if all children will be roots of the local subtrees in phase1. If so, we keep
    // the node ids to be used later in the global refit after phase1.
    varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
    uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);

    if ( lane == 0 )
    {
        child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);

        /* create the node, but do not set its bounds yet as these get calculated during refit */
        QBVH6Node_set_type( qnode, BVH_INTERNAL_NODE );
        QBVH6Node_set_offset( qnode, (global struct QBVHNodeN*)(bvh_mem + child_node_offset) );
        /* set back pointers */
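        // backpointer layout: bits 31..6 hold the parent index, bits 5..3 the
        // child count, and bits 2..0 count the children already refitted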
        uint backpointer = (current.parent_index << 6) | (numChildren << 3);

        global_offset = atomic_add_local( local_numRecords, numChildren - 1 );

#if MORTON_VERBOSE_LOG
        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
               rID, current_index, current_index + qnode->offset, qnode->offset, current.parent_index, numChildren);
#endif

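        // If every child is a phase1 subtree root, this node is a startpoint for
        // the global bottom-up refit that runs after phase1. Otherwise, pre-count
        // the phase1-root children in the backpointer so its refit counter still
        // completes once the remaining children propagate their refit upward.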
        if(children_roots_num == numChildren)
        {
            uint startpoints_offset = atomic_inc_local( local_startpoints_num );
            global_refit_startpoints[startpoints_offset] = current_index;
        }
        else
        {
            backpointer += children_roots_num;
        }

        *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
    }

    child_node_offset = sub_group_broadcast( child_node_offset, 0 );
    global_offset = sub_group_broadcast( global_offset, 0 );

    uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);

    sg_children.current_index = childNodes - nodeData + lane;
    sg_children.parent_index = current_index;

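    /* lane 0 reuses the parent's slot in the record queue; the other children
       are appended at the tail that was just reserved */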
    if ( lane < numChildren )
    {
        uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
        records[write_position] = sg_children;
    }
}


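/*
  Variant of SUBGROUP_create_node_phase0 for the locally synchronized refit:
  instead of writing qnodes into the BVH, it emits flattened boxless nodes to
  global memory for phase2 to consume.
*/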
GRL_INLINE void SUBGROUP_create_node_phase0_local_sync(
    uniform global struct Globals* globals,
    uniform global struct BinaryMortonCodeHierarchy* bnodes,
    uniform global char* bvh_mem,
    uniform uint rID,
    uniform local uint* local_numRecords,
    uniform local uint* local_QNodeOffset,
    uniform global struct BuildRecordMorton* records,
    uniform struct BuildRecordMorton current,
    uniform local uint* local_p0_total,
    uniform global struct MortonFlattenedBoxlessNode *boxless_nodes,
    uniform uint nodeDataStart)
{
    uniform global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    uniform const uint rootNodeOffset = bvh->rootNodeOffset;
    uniform global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    uniform BackPointers* backPointers = BVHBase_GetBackPointers( bvh );

    varying ushort lane = get_sub_group_local_id();

    /* initialize child array */
    uniform uint numChildren = 2;
    varying struct BuildRecordMorton sg_children;
    sg_children.items = 0;
    sg_children.nodeID = (lane == 0) ? bnodes[current.nodeID].leftChild : bnodes[current.nodeID].rightChild;

    if ( lane < numChildren )
        sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, sg_children.nodeID );

    /* fill QBVH6 node with up to 6 children */
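    /* same greedy expansion as in SUBGROUP_create_node_phase0 above */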
    while ( numChildren < BVH_NODE_N6 )
    {
        varying bool sg_is_leaf = sg_children.items <= cfg_minLeafSize;
        if ( sub_group_all( sg_is_leaf ) )
            break;

        uniform uint bestItems = sub_group_reduce_max_N6( sg_children.items );
        uniform ushort bestChild = ctz( intel_sub_group_ballot( sg_children.items == bestItems ) );
        uniform uint bestNodeID = sub_group_broadcast( sg_children.nodeID, bestChild );

        varying uint nodeID = (lane == bestChild) ? bnodes[bestNodeID].leftChild : bnodes[bestNodeID].rightChild;

        if ( lane == numChildren || lane == bestChild )
        {
            sg_children.nodeID = nodeID;
            sg_children.items = BinaryMortonCodeHierarchy_getNumPrimitives( bnodes, nodeID );
        }

        numChildren++;
    }

    const uint current_index = current.current_index;
    uniform uint global_offset;
    uniform uint child_node_offset;

    // Check if all children will be roots of the local subtrees in phase1. If so, we keep
    // the node ids to be used later in the global refit after phase1.
    varying uchar is_children_root = (lane < numChildren) ? (sg_children.items <= MORTON_BUILDER_SUBTREE_THRESHOLD) : 0;
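    // Pack the per-lane root flags into a 6-bit mask so phase2 can tell which
    // children are phase1 subtree roots.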
    uniform uchar rootMask = sub_group_reduce_or_N6(is_children_root << lane);
    uniform uchar children_roots_num = sub_group_reduce_add(is_children_root);

    if ( lane == 0 )
    {
        child_node_offset = atomic_add_local(local_QNodeOffset,64*numChildren);

        /* Do not create qnodes here */
        uint backpointer = (current.parent_index << 6) | (numChildren << 3);

        global_offset = atomic_add_local( local_numRecords, numChildren - 1 );

#if MORTON_VERBOSE_LOG
        // no qnode is created in this variant, so derive the child index
        // directly from the allocated offset
        printf("PHASE0: loc_id: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, nodeDataStart: %d\n",
               rID, current_index, (child_node_offset / 64) - nodeDataStart,
               ((child_node_offset / 64) - nodeDataStart) - current_index,
               current.parent_index, numChildren, nodeDataStart);
#endif

        MortonFlattenedBoxlessNode flattened_node;

        if(children_roots_num != numChildren)
            backpointer += children_roots_num;

        flattened_node.binary_hierarchy_index = (current_index << 6) | rootMask;

        uint loc_id = atomic_inc_local( local_p0_total );

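        // childOffset_type: offset of the first child relative to this node,
        // in 64-byte node units, shifted left by 6, with the node type in the
        // low bits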
        flattened_node.childOffset_type = ((((child_node_offset - nodeDataStart * 64) / 64) - current_index) << 6) | BVH_INTERNAL_NODE;
        flattened_node.backPointer = backpointer;

        // TODO: change these writes to L1WB or streaming
        boxless_nodes[loc_id] = flattened_node;

        *InnerNode_GetBackPointer(backPointers, current_index) = backpointer;
    }

    child_node_offset = sub_group_broadcast( child_node_offset, 0 );
    global_offset = sub_group_broadcast( global_offset, 0 );

    uniform global struct QBVHNodeN* childNodes = (global struct QBVHNodeN*)(bvh_mem + child_node_offset);

    sg_children.current_index = childNodes - nodeData + lane;
    sg_children.parent_index = current_index;

    if ( lane < numChildren )
    {
        uint write_position = (lane == 0) ? rID : global_offset + lane - 1;
        records[write_position] = sg_children;
    }
}

/*

  In this phase a single large work group performs the construction of
  the top of the BVH and creates a build record array.

  There are two variants of this kernel:
  1. Refit with global synchronization - used for big BVHs, where the number of
     allocated nodes will not fit in SLM in phase2. Phase0 creates qnodes in the
     BVH and provides startpoints for the bottom-up pass executed after phase1.
     This refit relies on global synchronization and mem_fence_gpu_invalidate,
     which is inefficient.
  2. Refit with local synchronization - flattened boxless nodes are passed via
     global memory, along with the number of created nodes. Phase0 does not
     create qnodes in the BVH; that is done in phase2 during the refit. In
     phase2, the flattened boxless nodes are moved to SLM, along with the
     bounding boxes from phase1. The refit then needs only local synchronization.

*/

__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0(global struct Globals *globals,
                      global struct BinaryMortonCodeHierarchy *bnodes,
                      global char *bvh_mem,
                      global uint *global_refit_startpoints)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);

    /* a queue of build records in global memory */
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
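    /* the queue is placed in the area that will later hold the quad leaves,
       which is unused while phase0 runs (phase0 allocates only inner nodes) */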
    local uint local_numRecords;
    local uint local_QNodeOffset;
    local uint local_startpoints_num;

    /* initialize first build record */
    if (get_local_id(0) == 0)
    {
        /* allocate root node */
        uint root_node_offset = 64*bvh->nodeDataCur;
        global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);

        //assert(root_node_offset == 0);
        records[0].nodeID = globals->binary_hierarchy_root;
        records[0].items = globals->numPrimitives;
        records[0].current_index = rootNode - nodeData;
        records[0].parent_index = -1;

        local_numRecords = 1;
        local_QNodeOffset = root_node_offset + 64;
        local_startpoints_num = 0;

        mem_fence_workgroup_default();
    }

    uint num_records = 1;

    /* terminate when all subtrees are under the size threshold */
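    /* breadth-first expansion: each pass walks the current record list and
       appends new records for the children it creates; the loop reaches a
       fixed point when no record above the threshold remains */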
    while(true)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        /* all work items in the work group pick a subtree to build */
        for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
        {
            /* small subtrees will get built in the next phase */
            if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
                continue;

            /* create QBVH node */
            SUBGROUP_create_node_phase0(globals, bnodes, bvh_mem, global_refit_startpoints, ID, &local_numRecords, &local_QNodeOffset,
                                        records, records[ID], &local_startpoints_num);
        }

        work_group_barrier( CLK_LOCAL_MEM_FENCE );
        mem_fence_workgroup_default();
        uint old_num_records = num_records;
        num_records = local_numRecords;
        if( old_num_records == num_records )
            break;
    }

    /* remember the number of build records for the next phase */
    if (get_local_id( 0 ) == 0)
    {
        globals->numBuildRecords = local_numRecords;
        globals->p0_created_num = local_startpoints_num;
        bvh->nodeDataCur = local_QNodeOffset / 64;

#if MORTON_VERBOSE_LOG
        printf("PHASE_0: allocated %d nodes. globals->p0_created_num: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
#endif
    }
}

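/*
  Locally synchronized variant: identical top-of-tree construction, but node
  writes are deferred to phase2 via the flattened boxless node array.
*/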
__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
parallel_build_phase0_local_sync(global struct Globals *globals,
                      global struct BinaryMortonCodeHierarchy *bnodes,
                      global char *bvh_mem,
                      global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
    global struct BVHBase *bvh = (global struct BVHBase *)bvh_mem;
    global struct QBVHNodeN *nodeData = BVHBase_nodeData(bvh);
    uint nodeDataStart = BVH_ROOT_NODE_OFFSET / 64;

    /* a queue of build records in global memory */
    global struct BuildRecordMorton *records = (global struct BuildRecordMorton *)(bvh_mem + 64*bvh->quadLeafStart);
    local uint local_numRecords;
    local uint local_QNodeOffset;
    local uint local_p0_total;

    /* initialize first build record */
    if (get_local_id(0) == 0)
    {
        /* allocate root node */
        uint root_node_offset = 64*bvh->nodeDataCur;
        global struct QBVHNodeN *rootNode = (global struct QBVHNodeN *)(bvh_mem + root_node_offset);

        //assert(root_node_offset == 0);
        records[0].nodeID = globals->binary_hierarchy_root;
        records[0].items = globals->numPrimitives;
        records[0].current_index = rootNode - nodeData;
        records[0].parent_index = -1;

        local_numRecords = 1;
        local_QNodeOffset = root_node_offset + 64;
        local_p0_total = 0;

        mem_fence_workgroup_default();
    }

    uint num_records = 1;

    /* terminate when all subtrees are under the size threshold */
    while(true)
    {
        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        /* all work items in the work group pick a subtree to build */
        for (uint ID = get_sub_group_id(); ID < num_records; ID += get_num_sub_groups() )
        {
            /* small subtrees will get built in the next phase */
            if (records[ID].items <= MORTON_BUILDER_SUBTREE_THRESHOLD) // FIXME: should break at 64 leaves not 64 primitives
                continue;

            /* create QBVH node */
            SUBGROUP_create_node_phase0_local_sync(globals, bnodes, bvh_mem, ID, &local_numRecords, &local_QNodeOffset, records,
                                                   records[ID], &local_p0_total, boxless_nodes, nodeDataStart);
        }

        mem_fence_workgroup_default();
        work_group_barrier( CLK_LOCAL_MEM_FENCE );

        uint old_num_records = num_records;
        num_records = local_numRecords;
        if( old_num_records == num_records )
            break;
    }

    /* remember the number of build records for the next phase */
    if (get_local_id( 0 ) == 0)
    {
        globals->numBuildRecords = local_numRecords;
        bvh->nodeDataCur = local_QNodeOffset / 64;

        globals->p0_allocated_num = BVHBase_numNodes(bvh);
        globals->p0_created_num = local_p0_total;

#if MORTON_VERBOSE_LOG
        printf("PHASE_0_LOCAL_SYNC: allocated %d nodes. globals->p0_created_num: %d\n", BVHBase_numNodes(bvh), globals->p0_created_num);
#endif
    }
}
