//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "bvh_build_refit.h"
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

/*

  POSTSORT PHASE2:
  Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very big value.
  1. parallel_build_phase2_refit - performs the refit using global synchronization and mem_fence_gpu_invalidate.
                                   This kernel should be used only for a very big bvh; it is faster than the non-SLM
                                   fallback in parallel_build_phase2_refit_local.
  2. parallel_build_phase2_refit_local - should be used for most cases; the nodes allocated in phase0 usually fit
                                   into SLM. A non-SLM fallback is also present, because the decision on which
                                   kernel to run is based on the node-count estimates made on the host side.

*/


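// Refits a single inner node recorded as a phase1 startpoint: gathers its children's AABBs,
// reduces them across the sub-group, updates the node's bounds and instance mask, and then
// continues the bottom-up refit towards the root.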
GRL_INLINE void refit_bottom_up_global_sync(
    global char* bvh_mem,
    global uint* global_refit_startpoints,
    uniform uint nodeId,
    uniform ushort lane)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;

    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );

    // Get the node idx that was put here in phase1
    const uint innerNodeIdx = global_refit_startpoints[nodeId];

    // Get the qnode and backpointer
    uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
    uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);

    varying struct AABB childrenAABB; // one child AABB per lane
    AABB_init(&childrenAABB);

    uniform uint numChildren = (backPointer >> 3) & 0x7;
    if(numChildren == 0) return;

    global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
    varying ushort child_idx = (lane < numChildren) ? lane : 0;
    childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

#if MORTON_VERBOSE_LOG
    if(lane == 0)
        printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
#endif

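    // Reduce the per-lane child boxes across the sub-group and broadcast the merged AABB
    // from lane 0 so every lane holds the full parent bounds before they are written into the qnode.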
    struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
    reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );

    subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);

    uint children_mask = qnode_child[child_idx].instMask;
    qnode->instMask = sub_group_reduce_or_N6(children_mask);

    SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
}

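// One sub-group (work group of 16) per refit startpoint recorded in phase1:
// the group id selects the startpoint, the local id is the lane within the sub-group.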
__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
parallel_build_phase2_refit( global char* bvh_mem,
    global uint* global_refit_startpoints )
{
    refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
}


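// Walks from curNodeIndex up towards the root, re-reducing child bounds and instance masks at
// every node on the path. A parent is claimed through an atomic increment of the refit counter
// in its backpointer, so only the last sub-group to finish one of its children keeps climbing.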
GRL_INLINE void SUBGROUP_refit_bottom_up_global(
    uniform global struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    varying ushort lane,
    varying uint curNodeIndex)
{
    uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);

    const uint head_lane = 0;
    uniform struct AABB child_aabb; // this carries reduced aabb between loop turns

    while (curNodeIndex != 0)
    {
        global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        uint parentIndex = BackPointer_GetParentIndex(backpointer);

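        // Flush and invalidate caches so this sub-group's qnode updates become visible to other
        // work groups, and so the parent's children (possibly written by other work groups) are
        // re-read below instead of being served from stale cache lines.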
        mem_fence_gpu_invalidate();

        if (lane == 0)
        {
            backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));

            uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
                   curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
#endif
        }

        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
        curNodeIndex = parentIndex;

        /* if all children got refitted, then continue */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;

        if (numChildrenRefitted != numChildrenTotal)
                return;
    }


    // process root of the treelet
    {

#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_global: this should be node index 0\n");
#endif

        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
                   curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren);
#endif
        }
    }
}


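// Single-kernel phase2 path: consumes the flattened boxless nodes produced in phase0 and finishes
// their refit. When the phase0 allocation fits in SLM the refit runs entirely out of SLM; otherwise
// it falls back to a global-memory refit with atomic backpointer synchronization.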
// TODO: Check why 512 wg size has worse performance than 256
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase2_refit_local( global struct Globals* globals,
    global char* bvh_mem,
    global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
    // Number of nodes created in P0, to be refitted in this stage
    uint p0_created_num = globals->p0_created_num;

    // Return immediately if the host launched this kernel but there is nothing to do
    if(p0_created_num == 0)
        return;

    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    varying ushort lane = get_sub_group_local_id();

    // Hardcode SLM usage to the max here, as we do not know upfront how much memory will be needed
    local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */

    // Number of allocated nodes in phase0 (p0_created_num + children)
    uint p0_allocated_num = globals->p0_allocated_num;

    // array that keeps 2x8 short indices of sub-group bottom-up startpoints
    varying uint sg_fatleaf_array = 0x0;
    uniform uint8_t sg_bu_startpoints_cnt = 0;

    // Determine whether all the nodes allocated in phase0 fit into SLM.
    // There are two paths here:
    // 1. Copy all the needed flattened nodes and bounding boxes to SLM and reuse the bottom-up
    //    local path, which does the refit and creates the qnodes in the bvh.
    // 2. If they do not fit into SLM, first create the qnodes in the bvh, then perform the bottom-up
    //    refit with global atomics synchronization. This is not performant, so it is kept only as a
    //    guardrail here; on the host side we fall back to the old separated refit path, with
    //    wg_size 8 and better EU reuse.
    if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
    {
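        // SLM path: stage each flattened node and the AABBs of its phase1 subtree-root children
        // in perNodeData, then run the sub-group bottom-up refit entirely out of SLM.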
        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
        {
            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
            uint current_id = boxless_node.binary_hierarchy_index >> 6;

            // The mask of children that are subtree roots was packed into the otherwise unused low bits of binary_hierarchy_index
            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);

            if(lane == 0)
                perNodeData[current_id].boxlessNode = boxless_node;

            // When no children are subtree roots, we are done and skip to the next iteration
            if(children_root_mask == 0x0)
            {
                continue;
            }
            // When all children are subtree roots, record this node in sg_fatleaf_array as a bottom-up startpoint
            else if(children_root_mask == 0x3F)
            {
                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
            }

            uniform global struct QBVHNodeN* qnode = nodeData + current_id;

            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
            varying ushort child_idx = (lane < numChildren) ? lane : 0;

            varying struct AABB childrenAABB; // one child AABB per lane
            AABB_init(&childrenAABB);

            uint lead_child_global_id = current_id + lead_child_offset;

            uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
            childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

            // Get only AABBs of children that are p1 subtree roots
            bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
            if(lane_active)
            {
                uint child_global_id = lead_child_global_id + child_idx;
                perNodeData[child_global_id].box = childrenAABB;
                // stash this child's instMask in the unused .w component of the cached box
                perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child[child_idx].instMask);

            }

#if MORTON_VERBOSE_LOG
            if(lane == 0)
                printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
#endif
        }

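        // All sub-groups must have published their nodes and child boxes to SLM before any
        // sub-group starts the local bottom-up refit.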
265
266        SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
267    }
268    else
269    {
270        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
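        // Non-SLM fallback: write each qnode's type and child offset directly into the bvh,
        // then refit bottom-up through global memory with atomic backpointer synchronization.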
271        {
272            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
273            uint current_id = boxless_node.binary_hierarchy_index >> 6;
274
275            // Put the mask for the children that are subtree roots in the binary_hierarchy_index that is unused
276            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
277            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
278
279            uniform global struct QBVHNodeN* qnode = nodeData + current_id;
280            uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
281            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
282
283            SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
284            if(lane == 0)
285            {
286                QBVH6Node_set_type( qnode, nodeType );
287                qnode->offset = lead_child_offset;
288            }
289
290            // When no children are subtree roots, we are done and skip to the next iteration
291            if(children_root_mask == 0x0)
292            {
293                continue;
294            }
295            // When all children are subtree roots, put them to sg_fatleaf_array
296            else if(children_root_mask == 0x3F)
297            {
298                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
299            }
300
301#if MORTON_VERBOSE_LOG
302            if(lane == 0)
303                printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, child_idx: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, boxless_node.backPointer >> 6, reduce_bounds.lower.x, qnode->offset);
304#endif
305        }
306
307        while (sg_bu_startpoints_cnt > 0)
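        // Drain the collected startpoints: each entry starts a bottom-up walk towards the root,
        // synchronized with other sub-groups through the global backpointer counters.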
308        {
309            uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);
310
311            SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
312        }
313    }
314}
315