xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/morton/morton_common.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 //
2 // Copyright (C) 2009-2022 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7 
8 #include "common.h"
9 
10 #define MORTON_DEBUG_CHECKS 0
11 #define MORTON_VERBOSE_LOG 0
12 
get_morton_sort_lsb_req_iterations(uint shift)13 GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
14 {
15 #if 0 // turn off, because current hierarchy build requires full sort
16     // Difference between max iterations needed for LSB sorting and
17     // number of iterations needed for LSB sorting without primIDs
18     // This indicates how many of first iterations would be skipped in LSB
19     return 8 - (8 - (shift >> 3));
20 #else
21     return 0;
22 #endif
23 }
24 
// Work record for flattening a morton-built subtree; indices marked
// "global" address the shared node array rather than the local treelet.
typedef struct BuildRecordLocalMortonFlattener
{
    unsigned int leftChild;  // global
    unsigned int rightChild; // global
    unsigned int rangeStart; // global
    unsigned int local_parent_index__numItems; // packed: local parent index + item count (exact bit split not visible here — confirm against the flattener kernels)
} BuildRecordLocalMortonFlattener;
32 
// TODO: Currently sizeof UPerNodeData is 32, AABB struct allocates more data than needed and can be reduced
// Per-node SLM slot: the same 32 bytes are reinterpreted as a build record,
// a boxless node, or a child-bounds AABB depending on the build phase.
typedef union UPerNodeData {
    float4                           four_DWs;
    BuildRecordLocalMortonFlattener  buildRecord;
    MortonFlattenedBoxlessNode             boxlessNode;
    struct AABB                      box;
} UPerNodeData;
40 
MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)41 GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
42 {
43     return bn.childOffset_type >> 6;
44 }
45 
MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)46 GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
47 {
48     return bn.childOffset_type & ((1<<6) -1);
49 }
50 
set_2xSG_arr_first_write(uint index,uint * arr,ushort val,short lane)51 GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
52 {
53     short lane_used = index % get_sub_group_size();
54     short shift = (index / get_sub_group_size()) * get_sub_group_size();
55     if (lane_used == lane) {
56         *arr |= (val << shift);
57     }
58 }
59 
get_from_2xSG_arr(uint index,uint arr,short lane)60 GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
61 {
62     short r = 0;
63     short lane_used = index % get_sub_group_size();
64     short shift =    (index / get_sub_group_size()) * get_sub_group_size();
65         r = arr >> shift;
66     r = sub_group_broadcast(r, lane_used);
67     return r;
68 }
69 
unpack_from_2xSG_arr(uint count,uint arr,short lane,ushort * dst)70 GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
71 {
72     if (lane < count)
73     {
74         dst[lane]=(ushort)(arr & 0xFFFF);
75         short hi_idx = lane + get_sub_group_size();
76         if (hi_idx < count) {
77             dst[hi_idx] = (ushort)(arr >> 16);
78         }
79     }
80 }
81 
82 
pack_from_2xSG_arr(ushort * src,uint count,uint * arr,short lane)83 GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
84 {
85     if (lane < count)
86     {
87         *arr = src[lane];
88         short hi_idx = lane + get_sub_group_size();
89         if (hi_idx < count) {
90             *arr |= ((uint)(src[hi_idx])) << 16u;
91         }
92     }
93 }
94 
set_2xSG_arr(uint index,uint * arr,short val,short lane)95 GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
96 {
97     short lane_used = index % get_sub_group_size();
98     short shift = (index / get_sub_group_size()) * get_sub_group_size();
99     if (lane_used == lane) {
100         uint rem_val = (*arr) & (0xFFFF0000 >> shift); //calculate the ramaining other half in the uint
101         *arr = (val << shift) | rem_val;
102     }
103 }
104 
// Subgroup-cooperative bottom-up refit of one treelet kept in SLM.
//
// Starting from leaf-adjacent nodes (their local indices packed into the
// per-lane 2xSG array `sg_bu_startpoints`), the subgroup walks parent links
// upward: each step reduces the children's AABBs, writes the merged box back
// into the node's SLM slot and into the corresponding global QBVH node, and
// bumps the parent's refit counter. A node is ascended through only once all
// of its children have been refitted; otherwise the subgroup switches to the
// next start point. Local node index 0 is the treelet root and is finished
// in a separate epilogue that also resets the refit counter.
//
// globalNodeData            - global QBVH node array to write refitted nodes into
// backPointers              - global back-pointer array (parent<<6 | numChildren<<3 | refit count)
// treeletRootGlobalIndex    - global node index of this treelet's root
// globalBaseForInternalNodes- global index of local node 1 minus 1 (local i -> global base + i)
// lane                      - this work item's subgroup lane id
// local_nodes               - SLM array of per-node slots (boxless node / AABB union)
// sg_bu_startpoints         - packed local indices of refit start points (2xSG array)
// sg_bu_startpoints_cnt     - number of valid entries in sg_bu_startpoints
GRL_INLINE void SUBGROUP_refit_bottom_up_local(
    uniform struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    uniform uint treeletRootGlobalIndex,
    uniform uint globalBaseForInternalNodes,
    varying ushort lane,
    uniform local union UPerNodeData* local_nodes,
    varying uint sg_bu_startpoints,
    uniform uint sg_bu_startpoints_cnt)
{
    if(sg_bu_startpoints_cnt == 0)
        return;

    const uint head_lane = 0;
    // Pop the first start point (shared across the subgroup by broadcast).
    uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);

    uniform uint prev_loc_index = 0;
    uniform struct AABB child_aabb; // this carries reduced aabb between loop turns

    uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;

    while (curNodeIndex != 0)
    {
        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
        uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        // One child per lane; children are stored contiguously after the offset.
        varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;

        uint numChildren = BackPointer_GetNumChildren(backpointer);
        // Skip the reload when this lane's child is the node we just reduced
        // (its box is already in child_aabb from the previous iteration).
        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            // Neutral element so inactive lanes don't affect the reduction.
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        // Subgroup-reduce the (up to 6) child boxes, then broadcast from lane 0.
        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );

        // Instance mask is carried in the bit pattern of lower.w and OR-reduced.
        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        reduced_bounds.lower.w = as_float((uint)instMask);
        uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
        // Store the merged box back into this node's SLM slot, one dword per lane.
        local uint* pbox = (local uint*)(local_nodes+ curNodeIndex);
        if (lane < 8)
        {
            pbox[lane] = reduce_bounds_lane;
        }

        uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
        subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
        // Carry the merged box into the next (parent) iteration.
        child_aabb = reduced_bounds;
        uint parentIndex = BackPointer_GetParentIndex(backpointer);

        // Make the SLM box visible before the parent's refit counter is bumped.
        write_mem_fence(CLK_LOCAL_MEM_FENCE);

        if (lane == 0)
        {
            // Bump the parent's refit counter; the returned (pre-increment)
            // value is this node's view of the parent's back pointer.
            backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
            uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
            uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
                   global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
#endif
        }

        // Broadcast lane 0's view; +1 accounts for this node's own increment.
        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
        prev_loc_index = curNodeIndex;
        curNodeIndex = parentIndex;

        /* if all children got refitted, then continue */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
        if (numChildrenRefitted != numChildrenTotal)
        {
            // Parent not ready yet: another path will ascend through it.
            // Switch to the next start point, or finish if none remain.
            if(sg_bu_startpoints_cnt)
            {
                curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
                backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
            }
            else
                return;
        }
    }

    // process root of the treelet
    {

#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
#endif

        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
        varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            // Neutral element for lanes beyond the root's child count.
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        // The root writes to its externally assigned global slot, not base+0.
        uint global_node_idx = treeletRootGlobalIndex;
        uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;

        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;

        subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);

            // TODO: Move AABBs to separate buffer, but for now communicate bottom-tip boxes through qnodes

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
                   curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
#endif
        }
    }
}
246