//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "common.h"

#define MORTON_DEBUG_CHECKS 0
#define MORTON_VERBOSE_LOG 0

GRL_INLINE uint get_morton_sort_lsb_req_iterations( uint shift )
{
#if 0 // turned off, because the current hierarchy build requires a full sort
    // Difference between the max number of iterations needed for LSB sorting and
    // the number of iterations needed for LSB sorting without primIDs.
    // This indicates how many of the first iterations can be skipped in LSB.
    return 8 - (8 - (shift >> 3));
#else
    return 0;
#endif
}

typedef struct BuildRecordLocalMortonFlattener
{
    unsigned int leftChild;  // global
    unsigned int rightChild; // global
    unsigned int rangeStart; // global
    unsigned int local_parent_index__numItems; // packed: local parent index and number of items
} BuildRecordLocalMortonFlattener;

// TODO: Currently sizeof(UPerNodeData) is 32; the AABB struct allocates more data than needed and could be reduced
typedef union UPerNodeData {
    float4 four_DWs;
    BuildRecordLocalMortonFlattener buildRecord;
    MortonFlattenedBoxlessNode boxlessNode;
    struct AABB box;
} UPerNodeData;

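// childOffset_type packs two fields into one dword: the offset from the node to its
// first child in the upper bits, and the node type in the lower 6 bits.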
GRL_INLINE uint MortonFlattenedBoxlessNode_GetChildOffset(MortonFlattenedBoxlessNode bn)
{
    return bn.childOffset_type >> 6;
}

GRL_INLINE uint MortonFlattenedBoxlessNode_GetType(MortonFlattenedBoxlessNode bn)
{
    return bn.childOffset_type & ((1 << 6) - 1);
}

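// The "2xSG" helpers below treat one uint per sub-group lane as a virtual array of
// 2 * sub_group_size 16-bit entries: entry i lives in lane (i % sub_group_size), in
// the low or high half selected by (i / sub_group_size). Note that the shift
// computation assumes a sub-group size of 16, so the high half starts at bit 16.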
GRL_INLINE void set_2xSG_arr_first_write(uint index, uint* arr, ushort val, short lane)
{
    short lane_used = index % get_sub_group_size();
    short shift = (index / get_sub_group_size()) * get_sub_group_size();
    if (lane_used == lane) {
        *arr |= (val << shift);
    }
}

GRL_INLINE short get_from_2xSG_arr(uint index, uint arr, short lane)
{
    short r = 0;
    short lane_used = index % get_sub_group_size();
    short shift = (index / get_sub_group_size()) * get_sub_group_size();
    r = arr >> shift;
    r = sub_group_broadcast(r, lane_used);
    return r;
}

GRL_INLINE void unpack_from_2xSG_arr(uint count, uint arr, short lane, ushort* dst)
{
    if (lane < count)
    {
        dst[lane] = (ushort)(arr & 0xFFFF);
        short hi_idx = lane + get_sub_group_size();
        if (hi_idx < count) {
            dst[hi_idx] = (ushort)(arr >> 16);
        }
    }
}


GRL_INLINE void pack_from_2xSG_arr(ushort* src, uint count, uint *arr, short lane)
{
    if (lane < count)
    {
        *arr = src[lane];
        short hi_idx = lane + get_sub_group_size();
        if (hi_idx < count) {
            *arr |= ((uint)(src[hi_idx])) << 16u;
        }
    }
}

GRL_INLINE void set_2xSG_arr(uint index, uint* arr, short val, short lane)
{
    short lane_used = index % get_sub_group_size();
    short shift = (index / get_sub_group_size()) * get_sub_group_size();
    if (lane_used == lane) {
        uint rem_val = (*arr) & (0xFFFF0000 >> shift); // keep the remaining (other) 16-bit half of the uint
        *arr = (val << shift) | rem_val;
    }
}

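// Refits a treelet bottom-up in SLM. Starting from the given startpoints, each
// iteration reduces the children's AABBs of the current node, writes the resulting
// QBVH node and its global back pointer, and climbs to the parent. A per-node atomic
// counter in the local back pointer ensures that only the sub-group which refits the
// last child of a parent continues upward; the others switch to the next startpoint.
// Local node index 0 is the treelet root and is finalized after the loop.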
GRL_INLINE void SUBGROUP_refit_bottom_up_local(
    uniform struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    uniform uint treeletRootGlobalIndex,
    uniform uint globalBaseForInternalNodes,
    varying ushort lane,
    uniform local union UPerNodeData* local_nodes,
    varying uint sg_bu_startpoints,
    uniform uint sg_bu_startpoints_cnt)
{
    if (sg_bu_startpoints_cnt == 0)
        return;

    const uint head_lane = 0;
    uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);

    uniform uint prev_loc_index = 0;
    uniform struct AABB child_aabb; // carries the reduced AABB between loop iterations

    uniform uint backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;

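    // back pointer layout: parent index in bits 31:6, number of children in bits 5:3,
    // count of already-refitted children in bits 2:0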
    while (curNodeIndex != 0)
    {
        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[curNodeIndex].boxlessNode);
        uniform uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        varying uint child_loc_idx = lead_child_loc_offset + curNodeIndex + lane;

        uint numChildren = BackPointer_GetNumChildren(backpointer);
        // reload the child box unless this lane already holds the AABB reduced in the previous iteration
        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle( &reduced_bounds, 0 );

        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        reduced_bounds.lower.w = as_float((uint)instMask);
        uint reduce_bounds_lane = AABB_sub_group_shuffle_coordPerLane(&reduced_bounds, 0);
        local uint* pbox = (local uint*)(local_nodes + curNodeIndex);
        if (lane < 8)
        {
            pbox[lane] = reduce_bounds_lane;
        }

        uint global_node_idx = globalBaseForInternalNodes + curNodeIndex;
        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;
        subgroup_setQBVHNodeN_setFields(lead_child_loc_offset, nodeType, &child_aabb, numChildren, instMask, qnode, false);
        child_aabb = reduced_bounds;
        uint parentIndex = BackPointer_GetParentIndex(backpointer);

        write_mem_fence(CLK_LOCAL_MEM_FENCE);

        if (lane == 0)
        {
            // bump the parent's refitted-children counter; atomic_inc returns the old value
            backpointer = atomic_inc_local(&(local_nodes[parentIndex].boxlessNode.backPointer));
            uint globalParentIndex = (parentIndex > 0) ? (parentIndex + globalBaseForInternalNodes) : treeletRootGlobalIndex;
            uint globalBackpointer = (globalParentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, lead_child_loc_offset: %d, numChildren: %d, child_loc_idx: %d\n",
                global_node_idx, global_node_idx + qnode->offset, qnode->offset, globalBackpointer >> 6, lead_child_loc_offset, numChildren, child_loc_idx);
#endif
        }

        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane); // +1 accounts for this sub-group's own increment
        prev_loc_index = curNodeIndex;
        curNodeIndex = parentIndex;

        /* continue up to the parent only if all of its children are already refitted;
           otherwise switch to the next startpoint (or finish) */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;
        if (numChildrenRefitted != numChildrenTotal)
        {
            if (sg_bu_startpoints_cnt)
            {
                curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_bu_startpoints, lane);
                backpointer = local_nodes[curNodeIndex].boxlessNode.backPointer;
            }
            else
                return;
        }
    }

    // process the root of the treelet
    {

#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_local: this should be local node index 0\n");
#endif

        uniform uint lead_child_loc_offset = MortonFlattenedBoxlessNode_GetChildOffset(local_nodes[0].boxlessNode);
        varying uint child_loc_idx = lead_child_loc_offset + 0 + lane;
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        if (child_loc_idx != prev_loc_index &&
            lane < numChildren)
        {
            child_aabb = local_nodes[child_loc_idx].box;
        }
        else if (lane >= numChildren) {
            AABB_init(&child_aabb);
            child_aabb.lower.w = as_float(0u);
        }

        // TODO: perNode data could hold 7 dwords per node instead of 8 as long as we keep it in SLM
        uint instMask = (uint)sub_group_reduce_or_N6(as_uint(child_aabb.lower.w));
        uint nodeType = MortonFlattenedBoxlessNode_GetType(local_nodes[curNodeIndex].boxlessNode);
        uint global_node_idx = treeletRootGlobalIndex;
        uint lead_child_global_idx = globalBaseForInternalNodes + lead_child_loc_offset;

        /* get bounds of all children from child nodes directly */
        struct QBVHNodeN* qnode = globalNodeData + global_node_idx;

        subgroup_setQBVHNodeN_setFields(lead_child_global_idx - global_node_idx, nodeType, &child_aabb, numChildren, instMask, qnode, false);

        /* reset the refit counter for the next refit */
        if (lane == 0)
        {
            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, global_node_idx) = backpointer & (~7u);

            // TODO: Move AABBs to a separate buffer; for now, communicate bottom-tip boxes through qnodes

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, sg_bu_startpoints_cnt: %d\n",
                curNodeIndex, global_node_idx, global_node_idx + qnode->offset, qnode->offset, backpointer >> 6, numChildren, sg_bu_startpoints_cnt);
#endif
        }
    }
}