xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/bvh_build_treelet_refit.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7 
8 #pragma once
9 
10 #include "bvh_build_refit.h"
11 #include "libs/lsc_intrinsics.h"
12 
13 
14 #define REFIT_DEBUG_CHECKS 0
15 #define REFIT_VERBOSE_LOG 0
16 
17 #define NUM_STARTPOINTS_IN_SLM (1024)
18 
storeAABBToL1(struct AABB aabb,struct AABB * ptr)19 GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr)
20 {
21     uint8 val = (uint8)(
22         as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
23         as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
24 
25     store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val);
26 }
27 
storeAABBToL3(struct AABB aabb,struct AABB * ptr)28 GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr)
29 {
30     uint8 val = (uint8)(
31         as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
32         as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
33 
34     store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val);
35 }
36 
37 typedef struct Treelet_by_single_group_locals
38 {
39     uint   startpoints[NUM_STARTPOINTS_IN_SLM];
40 } Treelet_by_single_group_locals;
41 
42 typedef struct SquashedInputGroupDesc {
43     qword bvh;
44     qword scratch;
45     uint  groupInTree;
46     uint  totalNumGroups; //valid only for 0th element in array, otherwise its trash padding
47 } SquashedInputGroupDesc;
48 
49 //
50 //
51 // update primitives
52 //
53 //
54 
55 typedef struct SquashedInput {
56     global struct BVHBase* pBvh;
57     global void* pInput;
58     global struct AABB* bbox_scratch;
59 } SquashedInput;
60 
61 
62 
63 // updates one quad leaf and gets BBOX contatining it
refit_bottom_child_quad(global struct QuadLeaf * quad,global GRL_RAYTRACING_GEOMETRY_DESC * geomDesc,struct AABB * childAABB)64 GRL_INLINE void refit_bottom_child_quad(
65     global struct QuadLeaf* quad,
66     global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
67     struct AABB* childAABB)
68 {
69     struct QuadLeaf Q;
70     get_updated_quad(quad, geomDesc, &Q);
71     quadCopyVertices(&Q, quad);
72     *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
73 }
74 
75 // procedurals will have to go old path at first
76 #if 0
77 // updates one procedural leaf and gets BBOX contatining it
78 GRL_INLINE void refit_bottom_child_procedural(
79     global struct ProceduralLeaf** pleaf,
80     global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
81     struct AABB* childAABB)
82 {
83     global struct ProceduralLeaf* leaf = *pleaf;
84     /* extract geomID and primID from leaf */
85     const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx);
86     const uint geomID = ProceduralLeaf_geomIndex(leaf);
87     const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
88 
89     /* read bounds from geometry descriptor */
90     struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
91     childAABB->lower.x = aabb.MinX;
92     childAABB->lower.y = aabb.MinY;
93     childAABB->lower.z = aabb.MinZ;
94     childAABB->upper.x = aabb.MaxX;
95     childAABB->upper.y = aabb.MaxY;
96     childAABB->upper.z = aabb.MaxZ;
97 
98     /* advance leaf pointer to next child */
99     *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx);
100 }
101 
102 
103 GRL_INLINE void update_procedural_leafs(
104     global struct BVHBase* bvh,
105     global void* input,
106     global struct AABB* bbox_scratch,
107     uint id,
108     uint num_done_by_one_thread)
109 {
110     uint numLeaves = BVHBase_GetNumQuads(bvh);
111     uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64;
112     global ProceduralLeaf* leafs = (global QuadLeaf*)BVHBase_GetProceduralLeaves(bvh);
113     uint start_leaf = id * num_done_by_one_thread;
114     uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
115 
116     global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
117 
118     for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
119     {
120         struct AABB theAABB;
121         refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB);
122         theAABB.lower.w = as_float(0xABBADEFF);
123         theAABB.upper.w = 0x00;
124         storeAABBToL1(theAABB, &bbox[leafsIndexOffset + leaf_id]);
125     }
126 }
127 #endif
128 
update_quads(global struct BVHBase * bvh,global void * input,global struct AABB * bbox_scratch,uint id,uint num_done_by_one_thread)129 GRL_INLINE void update_quads(
130     global struct BVHBase* bvh,
131     global void* input,
132     global struct AABB* bbox_scratch,
133     uint id,
134     uint num_done_by_one_thread)
135 {
136     uint numLeaves = BVHBase_GetNumQuads(bvh);
137     uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
138     global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
139     uint start_leaf = id * num_done_by_one_thread;
140     uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
141 
142     global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
143 
144     for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
145     {
146         struct AABB theAABB;
147         refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
148         theAABB.lower.w = as_float(0xABBADEFF);
149         theAABB.upper.w = 0x00;
150         storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
151     }
152 }
153 
154 /////////////////////////////////////////////////////////////////////////////////////////////////////
155 //
156 // core bottom-up update functions
157 //
158 //
159 
quantise_bounds(struct AABB * input_aabb,float3 len,float3 mant,float3 org,int3 exp,uchar3 * lower_uchar,uchar3 * upper_uchar)160 GRL_INLINE void quantise_bounds(
161     struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp,
162     uchar3* lower_uchar,
163     uchar3* upper_uchar)
164 {
165     const float up = 1.0f + ulp;
166     const float down = 1.0f - ulp;
167 
168     struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ???
169 
170     float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
171     lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
172     float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
173     upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
174 
175     *lower_uchar = convert_uchar3_rtn(lower);
176     *upper_uchar = convert_uchar3_rtp(upper);
177 }
178 
179 typedef struct Qbounds_as_DW {
180     uint32_t xLL; uint32_t xLU; uint32_t xUU;
181     uint32_t yLL; uint32_t yLU; uint32_t yUU;
182     uint32_t zLL; uint32_t zLU; uint32_t zUU;
183 } Qbounds_as_DW;
184 
encodeQuantisedDataAsDW(uchar3 lower_uchar,uchar3 upper_uchar,uint idx,Qbounds_as_DW * qbounds)185 GRL_INLINE void encodeQuantisedDataAsDW(
186     uchar3 lower_uchar,
187     uchar3 upper_uchar,
188     uint idx,
189     Qbounds_as_DW* qbounds)
190 {
191     uint shift_init = idx * 8;
192     if (idx >= 4) {
193         uint shift = (shift_init - 32);
194         qbounds->xLU |= ((uint)lower_uchar.x) << shift;
195         qbounds->yLU |= ((uint)lower_uchar.y) << shift;
196         qbounds->zLU |= ((uint)lower_uchar.z) << shift;
197     }
198     else {
199         qbounds->xLL |= ((uint)lower_uchar.x) << shift_init;
200         qbounds->yLL |= ((uint)lower_uchar.y) << shift_init;
201         qbounds->zLL |= ((uint)lower_uchar.z) << shift_init;
202     }
203 
204     if (idx < 2) {
205         uint shift = (shift_init + 16);
206         qbounds->xLU |= ((uint)upper_uchar.x) << shift;
207         qbounds->yLU |= ((uint)upper_uchar.y) << shift;
208         qbounds->zLU |= ((uint)upper_uchar.z) << shift;
209     }
210     else {
211         uint shift = (shift_init - 16);
212 
213         qbounds->xUU |= ((uint)upper_uchar.x) << shift;
214         qbounds->yUU |= ((uint)upper_uchar.y) << shift;
215         qbounds->zUU |= ((uint)upper_uchar.z) << shift;
216     }
217 }
218 
encodeChildBounds(uchar3 lower_uchar,uchar3 upper_uchar,uint ch,struct InternalNode * qnode)219 GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode)
220 {
221     qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x;
222     qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y;
223     qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z;
224 }
225 
226 
InternalNode_setBounds_skip_prev(struct InternalNode * qbvh_node,uint prevChildIdx,struct AABB * prev_input_aabb,struct AABB * input_aabb,uint childrenIndex,const uint numChildren,struct AABB * aabb_reduced)227 GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced)
228 {
229 
230     int3 exp;
231     const float up = 1.0f + ulp;
232     struct AABB conservative_aabb = conservativeAABB(aabb_reduced);
233     const float3 len = AABB_size(&conservative_aabb).xyz * up;
234     const float3 mant = frexp_vec3(len, &exp);
235     const float3 org = conservative_aabb.lower.xyz;
236 
237     exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
238 
239     qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z;
240 
241     qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y;  qbvh_node->exp_z = exp.z;
242 
243     Qbounds_as_DW qbounds = { 0x0 };
244 
245 
246     {
247         uchar3 lower_uchar, upper_uchar;
248         quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar);
249 
250         //encode invalid children. its enough to set 0x80 as lower_x bytes
251         uint shift = numChildren * 8;
252         uint shift2 = min(shift, 31u);
253         qbounds.xLL = (0x80808080u << shift2);
254         uint shift3 = max(shift, 32u) - 32;
255         qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3);
256 
257         encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds);
258         //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node);
259     }
260 
261     uint ch = prevChildIdx == 0;
262     while (ch < numChildren) {
263         uchar3 lower_uchar, upper_uchar;
264         quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar);
265         encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds);
266         //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node);
267         ch += 1 + (prevChildIdx == (ch + 1));
268     }
269     Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]);
270     *qbounds_dst = qbounds;
271     return;
272 }
273 
refitReduce2Boxes(struct AABB A,struct AABB B)274 GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B)
275 {
276     AABB_extend(&A, &B);
277     // to make it work for TLAS node masks change to this:
278     // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w));
279     A.lower.w = as_float(0xABBADE00u);
280     return A;
281 }
282 
refitReduceNodePrev(uint prevIdx,uint leadChildIdx,uint numChildren,struct AABB * globalBox,struct AABB * reduceBox,uint depth,uint NodeIndex)283 GRL_INLINE void refitReduceNodePrev(
284     uint prevIdx,
285     uint leadChildIdx,
286     uint numChildren,
287     struct AABB* globalBox,
288     struct AABB* reduceBox,
289     uint depth,
290     uint NodeIndex)
291 {
292     uint8_t childIgnored = (prevIdx - leadChildIdx);
293 
294 #   if REFIT_DEBUG_CHECKS
295     bool err = false;
296     if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)
297     {
298         printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n",
299             get_local_id(0),
300             childIgnored,
301             NodeIndex,
302             prevIdx,
303             depth);
304         err = true;
305     }
306 
307     if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u)
308     {
309         printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n",
310             get_local_id(0),
311             NodeIndex,
312             depth);
313     }
314 
315     bool fail = false;
316     for (uint k = 0; (k < numChildren) && !err; ++k) {
317         if (k != childIgnored) {
318             if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
319                 printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
320                     get_local_id(0),
321                     k,
322                     prevIdx - leadChildIdx,
323                     NodeIndex,
324                     leadChildIdx + k,
325                     depth);
326                 fail = true;
327             }
328         }
329     }
330     err |= fail;
331 #   endif
332 
333     // for each child 3 bits contains load index
334     const uint32_t indicesEncoded =
335         (1 << 0) +
336         (2 << 3) +
337         (3 << 6) +
338         (4 << 9) +
339         (5 << 12) +
340         (0 << 15) +
341         (1 << 18) +
342         (2 << 21) +
343         (3 << 24) +
344         (4 << 27);
345     // 1,2,3,4,5
346 
347 
348     uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3);
349 
350     struct AABB* childAABB = globalBox + leadChildIdx;
351     struct AABB  temp = childAABB[indicesEncodedShifted & 7];
352     indicesEncodedShifted >>= 3;
353     struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7);
354     struct AABB  backlog = temp;
355 
356     for (uint child = 2; child < numChildren; child++)
357     {
358         temp = *nextChild;
359         *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
360         indicesEncodedShifted >>= 3;
361         nextChild = childAABB + (indicesEncodedShifted & 7);
362         backlog = temp;
363     }
364 
365     *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
366 
367 #if REFIT_DEBUG_CHECKS
368     for (uint k = 0; (k < numChildren) && !err; ++k) {
369         if (k != childIgnored) {
370             if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) {
371                 printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
372                     get_local_id(0),
373                     k, numChildren,
374                     prevIdx - leadChildIdx,
375                     NodeIndex,
376                     leadChildIdx + k,
377                     depth);
378 
379                 err = true;
380             }
381         }
382     }
383     if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) {
384         printf("refitReduceNode6: havent set the 0xABBADEXXu marker in result node %d at depth %d!\n",
385             NodeIndex,
386             depth);
387     }
388 #endif
389 }
390 
391 
hash_local_id()392 GRL_INLINE uint hash_local_id()
393 {
394     return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id();
395 }
396 
397 //===============================================================
398 //
399 //  Core update function
400 //
401 //===============================================================
refit_treelet_by_single_group(global struct AABB * bbox,local Treelet_by_single_group_locals * loc,uniform global BVHBase * pBvh,uniform RefitTreelet trltDsc,bool encodeQnodes,bool isTipTreelet)402 GRL_INLINE bool refit_treelet_by_single_group(
403     global  struct AABB* bbox,
404     local Treelet_by_single_group_locals* loc,
405     uniform global BVHBase* pBvh,
406     uniform RefitTreelet   trltDsc,
407     bool encodeQnodes,
408     bool isTipTreelet)
409 {
410     BackPointers* backpointers = BVHBase_GetBackPointers(pBvh);
411     InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh);
412     uint local_id = get_local_id(0);
413     StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset;
414 
415     // special case for single path treelets, TODO rewrite it as subgroups based
416     if (trltDsc.numStartpoints == 1) {
417         if (local_id == 0) {
418             RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc);
419             uint innerNodeIdx   = desc.theOnlyNodeIndex;
420             uint numChildren    = desc.numChildrenOfTheNode;
421             uint childIndex     = desc.childrenOffsetOfTheNode;
422             uint maxDepth       = desc.maxDepth;
423 
424             uint prevIdx = childIndex;
425             struct AABB myBox = bbox[childIndex];
426             struct AABB prevAABB;
427             uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0;
428             InternalNode* curNode = internalNodes + innerNodeIdx;
429             uint currDepth = 0;
430 
431             while (1)
432             {
433                 prevAABB = myBox;
434                 if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
435 
436                 if (!encodeQnodes) { myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4)); }
437 
438                 if (++currDepth > maxDepth) { break; }
439 
440                 if (encodeQnodes) {
441                     InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
442                 }
443 #if !REFIT_DEBUG_CHECKS
444                 else
445 #endif
446                 { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
447 
448                 prevIdx = innerNodeIdx;
449                 innerNodeIdx = BackPointer_GetParentIndex(backpointer);
450                 backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
451                 numChildren = BackPointer_GetNumChildren(backpointer);
452                 curNode = internalNodes + innerNodeIdx;
453                 childIndex = innerNodeIdx + curNode->childOffset;
454             }
455 
456             if (isTipTreelet) {
457                 AABB3f reduced3f = AABB3fFromAABB(myBox);
458                 pBvh->Meta.bounds = reduced3f;
459             }
460             else {
461                 storeAABBToL3(myBox, &bbox[innerNodeIdx]);
462             }
463 
464             if (encodeQnodes || isTipTreelet) {
465                 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
466             }
467 
468 #if REFIT_VERBOSE_LOG
469             printf("single node treelet: storing node idx %d \n", innerNodeIdx);
470 #endif
471         }
472 
473         return local_id == 0;
474     }
475 
476     local uint* loc_startpoints = loc->startpoints;
477 
478 
479 #if REFIT_DEBUG_CHECKS
480     if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) {
481         if(local_id == 0) printf("out of SLM space, trltDsc.depthSub_NUM_STARTPOINTS_IN_SLM > 0\n");
482         return local_id == 0;
483     }
484 #endif
485 
486     uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints;
487 
488     /*=====================================================================
489     first phase where we update startpoints nodes only
490     ----------------------------------------------------------------------*/
491     for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) {
492         uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i));
493         uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
494         uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
495         if (startpoint_i >= SLMedStartpointsOffset) {
496             uint idx = startpoint_i - SLMedStartpointsOffset;
497             loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
498         }
499 
500         uint numChildren = BackPointer_GetNumChildren(backpointer);
501         InternalNode* curNode = internalNodes + innerNodeIdx;
502         uint childIndex = innerNodeIdx + curNode->childOffset;
503 
504         uint prevIdx = childIndex;
505         struct AABB myBox = bbox[childIndex];
506         struct AABB prevAABB = myBox;
507 
508 #   if REFIT_DEBUG_CHECKS
509         if (numChildren == 0) {
510             printf("this node has no chidren!\n", 0);
511             AABB_init(&myBox);
512         }
513 #   endif
514 
515         if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
516         myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
517 
518 #if REFIT_VERBOSE_LOG
519         printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx);
520 #endif
521         storeAABBToL1(myBox, &bbox[innerNodeIdx]);
522 
523         if (encodeQnodes) {
524             InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
525         }
526     }
527 
528     uniform uint CurrPeeledDepth = 1;
529     uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints;
530     uint nextFloorStartpoint = hash_local_id();
531 
532     uint depthOnionEnd = trltDsc.depthLess64;
533     if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; }
534     if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; }
535 
536     /*=====================================================================
537     second phase, we update horizontally untill
538     we reach number of active path below grou size
539     ----------------------------------------------------------------------*/
540     while (CurrPeeledDepth < depthOnionEnd) {
541         mem_fence_workgroup_default();
542 
543         work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
544         uint start = nextFloorStartpoint;
545         nextFloorStartpoint = numStartpoints;
546 
547         for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) {
548             uint startpoint   = loc_startpoints[startpoint_i];
549             uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
550             uint backpointer  = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
551 
552             if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) {
553                 StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
554                 loc_startpoints[startpoint_i] = newSP;
555                 nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i);
556             }
557 
558             InternalNode* curNode = internalNodes + innerNodeIdx;
559             uint childIndex = innerNodeIdx + curNode->childOffset;
560             uint numChildren = BackPointer_GetNumChildren(backpointer);
561 
562             uint prevIdx = childIndex;
563             struct AABB myBox = bbox[childIndex];
564             struct AABB prevAABB = myBox;
565             refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
566 
567             myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
568 
569 #if REFIT_VERBOSE_LOG
570             printf("onion: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
571 #endif
572             storeAABBToL1(myBox, &bbox[innerNodeIdx]);
573             if (encodeQnodes) {
574                 InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
575             }
576         }
577         CurrPeeledDepth++;
578     }
579 
580     uint startpoint_idx = nextFloorStartpoint;
581     bool active = startpoint_idx < numStartpoints;
582 
583     work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
584     StartPoint startpoint = loc_startpoints[startpoint_idx];
585 
586     struct AABB myBox;
587     uint prevIdx = 0;
588     uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
589 
590     /*=====================================================================
591     last phase, each thread just continues path to its end
592 
593     only thread that computes the longest path leaves prematurely
594     (thats why while condition isn't <=) the code for finalizing root of treelet
595     is special and hendled afterwards
596 
597     TODO: with proper assigning of paths to lanes we should reach only three
598     active lanes per physical thread quite soon for this subgroups could be used
599     ----------------------------------------------------------------------*/
600     bool prevActive = active;
601     while (CurrPeeledDepth < trltDsc.maxDepth) {
602         uint backpointer;
603         uint childIndex;
604         InternalNode* curNode = internalNodes + innerNodeIdx;
605         if (active) {
606             childIndex = innerNodeIdx + curNode->childOffset;
607             backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
608         } else if(prevActive){
609             mem_fence_workgroup_default();
610         }
611 
612         prevActive = active;
613 
614         work_group_barrier(0, memory_scope_work_group);
615         //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx);
616         if (active) {
617 
618 #if REFIT_DEBUG_CHECKS
619             if (CurrPeeledDepth > StartPoint_GetDepth(startpoint))
620             {
621                 printf("uppath: startpoint %d <n=%d , d=%d> at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth);
622             }
623 #endif
624             if (prevIdx == 0) {
625                 myBox = bbox[childIndex];
626                 prevIdx = childIndex;
627             }
628             uint numChildren = BackPointer_GetNumChildren(backpointer);
629 
630             struct AABB prevAABB = myBox;
631             refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
632             myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
633 #if REFIT_VERBOSE_LOG
634             printf("uppath: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
635 #endif
636             active = CurrPeeledDepth < StartPoint_GetDepth(startpoint);
637 
638             if (encodeQnodes) {
639 #if !REFIT_DEBUG_CHECKS
640                 if (!active)
641 #endif
642                 { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
643                 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
644             } else {
645                 storeAABBToL1(myBox, &bbox[innerNodeIdx]);
646             }
647 
648             prevIdx = innerNodeIdx;
649             innerNodeIdx = BackPointer_GetParentIndex(backpointer);
650         }
651 
652         CurrPeeledDepth++;
653     }
654 
655     {
656         uint backpointer;
657         uint childIndex;
658         InternalNode* curNode = internalNodes + innerNodeIdx;
659         if (active) {
660             childIndex = innerNodeIdx + curNode->childOffset;
661             backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
662         } else if(prevActive) {
663             mem_fence_workgroup_default();
664         }
665 
666         work_group_barrier(0, memory_scope_work_group);
667 
668         /*=====================================================================
669         final step, is special processing of root,
670         its different, since its box is transfered cross group (written to L3)
671         or is root of whole tree and hence fill global box in bvh MD
672         TODO: this should be done in SG as only one thread is active
673         ----------------------------------------------------------------------*/
674         if (active) {
675             if (prevIdx == 0) {
676                 myBox = bbox[childIndex];
677                 prevIdx = childIndex;
678             }
679             uint numChildren = BackPointer_GetNumChildren(backpointer);
680             struct AABB prevAABB = myBox;
681             refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
682             myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
683 
684 #if REFIT_VERBOSE_LOG
685             printf("root: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/);
686 #endif
687             if (isTipTreelet) {
688                 AABB3f reduced3f = AABB3fFromAABB(myBox);
689                 pBvh->Meta.bounds = reduced3f;
690                 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
691             } else {
692                 storeAABBToL3(myBox, &bbox[innerNodeIdx]);
693                 if (encodeQnodes) {
694                     InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
695                 }
696             }
697         }
698     }
699 
700     return active;
701 }
702 
703 
704 //////////////////////////////////////////////////////////////////////////////////////
705 //
706 // Internal nodes enocding as a separate dispatch
707 //
708 //
709 
710 // encode qnodes as a separate pass
post_refit_encode_qnode_tree_per_group(global struct AABB * bbox_scratch,global struct BVHBase * bvh)711 GRL_INLINE void post_refit_encode_qnode_tree_per_group(
712     global struct AABB* bbox_scratch,
713     global struct BVHBase* bvh)
714 {
715     uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh);
716     InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh);
717 
718     for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0))
719     {
720         struct AABB reduced = bbox_scratch[nodeIdx];
721 #   if REFIT_DEBUG_CHECKS
722         if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
723             printf("qnode enc group: NodeIndex %d not updated! \n", nodeIdx);
724             return;
725         }
726         for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) {
727             uint childIdx = (as_uint(reduced.upper.w) >> 4) + k;
728             if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
729                 printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx);
730                 return;
731             }
732         }
733 #   endif
734         struct InternalNode* qbvh_node = internalNodes + nodeIdx;
735         uint childIndex = as_uint(reduced.upper.w) >> 4;
736         uint numChildren = as_uint(reduced.upper.w) & 7;
737         struct AABB* children = bbox_scratch + childIndex;
738         //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced);
739         InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced);
740     }
741 }
742 
743 //////////////////////////////////////////////////////////////////////////////////////
744 //
745 // Construction of treelets and paths
746 //
747 //
748 
749 // this is tiny bit tricky, when bottom-up thread haven't yet closed treelet this is number of startpoints that are under the node
750 // when thread closed treelets it the data is starts to be treelet ID
751 typedef uint TreeletNodeData;
752 
753 typedef struct TreeletsOpenNodeInfo {
754     // bool isTreeletRoot; // : 1
755     short   maxDepth;      // : 14
756     uint    numStartpoints;// : 16
757 } TreeletsOpenNodeInfo;
758 
759 typedef struct TreeletsClosedNodeInfo {
760     // bool isTreeletRoot; // : 1
761     uint    treeletId;     // : 31 (when treelet is closed)
762 } TreeletsClosedNodeInfo;
763 
ClearTreeletRoot(TreeletNodeData D)764 GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D)
765 {
766     return D & ((1u << 31u) - 1u);
767 }
768 
isTreeletRoot(TreeletNodeData E)769 GRL_INLINE uint isTreeletRoot(TreeletNodeData E)
770 {
771     return E >> 31;
772 }
773 
getNumStartpoints(TreeletNodeData E)774 GRL_INLINE uint getNumStartpoints(TreeletNodeData E)
775 {
776     return E & ((1 << 16) - 1);
777 }
778 
getMaxDepth(TreeletNodeData E)779 GRL_INLINE uint getMaxDepth(TreeletNodeData E)
780 {
781     return (E >> 16) & ((1 << 14) - 1);
782 }
783 
784 // single startpoint treelet
isTrivialTreeletRoot(TreeletNodeData E)785 GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E)
786 {
787     return (E >> 31) && (getMaxDepth(E) == 0);
788 }
789 
SetTipStartpoint(TreeletNodeData D)790 GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D)
791 {
792     return ClearTreeletRoot(D) | (1 << 30);
793 }
794 
SetTreeletRoot(TreeletNodeData D)795 GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D)
796 {
797     return D | (1 << 31);
798 }
799 
DecodeOpenInfo(TreeletNodeData E)800 GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E)
801 {
802     TreeletsOpenNodeInfo I;
803     I.maxDepth = getMaxDepth(E);
804     I.numStartpoints = getNumStartpoints(E);
805     return I;
806 }
807 
EncodeOpenInfo(TreeletsOpenNodeInfo I,bool isRoot)808 GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot)
809 {
810     TreeletNodeData D = isRoot ? (1 << 31) : 0;
811     D |= (I.maxDepth & ((1 << 14) - 1)) << 16;
812     D |= I.numStartpoints & ((1 << 16) - 1);
813     return D;
814 }
815 
DecodeClosedInfo(TreeletNodeData E)816 GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E)
817 {
818     TreeletsClosedNodeInfo I;
819     I.treeletId = E & ((1u << 31u) - 1u);
820     return I;
821 }
822 
EncodeClosedInfo(TreeletsClosedNodeInfo I)823 GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I)
824 {
825     TreeletNodeData D = (1u << 31u); // closed is always a root!
826     D |= I.treeletId & ((1u << 31u) - 1u);
827     return D;
828 }
829 
EncodeClosedInfo(uint treeletId)830 GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId)
831 {
832     TreeletNodeData D = (1 << 31); // closed is always a root!
833     D |= treeletId & ((1u << 31u) - 1u);
834     return D;
835 }
836 
chk_close_Treelet(RefitTreelet * TreeletDescsArr,TreeletNodeData * nodeTreeletDataArr,uint * StartPointBuffer,uint * currStartpoint,TreeletNodeData nodeData,TreeletsOpenNodeInfo * nodeOpenInfo,uint nodeIdx,uint * treeletDescIdx)837 GRL_INLINE void chk_close_Treelet(
838     RefitTreelet* TreeletDescsArr,
839     TreeletNodeData* nodeTreeletDataArr,
840     uint* StartPointBuffer,
841     uint* currStartpoint,
842     TreeletNodeData nodeData,
843     TreeletsOpenNodeInfo* nodeOpenInfo,
844     uint nodeIdx,
845     uint* treeletDescIdx)
846 {
847     if (isTreeletRoot(nodeData))
848     {
849         TreeletNodeData encoded = 0;
850         if (nodeOpenInfo->numStartpoints == 1)
851         {
852             encoded = ClearTreeletRoot(SetTipStartpoint(nodeData));
853         }
854         else
855         {
856             RefitTreelet RTdesc;
857             RTdesc.startpoint_offset = *currStartpoint;
858             *currStartpoint += nodeOpenInfo->numStartpoints;
859             RTdesc.numStartpoints = nodeOpenInfo->numStartpoints;
860             RTdesc.maxDepth = nodeOpenInfo->maxDepth;
861             TreeletDescsArr[*treeletDescIdx] = RTdesc;
862             encoded = EncodeClosedInfo(*treeletDescIdx);
863             *treeletDescIdx = *treeletDescIdx + 1;
864             TreeletsOpenNodeInfo infoDefault = { 0, 0 };
865             *nodeOpenInfo = infoDefault;
866         }
867 
868         nodeTreeletDataArr[nodeIdx] = encoded;
869     }
870     // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset);
871 }
872 
873 
874 // TreeletNodeData* treelets holds per node property, after running this some of them are marked as treelet root
treelet_bottom_up_mark_treelets(global struct BVHBase * bvh,global InternalNode * internalNodes,global StartPoint * scratch_startpoints,uint curNodeIndex,BackPointers * backPointers,global TreeletNodeData * treelets,uint refitTreeletsDataStart,uint * startpointAlloc)875 GRL_INLINE void treelet_bottom_up_mark_treelets(
876     global struct BVHBase* bvh,
877     global InternalNode* internalNodes,
878     global StartPoint* scratch_startpoints,
879     uint curNodeIndex,
880     BackPointers* backPointers,
881     global TreeletNodeData* treelets,
882     uint refitTreeletsDataStart,
883     uint* startpointAlloc)
884 {
885     TreeletsOpenNodeInfo currInfo;
886     currInfo.maxDepth = 0;
887     currInfo.numStartpoints = 1;
888 
889     global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64));
890 
891     treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true);
892 
893     /* the start node got already processed, thus go to its parent node */
894     uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
895     curNodeIndex = parentPointer >> 6;
896 
897     bool isInTip = false;
898     while (curNodeIndex != 0x03FFFFFF)
899     {
900         uint numChildrenTotal = 0;
901         // numChildrenTotal and parentPointer gets updated...
902         // atomic trickery, on backpointers, only the last one thread enters up
903         {
904             /* increment refit counter that counts refitted children of current node */
905             global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex);
906             mem_fence_gpu_invalidate();
907             parentPointer = 1 + atomic_inc_global(pCurrentBackpointer);
908 
909             /* if all children got refitted, then continue */
910             const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
911             numChildrenTotal = (parentPointer >> 3) & 0x7;
912 
913             if (numChildrenRefitted != numChildrenTotal)
914                 return;
915 
916             /* reset refit counter for next refit */
917             *pCurrentBackpointer = (parentPointer & 0xfffffff8);
918         }
919 
920         /* get children treelets */
921         global struct InternalNode* node = internalNodes + curNodeIndex;
922         uint childrenIndices = curNodeIndex + node->childOffset;
923         global TreeletNodeData* childrenTreelets = treelets + childrenIndices;
924 
925         // yeah, it is possible we are pulling trash here, but we wont use it.
926         // this is for the sake of one non control flow spoiled data pull
927         TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1];
928         TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3];
929         TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5];
930 
931         // zero out the potential trash
932         if (numChildrenTotal < 3) dataCh2 = 0;
933         if (numChildrenTotal < 4) dataCh3 = 0;
934         if (numChildrenTotal < 5) dataCh4 = 0;
935         if (numChildrenTotal < 6) dataCh5 = 0;
936 
937         TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0);
938         TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1);
939         TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2);
940         TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3);
941         TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4);
942         TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5);
943 
944         uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5);
945         // see if we should merge the trees, if not then we should move to tip.
946         currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints;
947 
948         bool isTipStartpoint = false;
949         if (!isInTip)
950         {
951             // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes
952             bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS));
953             bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots;
954             if (mergeTreelets && allChildrenRootsCurrently)
955             {
956                 childrenTreelets[0] = ClearTreeletRoot(dataCh0);
957                 childrenTreelets[1] = ClearTreeletRoot(dataCh1); // -1 will be recognised then as this is not a treelet root.
958                 if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2);
959                 if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3);
960                 if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4);
961                 if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5);
962             }
963             else
964             {
965                 isInTip = true;
966                 isTipStartpoint = allChildrenRootsCurrently;
967             }
968         }
969 
970         // close any roots underneath
971         if (isInTip && numChildrenBeingRoots)
972         {
973             uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) +
974                                 isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5);
975 
976             uint treeletId = 0;
977             uint bottomStartpointSpace = 0;
978 
979             uint startpointsFromTiptree = trivialRoots;
980 
981             if (trivialRoots) isTipStartpoint = false;
982 
983             if (numChildrenBeingRoots > trivialRoots)
984             {
985                 startpointsFromTiptree += // startpoint ONLY from tiptree
986                     (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints +
987                     (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints +
988                     (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints +
989                     (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints +
990                     (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints +
991                     (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints;
992 
993                 treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots);
994                 bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree);
995             }
996 
997             currInfo.numStartpoints = startpointsFromTiptree;
998 
999             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId);
1000             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId);
1001             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId);
1002             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId);
1003             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId);
1004             chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId);
1005         }
1006 
1007         if (isTipStartpoint)
1008         {
1009             currInfo.maxDepth = 0;
1010             currInfo.numStartpoints = 1;
1011         }
1012         else
1013         {
1014             // reduce max depth and number of startpoint underneath
1015             currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth),
1016                 max(infoCh2.maxDepth, infoCh3.maxDepth)),
1017                 max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1;
1018         }
1019 
1020         treelets[curNodeIndex] = EncodeOpenInfo(
1021             currInfo,
1022             !isInTip /*mark marged treelet as an new root iff we are in bottom we */);
1023 
1024         /* make parent node the current node */
1025         curNodeIndex = parentPointer >> 6;
1026     }
1027 
1028     uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh);
1029 
1030     uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints);
1031 
1032     treelets[0] = EncodeClosedInfo(treeletId);
1033     RefitTreelet tipTreeletDesc;
1034     tipTreeletDesc.startpoint_offset = bottomStartpointSpace;
1035     tipTreeletDesc.numStartpoints = currInfo.numStartpoints;
1036     tipTreeletDesc.maxDepth = currInfo.maxDepth;
1037 
1038     treeletDescs[treeletId] = tipTreeletDesc;
1039 
1040     uint realNumberOfTreelets = treeletId + 1;
1041     // intentionally we set less by 1, because this number is used in num groups for dispatch which is number of bottom treelets
1042     // so substract 1. Except single treelet tree which is should stay 1.
1043     uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId;
1044 
1045     *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets;
1046 
1047     uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6;
1048     uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6;
1049     bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B;
1050     bvh->BVHDataEnd = refitTreeletsDataStart +treeletDescSpaceIn64B + startpointSpaceIn64B;
1051     *startpointAlloc = 0;
1052 }
1053 
1054 
find_refit_treelets(global struct BVHBase * bvh,global TreeletNodeData * treelets,global uint * scratchStartpoints,global uint * startpointAlloc)1055 GRL_INLINE void find_refit_treelets(
1056     global struct BVHBase* bvh,
1057     global TreeletNodeData* treelets,
1058     global uint* scratchStartpoints,
1059     global uint* startpointAlloc)
1060 {
1061     /* get pointer to inner nodes and back pointers */
1062     uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
1063 
1064     /* construct range of nodes that each work group will process */
1065     uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
1066 
1067     varying ushort lane = get_sub_group_local_id();
1068     varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0);
1069 
1070     uint numBackpointers = BVHBase_GetNumInternalNodes(bvh);
1071 
1072     // align to 64B and divide
1073     uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6;
1074 
1075     uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B;
1076     if (global_id == 0)
1077     {
1078         bvh->refitTreeletsDataStart = refitTreeletsDataStart;
1079     }
1080 
1081     global struct InternalNode* curNode = &inner_nodes[global_id];
1082 
1083     varying ushort has_startpoint = 0;
1084     if (global_id < numInnerNodes) {
1085         if ((curNode->nodeType != BVH_INTERNAL_NODE))
1086         {
1087             has_startpoint = 1;
1088         }
1089     }
1090 
1091     if (has_startpoint == 0)
1092         return;
1093 
1094     treelet_bottom_up_mark_treelets(
1095         bvh,
1096         inner_nodes,
1097         scratchStartpoints,
1098         global_id,
1099         BVHBase_GetBackPointers(bvh),
1100         treelets,
1101         refitTreeletsDataStart,
1102         startpointAlloc);
1103 }
1104 
assign_refit_startpoints_to_treelets(global struct BVHBase * bvh,global TreeletNodeData * treelets,global uint * scratchStartpoints)1105 GRL_INLINE void assign_refit_startpoints_to_treelets(
1106     global struct BVHBase*  bvh,
1107     global TreeletNodeData* treelets,
1108     global uint*            scratchStartpoints)
1109 {
1110     /* get pointer to inner nodes and back pointers */
1111     uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh);
1112 
1113     /* construct range of nodes that each work group will process */
1114     uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
1115 
1116     varying ushort lane = get_sub_group_local_id();
1117     varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0);
1118     varying uint curNodeIndex = starPointNode;
1119     global struct InternalNode* curNode = &inner_nodes[curNodeIndex];
1120 
1121     varying ushort is_startpoint = 0;
1122 
1123     if (curNodeIndex < numInnerNodes)
1124     {
1125         if ((curNode->nodeType != BVH_INTERNAL_NODE))
1126         {
1127             is_startpoint = 1;
1128         }
1129     }
1130 
1131     if (is_startpoint == 0)
1132     {
1133         return;
1134     }
1135 
1136     BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
1137 
1138     RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
1139     uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh);
1140     if (numTreelets > 1) numTreelets++;
1141 
1142     uint myDepthWhenDead = 0;
1143     uint startpointsBeforeMe = 0;
1144     bool dead = false;
1145 
1146     uint prevNodeIndex = 0x03FFFFFF;
1147 
1148     while (curNodeIndex != 0x03FFFFFF)
1149     {
1150         TreeletNodeData nodeData = treelets[curNodeIndex];
1151 
1152         uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
1153         uint numChildren = BackPointer_GetNumChildren(parentPointer);
1154 
1155         // this is counterpart of atomic based entrance decision.
1156         // the alive path is the longest, if two are equal take the one that came through child with smaller index.
1157         if (prevNodeIndex != 0x03FFFFFF)
1158         {
1159             uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset;
1160             uint childEnd = numChildren + leadChildOfCur;
1161 
1162             uint longestPath = 0;
1163             uint longestPathChildIdx = leadChildOfCur;
1164 
1165             for (uint child = leadChildOfCur; child < childEnd; child++)
1166             {
1167                 TreeletNodeData childData = treelets[child];
1168                 if (!isTreeletRoot(childData))
1169                 {
1170                     TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData);
1171                     if (longestPath <= childinfo.maxDepth) {
1172                         longestPathChildIdx = child;
1173                         longestPath = childinfo.maxDepth + 1;
1174                     }
1175 
1176                     if (child < prevNodeIndex)
1177                     {
1178                         // also count how many startpoints are there before me (used to place startpoint in proper slot)
1179                         startpointsBeforeMe += childinfo.numStartpoints;
1180                     }
1181                 }
1182             }
1183 
1184             if (!dead && prevNodeIndex != longestPathChildIdx)
1185             {
1186                 dead = true;
1187                 //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1188             }
1189 
1190             if (!dead) // this "if" is not an "else" to abouve as we might be dead before and comming through the same child index
1191             {
1192                 myDepthWhenDead = longestPath;
1193                 // it is a startpoint
1194                 //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1195             }
1196 
1197             if (starPointNode == (uint)-1) {
1198                 // we just entered upper treelet as treelet if we are alive, we can be a new startpoint in new treelet
1199                 if (dead)
1200                 {
1201                     //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1202                     // and we are dead, so we are not a startpoint of tip,
1203                     // so we must disappear to not be added as a startpoint.
1204                     return;
1205                 }
1206                 else
1207                 {
1208                     // it is a startpoint
1209                     //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex);
1210                     starPointNode = curNodeIndex;
1211                 }
1212             }
1213         }
1214 
1215         if (isTreeletRoot(nodeData))
1216         {
1217             TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData);
1218             RefitTreelet treeletDesc = treeletDescs[info.treeletId];
1219             uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe;
1220             scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1));
1221 
1222             //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead);
1223 
1224             if (dead) return;
1225             myDepthWhenDead = 0;
1226             startpointsBeforeMe = 0;
1227             starPointNode = (uint)-1;
1228         }
1229 
1230         /* make parent node the current node */
1231         prevNodeIndex = curNodeIndex;
1232         curNodeIndex = BackPointer_GetParentIndex(parentPointer);
1233         //if(!dead)
1234         //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex);
1235     }
1236 }
1237 
1238 const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32;
1239 
finalize_treelets_in_groups(global struct BVHBase * bvh,global uint * scratchStartpoints,local uint * depths)1240 GRL_INLINE void finalize_treelets_in_groups(
1241     global struct BVHBase* bvh,
1242     global uint* scratchStartpoints,
1243     local uint* depths)
1244 {
1245     uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh);
1246 
1247     uint local_id = get_local_id(0);
1248 
1249     uint numTreelets = (numTreeletsExecuted > 1) ? numTreeletsExecuted + 1 : numTreeletsExecuted;
1250 
1251     RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
1252 
1253     for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted)
1254     {
1255         if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); }
1256 
1257         RefitTreelet treeletDesc = treeletDescs[treeletId];
1258         StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset;
1259         if (treeletDesc.numStartpoints <= 1)
1260         {
1261             // for smaller latency we store 1 element treelets as RefitTreeletTrivial,
1262             // this happens most of the time for tip treelet
1263             if (local_id == 0)
1264             {
1265                 RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 };
1266                 if (treeletDesc.numStartpoints == 1)
1267                 {
1268                     StartPoint sp               = srcStartpoints[0];
1269 
1270                     tr.theOnlyNodeIndex         = StartPoint_GetNodeIdx(sp);
1271                     uint backpointer            = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex);
1272                     tr.numChildrenOfTheNode     = BackPointer_GetNumChildren(backpointer);
1273                     tr.childrenOffsetOfTheNode  = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex;
1274                 }
1275                 RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId);
1276                 *trivial = tr;
1277 #if REFIT_VERBOSE_LOG
1278                 printf("treelet trivial %d {\n  theOnlyNodeIndex = %d;\n  numStartpoints = %d;\n  childrenOffsetOfTheNode = %d;\n  maxDepth =%d;\n  numChildrenOfTheNode = %d;\n}\n",
1279                     treeletId,
1280                     tr.theOnlyNodeIndex,
1281                     tr.numStartpoints,
1282                     tr.childrenOffsetOfTheNode,
1283                     tr.maxDepth,
1284                     tr.numChildrenOfTheNode);
1285 #endif
1286             }
1287         }
1288         else
1289         {
1290 #define SKIP_PATHS_SORTING 0
1291 #if SKIP_PATHS_SORTING
1292             StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
1293             for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
1294             {
1295                 dstStartpoints[startpointID] = srcStartpoints[startpointID];
1296             }
1297 #else
1298             //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); }
1299 
1300             if (local_id <= treeletDesc.maxDepth) {
1301                 depths[local_id] = 0;
1302                 //    printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id);
1303             }
1304             work_group_barrier(CLK_LOCAL_MEM_FENCE);
1305 
1306             uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size();
1307 
1308             // collect histogram of how many paths of given length we have
1309 
1310             // keep count of depth 0
1311             uint val = 0;
1312 
1313             // optimize: we will load Startpoint only once to
1314             uint S_c[8];
1315             // optimize: keep accumulated numbers in registers to limit number of atomic ops
1316             uint D_c[8] = { 0 };
1317 
1318             uint cached_threshold = 8 * get_local_size(0);
1319             cached_threshold = min(cached_threshold, treeletDesc.numStartpoints);
1320 
1321             uint loop_turn = 0;
1322             uint sgid = get_sub_group_local_id();
1323 
1324             for (uint startpointID = local_id+ cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
1325             {
1326                 uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]);
1327                 atomic_inc((volatile local uint*) (depths + dstSlot));
1328             }
1329 
1330             uint HistogramSG = 0;
1331             if (treeletDesc.maxDepth < 8)
1332             {
1333                 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1334                 {
1335                     StartPoint S = srcStartpoints[startpointID];
1336                     S_c[loop_turn++] = S;
1337                     uint dstSlot = StartPoint_GetDepth(S);
1338                     D_c[dstSlot]++;
1339                 }
1340 
1341                 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1342                 {
1343                     val = sub_group_reduce_add(D_c[d]);
1344                     if (sgid == d)
1345                     {
1346                         HistogramSG = val;
1347                     }
1348                 }
1349                 if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
1350                 {
1351                     atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
1352                 }
1353             }
1354             else
1355             {
1356                 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1357                 {
1358                     StartPoint S = srcStartpoints[startpointID];
1359                     S_c[loop_turn++] = S;
1360                     uint dstSlot = StartPoint_GetDepth(S);
1361                     atomic_inc((volatile local uint*) (depths + dstSlot));
1362                 }
1363             }
1364 
1365             work_group_barrier(CLK_LOCAL_MEM_FENCE);
1366 
1367 #if REFIT_VERBOSE_LOG
1368             if (local_id == 0)
1369             {
1370                 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1371                 {
1372                     printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]);
1373                 }
1374             }
1375 #endif
1376 
1377             if (treeletDesc.maxDepth < get_sub_group_size())
1378             {
1379                 if (get_sub_group_id() == 0)
1380                 {
1381 
1382                     uint cntOfDepth = 0;
1383                     if (sgid <= treeletDesc.maxDepth) {
1384                         cntOfDepth = depths[sgid];
1385                     }
1386                     uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth);
1387                     depths[sgid] = pref_sum;
1388 
1389                     uint numLeft = treeletDesc.numStartpoints - (pref_sum);
1390                     uint depthLess64  = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1391                     uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1392                     uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1393 
1394                     // filling data for thread 0 who will save this to mem
1395                     treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64);
1396                     treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128);
1397                     treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256);
1398                     treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth;
1399 
1400                     if (sgid == 0) {
1401                         treeletDescs[treeletId] = treeletDesc;
1402 #if REFIT_VERBOSE_LOG
1403                         printf("treelet %d {\n  startpoint_offset = %d;\n  numStartpoints = %d;\n  numNonTrivialStartpoints = %d;  \n  maxDepth = %d;\n  depthLess64 = %d;\n  depthLess128 = %d;\n  depthLess256 = %d;\n}\n",
1404                             treeletId,
1405                             treeletDesc.startpoint_offset,
1406                             treeletDesc.numStartpoints,
1407                             treeletDesc.numNonTrivialStartpoints,
1408                             treeletDesc.maxDepth,
1409                             treeletDesc.depthLess64,
1410                             treeletDesc.depthLess128,
1411                             treeletDesc.depthLess256);
1412 #endif
1413                     }
1414                 }
1415             }
1416             else if (local_id <= treeletDesc.maxDepth) {
1417                 uint thisdepthcount = depths[local_id];
1418                 treeletDesc.depthLess64 = 0;
1419                 treeletDesc.depthLess128 = 0;
1420                 treeletDesc.depthLess256 = 0;
1421                 uint numLeft = treeletDesc.numStartpoints;
1422                 uint pref_sum = 0;
1423 
1424                 for (uint d = 0; d < local_id; d++)
1425                 {
1426                     uint depthCnt = depths[d];
1427                     if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; }
1428                     if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; }
1429                     if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; }
1430                     pref_sum += depthCnt;
1431                     numLeft -= depthCnt;
1432                     if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; }
1433                 }
1434 
1435                 if (local_id == treeletDesc.maxDepth)
1436                 {
1437                     treeletDescs[treeletId] = treeletDesc;
1438 #if REFIT_VERBOSE_LOG
1439                     printf("treelet %d {\n  startpoint_offset = %d;\n  numStartpoints = %d;\n  numNonTrivialStartpoints = %d;  maxDepth = %d;\n  depthLess64 = %d;  depthLess128 = %d;  depthLess256 = %d;\n}\n",
1440                         treeletId,
1441                         treeletDesc.startpoint_offset,
1442                         treeletDesc.numStartpoints,
1443                         treeletDesc.numNonTrivialStartpoints,
1444                         treeletDesc.maxDepth,
1445                         treeletDesc.depthLess64,
1446                         treeletDesc.depthLess128,
1447                         treeletDesc.depthLess256);
1448 #endif
1449                 }
1450             }
1451 
1452             StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
1453 
1454             work_group_barrier(CLK_LOCAL_MEM_FENCE);
1455 
1456             loop_turn = 0;
1457             if (treeletDesc.maxDepth < 8)
1458             {
1459                 uint prefixSG = 0;
1460 
1461                 // make prefixSG keep interval for paths with sglid depth that is separated out for sg.
1462                 if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
1463                 {
1464                     prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
1465                 }
1466 
1467                 // from now on all sgs run independently
1468 
1469                 // make D_c keep offset interval that is separated out for given lane
1470                 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1471                 {
1472                     uint thisDPrefixSg = sub_group_broadcast(prefixSG, d);
1473                     uint thisLaneCount = D_c[d];
1474                     uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount);
1475                     D_c[d] = laneOffset + thisDPrefixSg;
1476                 }
1477 
1478                 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1479                 {
1480                     StartPoint S = S_c[loop_turn++];
1481                     uint d = StartPoint_GetDepth(S);
1482                     uint dstSlot = D_c[d]++;
1483                     dstStartpoints[dstSlot] = S;
1484                 }
1485             }
1486             else
1487             {
1488                 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1489                 {
1490                     StartPoint S = S_c[loop_turn++];
1491                     uint d = StartPoint_GetDepth(S);
1492                     uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
1493                     dstStartpoints[dstSlot] = S;
1494                 }
1495             }
1496 
1497             for (uint srcStartpointID = local_id+ cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0))
1498             {
1499                 StartPoint S = srcStartpoints[srcStartpointID];
1500                 uint d = StartPoint_GetDepth(srcStartpoints[srcStartpointID]);
1501                 uint dstSlot = atomic_inc((volatile local uint*) (depths+ d));
1502                 dstStartpoints[dstSlot] = S;
1503             }
1504 #endif //skip sorting
1505         }
1506     }
1507 }
1508