1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7
8 #pragma once
9
10 #include "bvh_build_refit.h"
11 #include "libs/lsc_intrinsics.h"
12
13
14 #define REFIT_DEBUG_CHECKS 0
15 #define REFIT_VERBOSE_LOG 0
16
17 #define NUM_STARTPOINTS_IN_SLM (1024)
18
GRL_INLINE void storeAABBToL1(struct AABB aabb, struct AABB* ptr)
20 {
21 uint8 val = (uint8)(
22 as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
23 as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
24
25 store_uint8_L1WB_L3WB((__global uint8*) ptr, 0, val);
26 }
27
GRL_INLINE void storeAABBToL3(struct AABB aabb, struct AABB* ptr)
29 {
30 uint8 val = (uint8)(
31 as_uint(aabb.lower.x), as_uint(aabb.lower.y), as_uint(aabb.lower.z), as_uint(aabb.lower.w),
32 as_uint(aabb.upper.x), as_uint(aabb.upper.y), as_uint(aabb.upper.z), as_uint(aabb.upper.w));
33
34 store_uint8_L1UC_L3WB((__global uint8*) ptr, 0, val);
35 }
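// Note on the two store helpers above (intent inferred from their uses below): storeAABBToL1
// uses an L1 write-back policy for boxes that the same pass will re-read soon, while
// storeAABBToL3 bypasses L1 (L1 uncached, L3 write-back) so the box becomes visible to other
// work-groups, e.g. treelet-root boxes handed over to the tip treelet pass.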
36
37 typedef struct Treelet_by_single_group_locals
38 {
39 uint startpoints[NUM_STARTPOINTS_IN_SLM];
40 } Treelet_by_single_group_locals;
41
42 typedef struct SquashedInputGroupDesc {
43 qword bvh;
44 qword scratch;
45 uint groupInTree;
    uint totalNumGroups; // valid only for the 0th element in the array, otherwise it's trash padding
47 } SquashedInputGroupDesc;
48
49 //
50 //
51 // update primitives
52 //
53 //
54
55 typedef struct SquashedInput {
56 global struct BVHBase* pBvh;
57 global void* pInput;
58 global struct AABB* bbox_scratch;
59 } SquashedInput;
60
61
62
// updates one quad leaf and gets the AABB containing it
GRL_INLINE void refit_bottom_child_quad(
65 global struct QuadLeaf* quad,
66 global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
67 struct AABB* childAABB)
68 {
69 struct QuadLeaf Q;
70 get_updated_quad(quad, geomDesc, &Q);
71 quadCopyVertices(&Q, quad);
72 *childAABB = getAABB_Quad((struct Quad*) & Q); // FIXME: support leaves with more than one quad
73 }
74
// procedurals will have to go through the old path at first
76 #if 0
// updates one procedural leaf and gets the AABB containing it
78 GRL_INLINE void refit_bottom_child_procedural(
79 global struct ProceduralLeaf** pleaf,
80 global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
81 struct AABB* childAABB)
82 {
83 global struct ProceduralLeaf* leaf = *pleaf;
84 /* extract geomID and primID from leaf */
85 const uint startPrim = QBVHNodeN_startPrim(curNode, child_idx);
86 const uint geomID = ProceduralLeaf_geomIndex(leaf);
87 const uint primID = ProceduralLeaf_primIndex(leaf, startPrim); // FIXME: have to iterate over all primitives of leaf!
88
89 /* read bounds from geometry descriptor */
90 struct GRL_RAYTRACING_AABB aabb = GRL_load_aabb(&geomDesc[geomID], primID);
91 childAABB->lower.x = aabb.MinX;
92 childAABB->lower.y = aabb.MinY;
93 childAABB->lower.z = aabb.MinZ;
94 childAABB->upper.x = aabb.MaxX;
95 childAABB->upper.y = aabb.MaxY;
96 childAABB->upper.z = aabb.MaxZ;
97
98 /* advance leaf pointer to next child */
99 *pleaf = leaf + QBVHNodeN_blockIncr(curNode, child_idx);
100 }
101
102
103 GRL_INLINE void update_procedural_leafs(
104 global struct BVHBase* bvh,
105 global void* input,
106 global struct AABB* bbox_scratch,
107 uint id,
108 uint num_done_by_one_thread)
109 {
110 uint numLeaves = BVHBase_GetNumQuads(bvh);
111 uint leafsIndexOffset = bvh->proceduralDataStart - BVH_ROOT_NODE_OFFSET / 64;
    global ProceduralLeaf* leafs = (global ProceduralLeaf*)BVHBase_GetProceduralLeaves(bvh);
113 uint start_leaf = id * num_done_by_one_thread;
114 uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
115
116 global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
117
118 for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
119 {
120 struct AABB theAABB;
121 refit_bottom_child_procedural(leafs + leaf_id, geosArray, &theAABB);
122 theAABB.lower.w = as_float(0xABBADEFF);
123 theAABB.upper.w = 0x00;
        storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
125 }
126 }
127 #endif
128
GRL_INLINE void update_quads(
130 global struct BVHBase* bvh,
131 global void* input,
132 global struct AABB* bbox_scratch,
133 uint id,
134 uint num_done_by_one_thread)
135 {
136 uint numLeaves = BVHBase_GetNumQuads(bvh);
137 uint leafsIndexOffset = bvh->quadLeafStart - BVH_ROOT_NODE_OFFSET / 64;
138 global QuadLeaf* leafs = (global QuadLeaf*)BVHBase_GetQuadLeaves(bvh);
139 uint start_leaf = id * num_done_by_one_thread;
140 uint end_leaf = min(start_leaf + num_done_by_one_thread, numLeaves);
141
142 global GRL_RAYTRACING_GEOMETRY_DESC* geosArray = (global GRL_RAYTRACING_GEOMETRY_DESC*) input;
143
144 for (uint leaf_id = start_leaf; leaf_id < end_leaf; leaf_id++)
145 {
146 struct AABB theAABB;
147 refit_bottom_child_quad(leafs + leaf_id, geosArray, &theAABB);
148 theAABB.lower.w = as_float(0xABBADEFF);
149 theAABB.upper.w = 0x00;
150 storeAABBToL1(theAABB, &bbox_scratch[leafsIndexOffset + leaf_id]);
151 }
152 }
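// Indexing note (inferred from the offset math above): bbox_scratch is addressed by a node's
// 64-byte block index relative to the root node, so the box of inner node i goes to
// bbox_scratch[i] and the box of quad leaf k goes to
// bbox_scratch[(quadLeafStart - BVH_ROOT_NODE_OFFSET/64) + k], which is exactly where the
// bottom-up node reduction expects to find its children's boxes.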
153
154 /////////////////////////////////////////////////////////////////////////////////////////////////////
155 //
156 // core bottom-up update functions
157 //
158 //
159
GRL_INLINE void quantise_bounds(
161 struct AABB* input_aabb, float3 len, float3 mant, float3 org, int3 exp,
162 uchar3* lower_uchar,
163 uchar3* upper_uchar)
164 {
165 const float up = 1.0f + ulp;
166 const float down = 1.0f - ulp;
167
168 struct AABB child_aabb = conservativeAABB(input_aabb); // conservative ???
169
170 float3 lower = floor(bitShiftLdexp3((child_aabb.lower.xyz - org) * down, -exp + 8));
171 lower = clamp(lower, (float)(QUANT_MIN), (float)(QUANT_MAX));
172 float3 upper = ceil(bitShiftLdexp3((child_aabb.upper.xyz - org) * up, -exp + 8));
173 upper = clamp(upper, (float)(QUANT_MIN), (float)(QUANT_MAX));
174
175 *lower_uchar = convert_uchar3_rtn(lower);
176 *upper_uchar = convert_uchar3_rtp(upper);
177 }
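// quantise_bounds maps a child box into the parent's 8-bit grid: with origin 'org' and a
// per-axis scale of 2^(exp-8), the stored bytes are conceptually
//   lower_uchar = floor((child.lower - org) * 2^(8-exp))    (rounded towards -inf)
//   upper_uchar = ceil ((child.upper - org) * 2^(8-exp))    (rounded towards +inf)
// Illustrative numbers only: with org.x = 1.0 and exp.x = 3, a child lower.x of 3.3 becomes
// floor(2.3 * 32) = 73. The up/down ulp factors keep the quantised box conservative.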
178
179 typedef struct Qbounds_as_DW {
180 uint32_t xLL; uint32_t xLU; uint32_t xUU;
181 uint32_t yLL; uint32_t yLU; uint32_t yUU;
182 uint32_t zLL; uint32_t zLU; uint32_t zUU;
183 } Qbounds_as_DW;
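// Qbounds_as_DW views the quantised child bounds of an InternalNode as 3 DWORDs per axis,
// assuming lower_x[6] is immediately followed by upper_x[6] in memory (likewise for y and z).
// Per axis the 12 bytes then split as:
//   xLL = lower bytes of children 0..3
//   xLU = lower bytes of children 4..5 (low half) and upper bytes of children 0..1 (high half)
//   xUU = upper bytes of children 2..5
// which is the byte placement encodeQuantisedDataAsDW below reproduces with its shifts.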
184
GRL_INLINE void encodeQuantisedDataAsDW(
186 uchar3 lower_uchar,
187 uchar3 upper_uchar,
188 uint idx,
189 Qbounds_as_DW* qbounds)
190 {
191 uint shift_init = idx * 8;
192 if (idx >= 4) {
193 uint shift = (shift_init - 32);
194 qbounds->xLU |= ((uint)lower_uchar.x) << shift;
195 qbounds->yLU |= ((uint)lower_uchar.y) << shift;
196 qbounds->zLU |= ((uint)lower_uchar.z) << shift;
197 }
198 else {
199 qbounds->xLL |= ((uint)lower_uchar.x) << shift_init;
200 qbounds->yLL |= ((uint)lower_uchar.y) << shift_init;
201 qbounds->zLL |= ((uint)lower_uchar.z) << shift_init;
202 }
203
204 if (idx < 2) {
205 uint shift = (shift_init + 16);
206 qbounds->xLU |= ((uint)upper_uchar.x) << shift;
207 qbounds->yLU |= ((uint)upper_uchar.y) << shift;
208 qbounds->zLU |= ((uint)upper_uchar.z) << shift;
209 }
210 else {
211 uint shift = (shift_init - 16);
212
213 qbounds->xUU |= ((uint)upper_uchar.x) << shift;
214 qbounds->yUU |= ((uint)upper_uchar.y) << shift;
215 qbounds->zUU |= ((uint)upper_uchar.z) << shift;
216 }
217 }
218
GRL_INLINE void encodeChildBounds(uchar3 lower_uchar, uchar3 upper_uchar, uint ch, struct InternalNode* qnode)
220 {
221 qnode->lower_x[ch] = lower_uchar.x; qnode->upper_x[ch] = upper_uchar.x;
222 qnode->lower_y[ch] = lower_uchar.y; qnode->upper_y[ch] = upper_uchar.y;
223 qnode->lower_z[ch] = lower_uchar.z; qnode->upper_z[ch] = upper_uchar.z;
224 }
225
226
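// InternalNode_setBounds_skip_prev requantises and writes out a whole node, but takes the box
// of one child ("prev") as a separate argument: the caller already holds that child's freshly
// reduced AABB in registers, so it is passed in directly rather than re-read from the bbox
// scratch array like the remaining children are.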
GRL_INLINE GRL_OVERLOADABLE void InternalNode_setBounds_skip_prev(struct InternalNode* qbvh_node, uint prevChildIdx, struct AABB* prev_input_aabb, struct AABB* input_aabb, uint childrenIndex, const uint numChildren, struct AABB* aabb_reduced)
228 {
229
230 int3 exp;
231 const float up = 1.0f + ulp;
232 struct AABB conservative_aabb = conservativeAABB(aabb_reduced);
233 const float3 len = AABB_size(&conservative_aabb).xyz * up;
234 const float3 mant = frexp_vec3(len, &exp);
235 const float3 org = conservative_aabb.lower.xyz;
236
237 exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);
238
239 qbvh_node->lower[0] = org.x; qbvh_node->lower[1] = org.y; qbvh_node->lower[2] = org.z;
240
241 qbvh_node->exp_x = exp.x; qbvh_node->exp_y = exp.y; qbvh_node->exp_z = exp.z;
242
243 Qbounds_as_DW qbounds = { 0x0 };
244
245
246 {
247 uchar3 lower_uchar, upper_uchar;
248 quantise_bounds(prev_input_aabb, len, mant, org, exp, &lower_uchar, &upper_uchar);
249
        // encode invalid children; it's enough to set 0x80 in the lower_x bytes
251 uint shift = numChildren * 8;
252 uint shift2 = min(shift, 31u);
253 qbounds.xLL = (0x80808080u << shift2);
254 uint shift3 = max(shift, 32u) - 32;
255 qbounds.xLU = (ushort)(((ushort)0x8080) << (ushort)shift3);
256
257 encodeQuantisedDataAsDW(lower_uchar, upper_uchar, prevChildIdx, &qbounds);
258 //encodeChildBounds(lower_uchar, upper_uchar, prevChildIdx, qbvh_node);
259 }
260
261 uint ch = prevChildIdx == 0;
262 while (ch < numChildren) {
263 uchar3 lower_uchar, upper_uchar;
264 quantise_bounds(input_aabb + ch, len, mant, org, exp, &lower_uchar, &upper_uchar);
265 encodeQuantisedDataAsDW(lower_uchar, upper_uchar, ch, &qbounds);
266 //encodeChildBounds(lower_uchar, upper_uchar, ch, qbvh_node);
267 ch += 1 + (prevChildIdx == (ch + 1));
268 }
269 Qbounds_as_DW* qbounds_dst = (Qbounds_as_DW*)(&qbvh_node->lower_x[0]);
270 *qbounds_dst = qbounds;
271 return;
272 }
273
GRL_INLINE struct AABB refitReduce2Boxes(struct AABB A, struct AABB B)
275 {
276 AABB_extend(&A, &B);
277 // to make it work for TLAS node masks change to this:
278 // A.lower.w = as_float(as_uint(A.lower.w) | as_uint(B.lower.w));
279 A.lower.w = as_float(0xABBADE00u);
280 return A;
281 }
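// The 0xABBADE00 value written into lower.w acts as a "refitted in this pass" marker: leaves
// are tagged with 0xABBADEFF when updated, and the REFIT_DEBUG_CHECKS code below verifies that
// every box it consumes carries the 0xABBADExx pattern before it is reduced into a parent.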
282
GRL_INLINE void refitReduceNodePrev(
284 uint prevIdx,
285 uint leadChildIdx,
286 uint numChildren,
287 struct AABB* globalBox,
288 struct AABB* reduceBox,
289 uint depth,
290 uint NodeIndex)
291 {
292 uint8_t childIgnored = (prevIdx - leadChildIdx);
293
294 # if REFIT_DEBUG_CHECKS
295 bool err = false;
296 if ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)
297 {
298 printf("refitReduceNode6 (loc_id %d): prev (used as child %d) not updated! NodeIndex %d, child nodeIdx %d at depth %d\n",
299 get_local_id(0),
300 childIgnored,
301 NodeIndex,
302 prevIdx,
303 depth);
304 err = true;
305 }
306
307 if ((as_uint(globalBox[NodeIndex].lower.w) & 0xFFFFFF00) == 0xABBADE00u)
308 {
309 printf("refitReduceNode6 (loc_id %d): dst node already updated. NodeIndex %d depth %d\n",
310 get_local_id(0),
311 NodeIndex,
312 depth);
313 }
314
315 bool fail = false;
316 for (uint k = 0; (k < numChildren) && !err; ++k) {
317 if (k != childIgnored) {
318 if ((as_uint(globalBox[leadChildIdx + k].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
319 printf("refitReduceNode6 (loc_id %d): child %d not updated! use prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
320 get_local_id(0),
321 k,
322 prevIdx - leadChildIdx,
323 NodeIndex,
324 leadChildIdx + k,
325 depth);
326 fail = true;
327 }
328 }
329 }
330 err |= fail;
331 # endif
332
    // for each child, 3 bits hold the index of the child slot to load
334 const uint32_t indicesEncoded =
335 (1 << 0) +
336 (2 << 3) +
337 (3 << 6) +
338 (4 << 9) +
339 (5 << 12) +
340 (0 << 15) +
341 (1 << 18) +
342 (2 << 21) +
343 (3 << 24) +
344 (4 << 27);
345 // 1,2,3,4,5
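    // Example of how the encoding is used: shifting right by childIgnored*3 rotates the
    // sequence so that the successive low 3-bit fields enumerate every child slot except
    // childIgnored. For childIgnored == 0 the loads come out as 1,2,3,4,5; for
    // childIgnored == 2 they come out as 3,4,5,0,1, i.e. child 2 (the caller's "prev",
    // already held in *reduceBox) is never re-read from memory.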
346
347
348 uint32_t indicesEncodedShifted = indicesEncoded >> (childIgnored * 3);
349
350 struct AABB* childAABB = globalBox + leadChildIdx;
351 struct AABB temp = childAABB[indicesEncodedShifted & 7];
352 indicesEncodedShifted >>= 3;
353 struct AABB* nextChild = childAABB + (indicesEncodedShifted & 7);
354 struct AABB backlog = temp;
355
356 for (uint child = 2; child < numChildren; child++)
357 {
358 temp = *nextChild;
359 *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
360 indicesEncodedShifted >>= 3;
361 nextChild = childAABB + (indicesEncodedShifted & 7);
362 backlog = temp;
363 }
364
365 *reduceBox = refitReduce2Boxes(*reduceBox, backlog);
366
367 #if REFIT_DEBUG_CHECKS
368 for (uint k = 0; (k < numChildren) && !err; ++k) {
369 if (k != childIgnored) {
370 if (!AABB_subset(&globalBox[leadChildIdx + k], reduceBox)) {
371 printf("refitReduceNode6 (loc_id %d): child AABB %d/%d reduction went wrong! skipped prev %d, NodeIndex %d, child nodeIdx %d at depth %d\n",
372 get_local_id(0),
373 k, numChildren,
374 prevIdx - leadChildIdx,
375 NodeIndex,
376 leadChildIdx + k,
377 depth);
378
379 err = true;
380 }
381 }
382 }
383 if (!err && ((as_uint(reduceBox->lower.w) & 0xFFFFFF00) != 0xABBADE00u)) {
384 printf("refitReduceNode6: havent set the 0xABBADEXXu marker in result node %d at depth %d!\n",
385 NodeIndex,
386 depth);
387 }
388 #endif
389 }
390
391
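// Transposed local id: lanes of the same sub-group get indices strided by the number of
// sub-groups. It is used below to pick each work item's first startpoint; presumably this
// spreads the depth-sorted startpoints more evenly across sub-groups (inferred, not stated
// by the original code).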
GRL_INLINE uint hash_local_id()
393 {
394 return get_sub_group_local_id() * get_num_sub_groups() + get_sub_group_id();
395 }
396
397 //===============================================================
398 //
399 // Core update function
400 //
401 //===============================================================
GRL_INLINE bool refit_treelet_by_single_group(
403 global struct AABB* bbox,
404 local Treelet_by_single_group_locals* loc,
405 uniform global BVHBase* pBvh,
406 uniform RefitTreelet trltDsc,
407 bool encodeQnodes,
408 bool isTipTreelet)
409 {
410 BackPointers* backpointers = BVHBase_GetBackPointers(pBvh);
411 InternalNode* internalNodes = BVHBase_GetInternalNodes(pBvh);
412 uint local_id = get_local_id(0);
413 StartPoint* startPoints = BVHBase_GetRefitStartPoints(pBvh) + trltDsc.startpoint_offset;
414
    // special case for single-path treelets; TODO: rewrite it as subgroup-based
416 if (trltDsc.numStartpoints == 1) {
417 if (local_id == 0) {
418 RefitTreeletTrivial desc = *((RefitTreeletTrivial*)& trltDsc);
419 uint innerNodeIdx = desc.theOnlyNodeIndex;
420 uint numChildren = desc.numChildrenOfTheNode;
421 uint childIndex = desc.childrenOffsetOfTheNode;
422 uint maxDepth = desc.maxDepth;
423
424 uint prevIdx = childIndex;
425 struct AABB myBox = bbox[childIndex];
426 struct AABB prevAABB;
427 uint backpointer = maxDepth > 0 ? *InnerNode_GetBackPointer(backpointers, innerNodeIdx) : 0;
428 InternalNode* curNode = internalNodes + innerNodeIdx;
429 uint currDepth = 0;
430
431 while (1)
432 {
433 prevAABB = myBox;
434 if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
435
                if (!encodeQnodes) { myBox.upper.w = as_float(numChildren + (childIndex << 4)); }
437
438 if (++currDepth > maxDepth) { break; }
439
440 if (encodeQnodes) {
441 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
442 }
443 #if !REFIT_DEBUG_CHECKS
444 else
445 #endif
446 { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
447
448 prevIdx = innerNodeIdx;
449 innerNodeIdx = BackPointer_GetParentIndex(backpointer);
450 backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
451 numChildren = BackPointer_GetNumChildren(backpointer);
452 curNode = internalNodes + innerNodeIdx;
453 childIndex = innerNodeIdx + curNode->childOffset;
454 }
455
456 if (isTipTreelet) {
457 AABB3f reduced3f = AABB3fFromAABB(myBox);
458 pBvh->Meta.bounds = reduced3f;
459 }
460 else {
461 storeAABBToL3(myBox, &bbox[innerNodeIdx]);
462 }
463
464 if (encodeQnodes || isTipTreelet) {
465 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
466 }
467
468 #if REFIT_VERBOSE_LOG
469 printf("single node treelet: storing node idx %d \n", innerNodeIdx);
470 #endif
471 }
472
473 return local_id == 0;
474 }
475
476 local uint* loc_startpoints = loc->startpoints;
477
478
479 #if REFIT_DEBUG_CHECKS
480 if ((trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM)) {
        if (local_id == 0) printf("out of SLM space, trltDsc.numNonTrivialStartpoints > NUM_STARTPOINTS_IN_SLM\n");
482 return local_id == 0;
483 }
484 #endif
485
486 uint SLMedStartpointsOffset = trltDsc.numStartpoints - trltDsc.numNonTrivialStartpoints;
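    // Startpoints are stored sorted by increasing path depth (see finalize_treelets_in_groups),
    // so the first (numStartpoints - numNonTrivialStartpoints) entries are depth-0 paths that are
    // fully handled by the first phase below; only the non-trivial tail is mirrored into SLM.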
487
488 /*=====================================================================
       first phase: we update the startpoint nodes only
490 ----------------------------------------------------------------------*/
491 for (uint startpoint_i = local_id; startpoint_i < trltDsc.numStartpoints; startpoint_i += get_local_size(0)) {
492 uint startpoint = (uint)intel_sub_group_block_read_ui((global uint*)(startPoints + startpoint_i));
493 uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
494 uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
495 if (startpoint_i >= SLMedStartpointsOffset) {
496 uint idx = startpoint_i - SLMedStartpointsOffset;
497 loc_startpoints[idx] = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
498 }
499
500 uint numChildren = BackPointer_GetNumChildren(backpointer);
501 InternalNode* curNode = internalNodes + innerNodeIdx;
502 uint childIndex = innerNodeIdx + curNode->childOffset;
503
504 uint prevIdx = childIndex;
505 struct AABB myBox = bbox[childIndex];
506 struct AABB prevAABB = myBox;
507
508 # if REFIT_DEBUG_CHECKS
509 if (numChildren == 0) {
510 printf("this node has no chidren!\n", 0);
511 AABB_init(&myBox);
512 }
513 # endif
514
515 if (numChildren > 1) { refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, 0, innerNodeIdx); }
516 myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
517
518 #if REFIT_VERBOSE_LOG
519 printf("init phase: at depth 0 storing node idx %d \n", innerNodeIdx);
520 #endif
521 storeAABBToL1(myBox, &bbox[innerNodeIdx]);
522
523 if (encodeQnodes) {
524 InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
525 }
526 }
527
528 uniform uint CurrPeeledDepth = 1;
529 uniform uint numStartpoints = trltDsc.numNonTrivialStartpoints;
530 uint nextFloorStartpoint = hash_local_id();
531
532 uint depthOnionEnd = trltDsc.depthLess64;
533 if (get_local_size(0) == 128) { depthOnionEnd = trltDsc.depthLess128; }
534 if (get_local_size(0) == 256) { depthOnionEnd = trltDsc.depthLess256; }
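    // depthLess64/128/256 were precomputed in finalize_treelets_in_groups as the peeled depth at
    // which fewer than 64/128/256 startpoints remain active; so depthOnionEnd is the depth where
    // the level-by-level "onion" phase stops being worthwhile for this work-group size and each
    // surviving path is walked to its end by a single thread instead.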
535
536 /*=====================================================================
       second phase: we update horizontally until
       the number of active paths drops below the group size
539 ----------------------------------------------------------------------*/
540 while (CurrPeeledDepth < depthOnionEnd) {
541 mem_fence_workgroup_default();
542
543 work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
544 uint start = nextFloorStartpoint;
545 nextFloorStartpoint = numStartpoints;
546
547 for (uint startpoint_i = start; startpoint_i < numStartpoints; startpoint_i += get_local_size(0)) {
548 uint startpoint = loc_startpoints[startpoint_i];
549 uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
550 uint backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
551
552 if (StartPoint_GetDepth(startpoint) > CurrPeeledDepth) {
553 StartPoint newSP = (BackPointer_GetParentIndex(backpointer) << 6) | StartPoint_GetDepth(startpoint);
554 loc_startpoints[startpoint_i] = newSP;
555 nextFloorStartpoint = min(nextFloorStartpoint, startpoint_i);
556 }
557
558 InternalNode* curNode = internalNodes + innerNodeIdx;
559 uint childIndex = innerNodeIdx + curNode->childOffset;
560 uint numChildren = BackPointer_GetNumChildren(backpointer);
561
562 uint prevIdx = childIndex;
563 struct AABB myBox = bbox[childIndex];
564 struct AABB prevAABB = myBox;
565 refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
566
567 myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
568
569 #if REFIT_VERBOSE_LOG
570 printf("onion: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_i, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
571 #endif
572 storeAABBToL1(myBox, &bbox[innerNodeIdx]);
573 if (encodeQnodes) {
574 InternalNode_setBounds_skip_prev(curNode, 0, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
575 }
576 }
577 CurrPeeledDepth++;
578 }
579
580 uint startpoint_idx = nextFloorStartpoint;
581 bool active = startpoint_idx < numStartpoints;
582
583 work_group_barrier(CLK_LOCAL_MEM_FENCE, memory_scope_work_group);
584 StartPoint startpoint = loc_startpoints[startpoint_idx];
585
586 struct AABB myBox;
587 uint prevIdx = 0;
588 uint innerNodeIdx = StartPoint_GetNodeIdx(startpoint);
589
590 /*=====================================================================
       last phase: each thread just continues its path to its end

       only the thread that computes the longest path leaves prematurely
       (that's why the while condition isn't <=); the code for finalizing the root of the treelet
       is special and handled afterwards

       TODO: with proper assignment of paths to lanes we should quite soon reach only three
       active lanes per physical thread, so subgroups could be used for this
599 ----------------------------------------------------------------------*/
600 bool prevActive = active;
601 while (CurrPeeledDepth < trltDsc.maxDepth) {
602 uint backpointer;
603 uint childIndex;
604 InternalNode* curNode = internalNodes + innerNodeIdx;
605 if (active) {
606 childIndex = innerNodeIdx + curNode->childOffset;
607 backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
608 } else if(prevActive){
609 mem_fence_workgroup_default();
610 }
611
612 prevActive = active;
613
614 work_group_barrier(0, memory_scope_work_group);
615 //printf("Start node %d at depth %d, innerNodeIdx %d dying! \n", StartPoint_GetNodeIdx(startpoint), CurrPeeledDepth, innerNodeIdx);
616 if (active) {
617
618 #if REFIT_DEBUG_CHECKS
619 if (CurrPeeledDepth > StartPoint_GetDepth(startpoint))
620 {
621 printf("uppath: startpoint %d <n=%d , d=%d> at depth %d shouldn't be active!\n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth);
622 }
623 #endif
624 if (prevIdx == 0) {
625 myBox = bbox[childIndex];
626 prevIdx = childIndex;
627 }
628 uint numChildren = BackPointer_GetNumChildren(backpointer);
629
630 struct AABB prevAABB = myBox;
631 refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
632 myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
633 #if REFIT_VERBOSE_LOG
634 printf("uppath: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx);
635 #endif
636 active = CurrPeeledDepth < StartPoint_GetDepth(startpoint);
637
638 if (encodeQnodes) {
639 #if !REFIT_DEBUG_CHECKS
640 if (!active)
641 #endif
642 { storeAABBToL1(myBox, &bbox[innerNodeIdx]); }
643 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
644 } else {
645 storeAABBToL1(myBox, &bbox[innerNodeIdx]);
646 }
647
648 prevIdx = innerNodeIdx;
649 innerNodeIdx = BackPointer_GetParentIndex(backpointer);
650 }
651
652 CurrPeeledDepth++;
653 }
654
655 {
656 uint backpointer;
657 uint childIndex;
658 InternalNode* curNode = internalNodes + innerNodeIdx;
659 if (active) {
660 childIndex = innerNodeIdx + curNode->childOffset;
661 backpointer = *InnerNode_GetBackPointer(backpointers, innerNodeIdx);
662 } else if(prevActive) {
663 mem_fence_workgroup_default();
664 }
665
666 work_group_barrier(0, memory_scope_work_group);
667
668 /*=====================================================================
           final step: special processing of the root;
           it's different, since its box is either transferred across groups (written to L3)
           or it is the root of the whole tree and hence fills the global box in the BVH metadata
672 TODO: this should be done in SG as only one thread is active
673 ----------------------------------------------------------------------*/
674 if (active) {
675 if (prevIdx == 0) {
676 myBox = bbox[childIndex];
677 prevIdx = childIndex;
678 }
679 uint numChildren = BackPointer_GetNumChildren(backpointer);
680 struct AABB prevAABB = myBox;
681 refitReduceNodePrev(prevIdx, childIndex, numChildren, bbox, &myBox, CurrPeeledDepth, innerNodeIdx);
682 myBox.upper.w = encodeQnodes ? 0 : as_float(numChildren + (childIndex << 4));
683
684 #if REFIT_VERBOSE_LOG
685 printf("root: startpoint %d <n=%d , d=%d> at depth %d storing node idx %d \n", startpoint_idx, StartPoint_GetNodeIdx(startpoint), StartPoint_GetDepth(startpoint), CurrPeeledDepth, innerNodeIdx/*,WeReInSIMD*/);
686 #endif
687 if (isTipTreelet) {
688 AABB3f reduced3f = AABB3fFromAABB(myBox);
689 pBvh->Meta.bounds = reduced3f;
690 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
691 } else {
692 storeAABBToL3(myBox, &bbox[innerNodeIdx]);
693 if (encodeQnodes) {
694 InternalNode_setBounds_skip_prev(curNode, prevIdx - childIndex, &prevAABB, bbox + childIndex, childIndex, numChildren, &myBox);
695 }
696 }
697 }
698 }
699
700 return active;
701 }
702
703
704 //////////////////////////////////////////////////////////////////////////////////////
705 //
// Internal node encoding as a separate dispatch
707 //
708 //
709
710 // encode qnodes as a separate pass
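// When the refit above runs with encodeQnodes == false, each reduced box carries a small
// descriptor in upper.w: the low bits hold the node's child count and the bits from position 4
// upwards hold the index of its first child (the "numChildren + (childIndex << 4)" writes).
// The pass below decodes that descriptor to requantise every inner node from the scratch boxes.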
GRL_INLINE void post_refit_encode_qnode_tree_per_group(
712 global struct AABB* bbox_scratch,
713 global struct BVHBase* bvh)
714 {
715 uint numInnerNodes = BVHBase_GetNumInternalNodes(bvh);
716 InternalNode* internalNodes = BVHBase_GetInternalNodes(bvh);
717
718 for (uint nodeIdx = get_local_id(0) + 1 /*+1 because node 0 is already updated*/; nodeIdx < numInnerNodes; nodeIdx += get_local_size(0))
719 {
720 struct AABB reduced = bbox_scratch[nodeIdx];
721 # if REFIT_DEBUG_CHECKS
722 if ((as_uint(reduced.lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
723 printf("qnode enc group: NodeIndex %d not updated! \n", nodeIdx);
724 return;
725 }
726 for (uint k = 0; k < (as_uint(reduced.upper.w) & 7); ++k) {
727 uint childIdx = (as_uint(reduced.upper.w) >> 4) + k;
728 if ((as_uint(bbox_scratch[childIdx].lower.w) & 0xFFFFFF00) != 0xABBADE00u) {
729 printf("qnode enc group: child not updated! NodeIndex %d, child nodeIdx %d \n", nodeIdx, childIdx);
730 return;
731 }
732 }
733 # endif
734 struct InternalNode* qbvh_node = internalNodes + nodeIdx;
735 uint childIndex = as_uint(reduced.upper.w) >> 4;
736 uint numChildren = as_uint(reduced.upper.w) & 7;
737 struct AABB* children = bbox_scratch + childIndex;
738 //InternalNode_setBounds(internalNodes + nodeIdx, bbox_scratch + (as_uint(reduced.upper.w) >> 4), as_uint(reduced.upper.w) & 7, &reduced);
739 InternalNode_setBounds_skip_prev(qbvh_node, 0, children, children, childIndex, numChildren, &reduced);
740 }
741 }
742
743 //////////////////////////////////////////////////////////////////////////////////////
744 //
745 // Construction of treelets and paths
746 //
747 //
748
// this is a tiny bit tricky: while the bottom-up thread hasn't yet closed the treelet, this holds the number of startpoints under the node;
// once the thread has closed the treelet, the data becomes the treelet ID
751 typedef uint TreeletNodeData;
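// Bit layout, as implied by the accessors below:
//   bit 31       - isTreeletRoot flag
//   while open:    bits 16..29 = maxDepth, bits 0..15 = numStartpoints (bit 30 marks a tip startpoint)
//   once closed:   bits 0..30  = treeletId (a closed node is always a treelet root)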
752
753 typedef struct TreeletsOpenNodeInfo {
754 // bool isTreeletRoot; // : 1
755 short maxDepth; // : 14
756 uint numStartpoints;// : 16
757 } TreeletsOpenNodeInfo;
758
759 typedef struct TreeletsClosedNodeInfo {
760 // bool isTreeletRoot; // : 1
761 uint treeletId; // : 31 (when treelet is closed)
762 } TreeletsClosedNodeInfo;
763
GRL_INLINE TreeletNodeData ClearTreeletRoot(TreeletNodeData D)
765 {
766 return D & ((1u << 31u) - 1u);
767 }
768
GRL_INLINE uint isTreeletRoot(TreeletNodeData E)
770 {
771 return E >> 31;
772 }
773
GRL_INLINE uint getNumStartpoints(TreeletNodeData E)
775 {
776 return E & ((1 << 16) - 1);
777 }
778
GRL_INLINE uint getMaxDepth(TreeletNodeData E)
780 {
781 return (E >> 16) & ((1 << 14) - 1);
782 }
783
784 // single startpoint treelet
GRL_INLINE uint isTrivialTreeletRoot(TreeletNodeData E)
786 {
787 return (E >> 31) && (getMaxDepth(E) == 0);
788 }
789
GRL_INLINE TreeletNodeData SetTipStartpoint(TreeletNodeData D)
791 {
792 return ClearTreeletRoot(D) | (1 << 30);
793 }
794
GRL_INLINE TreeletNodeData SetTreeletRoot(TreeletNodeData D)
796 {
797 return D | (1 << 31);
798 }
799
GRL_INLINE TreeletsOpenNodeInfo DecodeOpenInfo(TreeletNodeData E)
801 {
802 TreeletsOpenNodeInfo I;
803 I.maxDepth = getMaxDepth(E);
804 I.numStartpoints = getNumStartpoints(E);
805 return I;
806 }
807
GRL_INLINE TreeletNodeData EncodeOpenInfo(TreeletsOpenNodeInfo I, bool isRoot)
809 {
810 TreeletNodeData D = isRoot ? (1 << 31) : 0;
811 D |= (I.maxDepth & ((1 << 14) - 1)) << 16;
812 D |= I.numStartpoints & ((1 << 16) - 1);
813 return D;
814 }
815
GRL_INLINE TreeletsClosedNodeInfo DecodeClosedInfo(TreeletNodeData E)
817 {
818 TreeletsClosedNodeInfo I;
819 I.treeletId = E & ((1u << 31u) - 1u);
820 return I;
821 }
822
GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(TreeletsClosedNodeInfo I)
824 {
825 TreeletNodeData D = (1u << 31u); // closed is always a root!
826 D |= I.treeletId & ((1u << 31u) - 1u);
827 return D;
828 }
829
GRL_INLINE TreeletNodeData GRL_OVERLOADABLE EncodeClosedInfo(uint treeletId)
831 {
832 TreeletNodeData D = (1 << 31); // closed is always a root!
833 D |= treeletId & ((1u << 31u) - 1u);
834 return D;
835 }
836
GRL_INLINE void chk_close_Treelet(
838 RefitTreelet* TreeletDescsArr,
839 TreeletNodeData* nodeTreeletDataArr,
840 uint* StartPointBuffer,
841 uint* currStartpoint,
842 TreeletNodeData nodeData,
843 TreeletsOpenNodeInfo* nodeOpenInfo,
844 uint nodeIdx,
845 uint* treeletDescIdx)
846 {
847 if (isTreeletRoot(nodeData))
848 {
849 TreeletNodeData encoded = 0;
850 if (nodeOpenInfo->numStartpoints == 1)
851 {
852 encoded = ClearTreeletRoot(SetTipStartpoint(nodeData));
853 }
854 else
855 {
856 RefitTreelet RTdesc;
857 RTdesc.startpoint_offset = *currStartpoint;
858 *currStartpoint += nodeOpenInfo->numStartpoints;
859 RTdesc.numStartpoints = nodeOpenInfo->numStartpoints;
860 RTdesc.maxDepth = nodeOpenInfo->maxDepth;
861 TreeletDescsArr[*treeletDescIdx] = RTdesc;
862 encoded = EncodeClosedInfo(*treeletDescIdx);
863 *treeletDescIdx = *treeletDescIdx + 1;
864 TreeletsOpenNodeInfo infoDefault = { 0, 0 };
865 *nodeOpenInfo = infoDefault;
866 }
867
868 nodeTreeletDataArr[nodeIdx] = encoded;
869 }
870 // printf("close_Treelet %d, nodeOpenInfo.numStartpoints %d, RTdesc.maxDepth %d, RTdesc.startpoint_offset %d\n", treeletDescIdx, nodeOpenInfo.numStartpoints, RTdesc.maxDepth, RTdesc.startpoint_offset);
871 }
872
873
// TreeletNodeData* treelets holds a per-node property; after running this, some nodes are marked as treelet roots
GRL_INLINE void treelet_bottom_up_mark_treelets(
876 global struct BVHBase* bvh,
877 global InternalNode* internalNodes,
878 global StartPoint* scratch_startpoints,
879 uint curNodeIndex,
880 BackPointers* backPointers,
881 global TreeletNodeData* treelets,
882 uint refitTreeletsDataStart,
883 uint* startpointAlloc)
884 {
885 TreeletsOpenNodeInfo currInfo;
886 currInfo.maxDepth = 0;
887 currInfo.numStartpoints = 1;
888
889 global RefitTreelet* treeletDescs = (global RefitTreelet*) (((global char*)bvh) + (refitTreeletsDataStart * 64));
890
891 treelets[curNodeIndex] = EncodeOpenInfo(currInfo, true);
892
893 /* the start node got already processed, thus go to its parent node */
894 uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
895 curNodeIndex = parentPointer >> 6;
896
897 bool isInTip = false;
898 while (curNodeIndex != 0x03FFFFFF)
899 {
900 uint numChildrenTotal = 0;
        // numChildrenTotal and parentPointer get updated...
        // atomic trickery on backpointers: only the last arriving thread continues upward
903 {
904 /* increment refit counter that counts refitted children of current node */
905 global uint* pCurrentBackpointer = (global uint*)InnerNode_GetBackPointer(backPointers, curNodeIndex);
906 mem_fence_gpu_invalidate();
907 parentPointer = 1 + atomic_inc_global(pCurrentBackpointer);
908
909 /* if all children got refitted, then continue */
910 const uint numChildrenRefitted = (parentPointer >> 0) & 0x7;
911 numChildrenTotal = (parentPointer >> 3) & 0x7;
912
913 if (numChildrenRefitted != numChildrenTotal)
914 return;
915
916 /* reset refit counter for next refit */
917 *pCurrentBackpointer = (parentPointer & 0xfffffff8);
918 }
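        // Backpointer format used here: bits 31..6 hold the parent node index (0x03FFFFFF for
        // the root, which terminates the upward walk), bits 5..3 the total child count, and
        // bits 2..0 a refit counter that each finishing child increments atomically; only the
        // last child to arrive sees numChildrenRefitted == numChildrenTotal and continues up,
        // after resetting the counter for the next refit.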
919
920 /* get children treelets */
921 global struct InternalNode* node = internalNodes + curNodeIndex;
922 uint childrenIndices = curNodeIndex + node->childOffset;
923 global TreeletNodeData* childrenTreelets = treelets + childrenIndices;
924
        // yes, it is possible we are pulling garbage here, but we won't use it;
        // this is done so the loads are not spoiled by control flow
927 TreeletNodeData dataCh0 = childrenTreelets[0]; TreeletNodeData dataCh1 = childrenTreelets[1];
928 TreeletNodeData dataCh2 = childrenTreelets[2]; TreeletNodeData dataCh3 = childrenTreelets[3];
929 TreeletNodeData dataCh4 = childrenTreelets[4]; TreeletNodeData dataCh5 = childrenTreelets[5];
930
931 // zero out the potential trash
932 if (numChildrenTotal < 3) dataCh2 = 0;
933 if (numChildrenTotal < 4) dataCh3 = 0;
934 if (numChildrenTotal < 5) dataCh4 = 0;
935 if (numChildrenTotal < 6) dataCh5 = 0;
936
937 TreeletsOpenNodeInfo infoCh0 = DecodeOpenInfo(dataCh0);
938 TreeletsOpenNodeInfo infoCh1 = DecodeOpenInfo(dataCh1);
939 TreeletsOpenNodeInfo infoCh2 = DecodeOpenInfo(dataCh2);
940 TreeletsOpenNodeInfo infoCh3 = DecodeOpenInfo(dataCh3);
941 TreeletsOpenNodeInfo infoCh4 = DecodeOpenInfo(dataCh4);
942 TreeletsOpenNodeInfo infoCh5 = DecodeOpenInfo(dataCh5);
943
944 uint numChildrenBeingRoots = isTreeletRoot(dataCh0) + isTreeletRoot(dataCh1) + isTreeletRoot(dataCh2) + isTreeletRoot(dataCh3) + isTreeletRoot(dataCh4) + isTreeletRoot(dataCh5);
        // see if we should merge the treelets; if not, then we should move to the tip.
946 currInfo.numStartpoints = infoCh0.numStartpoints + infoCh1.numStartpoints + infoCh2.numStartpoints + infoCh3.numStartpoints + infoCh4.numStartpoints + infoCh5.numStartpoints;
947
948 bool isTipStartpoint = false;
949 if (!isInTip)
950 {
951 // TODO: threshold could be a dynamic parameter based on the number of actual inner nodes
952 bool mergeTreelets = ((currInfo.numStartpoints > 0) && (currInfo.numStartpoints < TREELET_NUM_STARTPOINTS));
953 bool allChildrenRootsCurrently = numChildrenTotal == numChildrenBeingRoots;
954 if (mergeTreelets && allChildrenRootsCurrently)
955 {
956 childrenTreelets[0] = ClearTreeletRoot(dataCh0);
                childrenTreelets[1] = ClearTreeletRoot(dataCh1); // with the root bit cleared this will no longer be recognised as a treelet root.
958 if (numChildrenTotal > 2) childrenTreelets[2] = ClearTreeletRoot(dataCh2);
959 if (numChildrenTotal > 3) childrenTreelets[3] = ClearTreeletRoot(dataCh3);
960 if (numChildrenTotal > 4) childrenTreelets[4] = ClearTreeletRoot(dataCh4);
961 if (numChildrenTotal > 5) childrenTreelets[5] = ClearTreeletRoot(dataCh5);
962 }
963 else
964 {
965 isInTip = true;
966 isTipStartpoint = allChildrenRootsCurrently;
967 }
968 }
969
970 // close any roots underneath
971 if (isInTip && numChildrenBeingRoots)
972 {
973 uint trivialRoots = isTrivialTreeletRoot(dataCh0) + isTrivialTreeletRoot(dataCh1) + isTrivialTreeletRoot(dataCh2) +
974 isTrivialTreeletRoot(dataCh3) + isTrivialTreeletRoot(dataCh4) + isTrivialTreeletRoot(dataCh5);
975
976 uint treeletId = 0;
977 uint bottomStartpointSpace = 0;
978
979 uint startpointsFromTiptree = trivialRoots;
980
981 if (trivialRoots) isTipStartpoint = false;
982
983 if (numChildrenBeingRoots > trivialRoots)
984 {
985 startpointsFromTiptree += // startpoint ONLY from tiptree
986 (1 - isTreeletRoot(dataCh0)) * infoCh0.numStartpoints +
987 (1 - isTreeletRoot(dataCh1)) * infoCh1.numStartpoints +
988 (1 - isTreeletRoot(dataCh2)) * infoCh2.numStartpoints +
989 (1 - isTreeletRoot(dataCh3)) * infoCh3.numStartpoints +
990 (1 - isTreeletRoot(dataCh4)) * infoCh4.numStartpoints +
991 (1 - isTreeletRoot(dataCh5)) * infoCh5.numStartpoints;
992
993 treeletId = atomic_add_global((global uint*)BVHBase_GetRefitTreeletCntPtr(bvh), numChildrenBeingRoots - trivialRoots);
994 bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints - startpointsFromTiptree);
995 }
996
997 currInfo.numStartpoints = startpointsFromTiptree;
998
999 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh0, &infoCh0, childrenIndices + 0, &treeletId);
1000 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh1, &infoCh1, childrenIndices + 1, &treeletId);
1001 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh2, &infoCh2, childrenIndices + 2, &treeletId);
1002 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh3, &infoCh3, childrenIndices + 3, &treeletId);
1003 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh4, &infoCh4, childrenIndices + 4, &treeletId);
1004 chk_close_Treelet(treeletDescs, treelets, scratch_startpoints, &bottomStartpointSpace, dataCh5, &infoCh5, childrenIndices + 5, &treeletId);
1005 }
1006
1007 if (isTipStartpoint)
1008 {
1009 currInfo.maxDepth = 0;
1010 currInfo.numStartpoints = 1;
1011 }
1012 else
1013 {
            // reduce max depth and number of startpoints underneath
1015 currInfo.maxDepth = max(max(max(infoCh0.maxDepth, infoCh1.maxDepth),
1016 max(infoCh2.maxDepth, infoCh3.maxDepth)),
1017 max(infoCh4.maxDepth, infoCh5.maxDepth)) + 1;
1018 }
1019
1020 treelets[curNodeIndex] = EncodeOpenInfo(
1021 currInfo,
            !isInTip /* mark the merged treelet as a new root iff we are still in the bottom part */);
1023
1024 /* make parent node the current node */
1025 curNodeIndex = parentPointer >> 6;
1026 }
1027
1028 uint treeletId = *BVHBase_GetRefitTreeletCntPtr(bvh);
1029
1030 uint bottomStartpointSpace = atomic_add_global((global uint*)startpointAlloc, currInfo.numStartpoints);
1031
1032 treelets[0] = EncodeClosedInfo(treeletId);
1033 RefitTreelet tipTreeletDesc;
1034 tipTreeletDesc.startpoint_offset = bottomStartpointSpace;
1035 tipTreeletDesc.numStartpoints = currInfo.numStartpoints;
1036 tipTreeletDesc.maxDepth = currInfo.maxDepth;
1037
1038 treeletDescs[treeletId] = tipTreeletDesc;
1039
1040 uint realNumberOfTreelets = treeletId + 1;
    // intentionally we set one less, because this number is used as the number of groups for the dispatch, which is the number of bottom treelets,
    // so subtract 1. Except for a single-treelet tree, where it should stay 1.
1043 uint numStartingTreelets = (treeletId == 0) ? 1 : treeletId;
1044
1045 *BVHBase_GetRefitTreeletCntPtr(bvh) = numStartingTreelets;
1046
1047 uint treeletDescSpaceIn64B = (realNumberOfTreelets * sizeof(RefitTreelet) + 63) >> 6;
1048 uint startpointSpaceIn64B = ((bottomStartpointSpace + currInfo.numStartpoints) * sizeof(StartPoint) + 63) >> 6;
1049 bvh->refitStartPointDataStart = refitTreeletsDataStart + treeletDescSpaceIn64B;
1050 bvh->BVHDataEnd = refitTreeletsDataStart +treeletDescSpaceIn64B + startpointSpaceIn64B;
1051 *startpointAlloc = 0;
1052 }
1053
1054
GRL_INLINE void find_refit_treelets(
1056 global struct BVHBase* bvh,
1057 global TreeletNodeData* treelets,
1058 global uint* scratchStartpoints,
1059 global uint* startpointAlloc)
1060 {
1061 /* get pointer to inner nodes and back pointers */
1062 uniform global InternalNode* inner_nodes = (global InternalNode*) BVHBase_GetInternalNodes(bvh);
1063
1064 /* construct range of nodes that each work group will process */
1065 uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
1066
1067 varying ushort lane = get_sub_group_local_id();
1068 varying uint global_id = get_local_id(0) + get_group_id(0) * get_local_size(0);
1069
1070 uint numBackpointers = BVHBase_GetNumInternalNodes(bvh);
1071
1072 // align to 64B and divide
1073 uint treeletOffsetIn64B = ((numBackpointers * sizeof(uint)) + 63) >> 6;
1074
1075 uint refitTreeletsDataStart = bvh->backPointerDataStart + treeletOffsetIn64B;
1076 if (global_id == 0)
1077 {
1078 bvh->refitTreeletsDataStart = refitTreeletsDataStart;
1079 }
1080
1081 global struct InternalNode* curNode = &inner_nodes[global_id];
1082
1083 varying ushort has_startpoint = 0;
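    // A node whose type is not BVH_INTERNAL_NODE has leaf children (presumably a "fat leaf");
    // such nodes are the startpoints from which the bottom-up treelet marking walks upward.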
1084 if (global_id < numInnerNodes) {
1085 if ((curNode->nodeType != BVH_INTERNAL_NODE))
1086 {
1087 has_startpoint = 1;
1088 }
1089 }
1090
1091 if (has_startpoint == 0)
1092 return;
1093
1094 treelet_bottom_up_mark_treelets(
1095 bvh,
1096 inner_nodes,
1097 scratchStartpoints,
1098 global_id,
1099 BVHBase_GetBackPointers(bvh),
1100 treelets,
1101 refitTreeletsDataStart,
1102 startpointAlloc);
1103 }
1104
GRL_INLINE void assign_refit_startpoints_to_treelets(
1106 global struct BVHBase* bvh,
1107 global TreeletNodeData* treelets,
1108 global uint* scratchStartpoints)
1109 {
1110 /* get pointer to inner nodes and back pointers */
1111 uniform global struct InternalNode* inner_nodes = (global struct InternalNode*) BVHBase_GetInternalNodes(bvh);
1112
1113 /* construct range of nodes that each work group will process */
1114 uniform const uint numInnerNodes = BVHBase_numNodes(bvh);
1115
1116 varying ushort lane = get_sub_group_local_id();
1117 varying uint starPointNode = get_local_id(0) + get_group_id(0) * get_local_size(0);
1118 varying uint curNodeIndex = starPointNode;
1119 global struct InternalNode* curNode = &inner_nodes[curNodeIndex];
1120
1121 varying ushort is_startpoint = 0;
1122
1123 if (curNodeIndex < numInnerNodes)
1124 {
1125 if ((curNode->nodeType != BVH_INTERNAL_NODE))
1126 {
1127 is_startpoint = 1;
1128 }
1129 }
1130
1131 if (is_startpoint == 0)
1132 {
1133 return;
1134 }
1135
1136 BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
1137
1138 RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
1139 uint numTreelets = *BVHBase_GetRefitTreeletCntPtr(bvh);
1140 if (numTreelets > 1) numTreelets++;
1141
1142 uint myDepthWhenDead = 0;
1143 uint startpointsBeforeMe = 0;
1144 bool dead = false;
1145
1146 uint prevNodeIndex = 0x03FFFFFF;
1147
1148 while (curNodeIndex != 0x03FFFFFF)
1149 {
1150 TreeletNodeData nodeData = treelets[curNodeIndex];
1151
1152 uint parentPointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);
1153 uint numChildren = BackPointer_GetNumChildren(parentPointer);
1154
        // this is the counterpart of the atomic-based entrance decision:
        // the path that stays alive is the longest one; if two are equal, take the one that came through the child with the smaller index.
1157 if (prevNodeIndex != 0x03FFFFFF)
1158 {
1159 uint leadChildOfCur = curNodeIndex + inner_nodes[curNodeIndex].childOffset;
1160 uint childEnd = numChildren + leadChildOfCur;
1161
1162 uint longestPath = 0;
1163 uint longestPathChildIdx = leadChildOfCur;
1164
1165 for (uint child = leadChildOfCur; child < childEnd; child++)
1166 {
1167 TreeletNodeData childData = treelets[child];
1168 if (!isTreeletRoot(childData))
1169 {
1170 TreeletsOpenNodeInfo childinfo = DecodeOpenInfo(childData);
1171 if (longestPath <= childinfo.maxDepth) {
1172 longestPathChildIdx = child;
1173 longestPath = childinfo.maxDepth + 1;
1174 }
1175
1176 if (child < prevNodeIndex)
1177 {
                    // also count how many startpoints there are before me (used to place the startpoint in the proper slot)
1179 startpointsBeforeMe += childinfo.numStartpoints;
1180 }
1181 }
1182 }
1183
1184 if (!dead && prevNodeIndex != longestPathChildIdx)
1185 {
1186 dead = true;
1187 //printf("starPointNode %d dies in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1188 }
1189
            if (!dead) // this "if" is not an "else" to the above, as we might already be dead while coming through the same child index
1191 {
1192 myDepthWhenDead = longestPath;
1193 // it is a startpoint
1194 //printf("starPointNode %d in node %d lives up, its myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1195 }
1196
1197 if (starPointNode == (uint)-1) {
                // we just entered the upper treelet; if we are alive, we can be a new startpoint in the new treelet
1199 if (dead)
1200 {
1201 //printf("starPointNode %d disappears in node %d, myDepthWhenDead %d\n", starPointNode, curNodeIndex, myDepthWhenDead);
1202 // and we are dead, so we are not a startpoint of tip,
1203 // so we must disappear to not be added as a startpoint.
1204 return;
1205 }
1206 else
1207 {
1208 // it is a startpoint
1209 //printf("starPointNode %d in node %d becoming its new startpoint\n", starPointNode, curNodeIndex);
1210 starPointNode = curNodeIndex;
1211 }
1212 }
1213 }
1214
1215 if (isTreeletRoot(nodeData))
1216 {
1217 TreeletsClosedNodeInfo info = DecodeClosedInfo(nodeData);
1218 RefitTreelet treeletDesc = treeletDescs[info.treeletId];
1219 uint startpointSlot = treeletDesc.startpoint_offset + startpointsBeforeMe;
1220 scratchStartpoints[startpointSlot] = (starPointNode << 6) + (myDepthWhenDead & ((1 << 6) - 1));
1221
1222 //printf("Adding to treeletID %d at root %d startpoint %d StartNodeIdx %d, depth %d\n", info.treeletId, curNodeIndex, startpointSlot, starPointNode, myDepthWhenDead);
1223
1224 if (dead) return;
1225 myDepthWhenDead = 0;
1226 startpointsBeforeMe = 0;
1227 starPointNode = (uint)-1;
1228 }
1229
1230 /* make parent node the current node */
1231 prevNodeIndex = curNodeIndex;
1232 curNodeIndex = BackPointer_GetParentIndex(parentPointer);
1233 //if(!dead)
1234 //printf("starPointNode %d move from node %d to %d\n", starPointNode, prevNodeIndex, curNodeIndex);
1235 }
1236 }
1237
1238 const uint FINALIZE_TREELETS_SLM_DEPTHS_SPACE = 32;
1239
GRL_INLINE void finalize_treelets_in_groups(
1241 global struct BVHBase* bvh,
1242 global uint* scratchStartpoints,
1243 local uint* depths)
1244 {
1245 uint numTreeletsExecuted = *BVHBase_GetRefitTreeletCntPtr(bvh);
1246
1247 uint local_id = get_local_id(0);
1248
1249 uint numTreelets = (numTreeletsExecuted > 1) ? numTreeletsExecuted + 1 : numTreeletsExecuted;
1250
1251 RefitTreelet* treeletDescs = BVHBase_GetRefitTreeletDescs(bvh);
1252
1253 for (uint treeletId = get_group_id(0); treeletId < numTreelets; treeletId += numTreeletsExecuted)
1254 {
1255 if (treeletId == numTreeletsExecuted && treeletId != 0) { work_group_barrier(CLK_LOCAL_MEM_FENCE); }
1256
1257 RefitTreelet treeletDesc = treeletDescs[treeletId];
1258 StartPoint* srcStartpoints = scratchStartpoints + treeletDesc.startpoint_offset;
1259 if (treeletDesc.numStartpoints <= 1)
1260 {
            // for lower latency we store 1-element treelets as RefitTreeletTrivial;
            // this happens most of the time for the tip treelet
1263 if (local_id == 0)
1264 {
1265 RefitTreeletTrivial tr = { 0, treeletDesc.numStartpoints, 0, treeletDesc.maxDepth, 0 };
1266 if (treeletDesc.numStartpoints == 1)
1267 {
1268 StartPoint sp = srcStartpoints[0];
1269
1270 tr.theOnlyNodeIndex = StartPoint_GetNodeIdx(sp);
1271 uint backpointer = *InnerNode_GetBackPointer(BVHBase_GetBackPointers(bvh), tr.theOnlyNodeIndex);
1272 tr.numChildrenOfTheNode = BackPointer_GetNumChildren(backpointer);
1273 tr.childrenOffsetOfTheNode = BVHBase_GetInternalNodes(bvh)[tr.theOnlyNodeIndex].childOffset + tr.theOnlyNodeIndex;
1274 }
1275 RefitTreeletTrivial* trivial = (RefitTreeletTrivial*)(treeletDescs + treeletId);
1276 *trivial = tr;
1277 #if REFIT_VERBOSE_LOG
1278 printf("treelet trivial %d {\n theOnlyNodeIndex = %d;\n numStartpoints = %d;\n childrenOffsetOfTheNode = %d;\n maxDepth =%d;\n numChildrenOfTheNode = %d;\n}\n",
1279 treeletId,
1280 tr.theOnlyNodeIndex,
1281 tr.numStartpoints,
1282 tr.childrenOffsetOfTheNode,
1283 tr.maxDepth,
1284 tr.numChildrenOfTheNode);
1285 #endif
1286 }
1287 }
1288 else
1289 {
1290 #define SKIP_PATHS_SORTING 0
1291 #if SKIP_PATHS_SORTING
1292 StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
1293 for (uint startpointID = local_id; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
1294 {
1295 dstStartpoints[startpointID] = srcStartpoints[startpointID];
1296 }
1297 #else
1298 //if (local_id == 0) { printf("treelet %d, numStartpoints = %d\n", treeletId, numStartpoints); }
1299
1300 if (local_id <= treeletDesc.maxDepth) {
1301 depths[local_id] = 0;
1302 // printf("initializing slm treelet %d, depths[%d] = 0\n", treeletId, local_id);
1303 }
1304 work_group_barrier(CLK_LOCAL_MEM_FENCE);
1305
1306 uint loopSize = ((treeletDesc.numStartpoints + (get_sub_group_size() - 1)) / get_sub_group_size()) * get_sub_group_size();
1307
1308 // collect histogram of how many paths of given length we have
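            // What follows is effectively a counting sort by path depth: build a per-depth
            // histogram in SLM (partly accumulated in registers, D_c, for the cached portion),
            // turn it into prefix sums so each depth gets an output offset, then scatter the
            // startpoints so that dstStartpoints ends up ordered by increasing depth.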
1309
1310 // keep count of depth 0
1311 uint val = 0;
1312
            // optimize: we will load each StartPoint only once
1314 uint S_c[8];
1315 // optimize: keep accumulated numbers in registers to limit number of atomic ops
1316 uint D_c[8] = { 0 };
1317
1318 uint cached_threshold = 8 * get_local_size(0);
1319 cached_threshold = min(cached_threshold, treeletDesc.numStartpoints);
1320
1321 uint loop_turn = 0;
1322 uint sgid = get_sub_group_local_id();
1323
1324 for (uint startpointID = local_id+ cached_threshold; startpointID < treeletDesc.numStartpoints; startpointID += get_local_size(0))
1325 {
1326 uint dstSlot = StartPoint_GetDepth(srcStartpoints[startpointID]);
1327 atomic_inc((volatile local uint*) (depths + dstSlot));
1328 }
1329
1330 uint HistogramSG = 0;
1331 if (treeletDesc.maxDepth < 8)
1332 {
1333 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1334 {
1335 StartPoint S = srcStartpoints[startpointID];
1336 S_c[loop_turn++] = S;
1337 uint dstSlot = StartPoint_GetDepth(S);
1338 D_c[dstSlot]++;
1339 }
1340
1341 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1342 {
1343 val = sub_group_reduce_add(D_c[d]);
1344 if (sgid == d)
1345 {
1346 HistogramSG = val;
1347 }
1348 }
1349 if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
1350 {
1351 atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
1352 }
1353 }
1354 else
1355 {
1356 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1357 {
1358 StartPoint S = srcStartpoints[startpointID];
1359 S_c[loop_turn++] = S;
1360 uint dstSlot = StartPoint_GetDepth(S);
1361 atomic_inc((volatile local uint*) (depths + dstSlot));
1362 }
1363 }
1364
1365 work_group_barrier(CLK_LOCAL_MEM_FENCE);
1366
1367 #if REFIT_VERBOSE_LOG
1368 if (local_id == 0)
1369 {
1370 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1371 {
1372 printf("treelet %d depths[%d] = %d\n", treeletId, d, depths[d]);
1373 }
1374 }
1375 #endif
1376
1377 if (treeletDesc.maxDepth < get_sub_group_size())
1378 {
1379 if (get_sub_group_id() == 0)
1380 {
1381
1382 uint cntOfDepth = 0;
1383 if (sgid <= treeletDesc.maxDepth) {
1384 cntOfDepth = depths[sgid];
1385 }
1386 uint pref_sum = sub_group_scan_exclusive_add(cntOfDepth);
1387 depths[sgid] = pref_sum;
1388
1389 uint numLeft = treeletDesc.numStartpoints - (pref_sum);
1390 uint depthLess64 = (numLeft < 64 ) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1391 uint depthLess128 = (numLeft < 128) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1392 uint depthLess256 = (numLeft < 256) ? (uint)sgid : (uint)treeletDesc.maxDepth;
1393
1394 // filling data for thread 0 who will save this to mem
1395 treeletDesc.depthLess64 = sub_group_reduce_min(depthLess64);
1396 treeletDesc.depthLess128 = sub_group_reduce_min(depthLess128);
1397 treeletDesc.depthLess256 = sub_group_reduce_min(depthLess256);
1398 treeletDesc.numNonTrivialStartpoints = treeletDesc.numStartpoints - cntOfDepth;
1399
1400 if (sgid == 0) {
1401 treeletDescs[treeletId] = treeletDesc;
1402 #if REFIT_VERBOSE_LOG
1403 printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; \n maxDepth = %d;\n depthLess64 = %d;\n depthLess128 = %d;\n depthLess256 = %d;\n}\n",
1404 treeletId,
1405 treeletDesc.startpoint_offset,
1406 treeletDesc.numStartpoints,
1407 treeletDesc.numNonTrivialStartpoints,
1408 treeletDesc.maxDepth,
1409 treeletDesc.depthLess64,
1410 treeletDesc.depthLess128,
1411 treeletDesc.depthLess256);
1412 #endif
1413 }
1414 }
1415 }
1416 else if (local_id <= treeletDesc.maxDepth) {
1417 uint thisdepthcount = depths[local_id];
1418 treeletDesc.depthLess64 = 0;
1419 treeletDesc.depthLess128 = 0;
1420 treeletDesc.depthLess256 = 0;
1421 uint numLeft = treeletDesc.numStartpoints;
1422 uint pref_sum = 0;
1423
1424 for (uint d = 0; d < local_id; d++)
1425 {
1426 uint depthCnt = depths[d];
1427 if (numLeft > 64) { treeletDesc.depthLess64 = d + 1; }
1428 if (numLeft > 128) { treeletDesc.depthLess128 = d + 1; }
1429 if (numLeft > 256) { treeletDesc.depthLess256 = d + 1; }
1430 pref_sum += depthCnt;
1431 numLeft -= depthCnt;
1432 if (d == 0) { treeletDesc.numNonTrivialStartpoints = numLeft; }
1433 }
1434
1435 if (local_id == treeletDesc.maxDepth)
1436 {
1437 treeletDescs[treeletId] = treeletDesc;
1438 #if REFIT_VERBOSE_LOG
1439 printf("treelet %d {\n startpoint_offset = %d;\n numStartpoints = %d;\n numNonTrivialStartpoints = %d; maxDepth = %d;\n depthLess64 = %d; depthLess128 = %d; depthLess256 = %d;\n}\n",
1440 treeletId,
1441 treeletDesc.startpoint_offset,
1442 treeletDesc.numStartpoints,
1443 treeletDesc.numNonTrivialStartpoints,
1444 treeletDesc.maxDepth,
1445 treeletDesc.depthLess64,
1446 treeletDesc.depthLess128,
1447 treeletDesc.depthLess256);
1448 #endif
1449 }
1450 }
1451
1452 StartPoint* dstStartpoints = BVHBase_GetRefitStartPoints(bvh) + treeletDesc.startpoint_offset;
1453
1454 work_group_barrier(CLK_LOCAL_MEM_FENCE);
1455
1456 loop_turn = 0;
1457 if (treeletDesc.maxDepth < 8)
1458 {
1459 uint prefixSG = 0;
1460
                // make prefixSG hold the start of the interval reserved for this subgroup's paths of depth == sgid
1462 if (sgid <= treeletDesc.maxDepth && HistogramSG != 0)
1463 {
1464 prefixSG = atomic_add((volatile local uint*) (depths + sgid), HistogramSG);
1465 }
1466
1467 // from now on all sgs run independently
1468
                // make D_c hold the offset of the interval reserved for the given lane
1470 for (uint d = 0; d <= treeletDesc.maxDepth; d++)
1471 {
1472 uint thisDPrefixSg = sub_group_broadcast(prefixSG, d);
1473 uint thisLaneCount = D_c[d];
1474 uint laneOffset = sub_group_scan_exclusive_add(thisLaneCount);
1475 D_c[d] = laneOffset + thisDPrefixSg;
1476 }
1477
1478 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1479 {
1480 StartPoint S = S_c[loop_turn++];
1481 uint d = StartPoint_GetDepth(S);
1482 uint dstSlot = D_c[d]++;
1483 dstStartpoints[dstSlot] = S;
1484 }
1485 }
1486 else
1487 {
1488 for (uint startpointID = local_id; startpointID < cached_threshold; startpointID += get_local_size(0))
1489 {
1490 StartPoint S = S_c[loop_turn++];
1491 uint d = StartPoint_GetDepth(S);
1492 uint dstSlot = atomic_inc((volatile local uint*) (depths + d));
1493 dstStartpoints[dstSlot] = S;
1494 }
1495 }
1496
1497 for (uint srcStartpointID = local_id+ cached_threshold; srcStartpointID < treeletDesc.numStartpoints; srcStartpointID += get_local_size(0))
1498 {
1499 StartPoint S = srcStartpoints[srcStartpointID];
1500 uint d = StartPoint_GetDepth(srcStartpoints[srcStartpointID]);
1501 uint dstSlot = atomic_inc((volatile local uint*) (depths+ d));
1502 dstStartpoints[dstSlot] = S;
1503 }
1504 #endif //skip sorting
1505 }
1506 }
1507 }
1508