//
// Copyright (C) 2009-2022 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "bvh_build_refit.h"
#include "libs/lsc_intrinsics.h"
#include "morton/morton_common.h"

/*

    POSTSORT PHASE2:
    Two kernels here, selected by MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD, which is set to a very big value.
    1. parallel_build_phase2_refit       - performs the refit using global synchronization and mem_fence_gpu_invalidate.
                                           This kernel should be used only for very large BVHs; it is faster than the
                                           non-SLM fallback in parallel_build_phase2_refit_local.
    2. parallel_build_phase2_refit_local - should be used for most cases. The nodes allocated in phase0 usually fit
                                           into SLM, but there is also a non-SLM fallback, since the decision on which
                                           kernel to run is based on the node estimates on the host side.

*/

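/*
    Illustrative sketch (not part of the build): how the host side might choose between the two
    phase2 kernels described above, based on its node-count estimate. The names estimatedP2Nodes,
    dispatchPhase2Refit and dispatchPhase2RefitLocal are hypothetical placeholders, not actual
    GRL host API.

    void dispatch_postsort_phase2( uint estimatedP2Nodes )
    {
        if (estimatedP2Nodes > MORTON_BUILDER_P2_SINGLE_WG_THRESHOLD)
            dispatchPhase2Refit();        // one 16-wide workgroup per refit startpoint, global sync
        else
            dispatchPhase2RefitLocal();   // single 512-wide workgroup, SLM path with non-SLM fallback
    }
*/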

GRL_INLINE void refit_bottom_up_global_sync(
    global char* bvh_mem,
    global uint* global_refit_startpoints,
    uniform uint nodeId,
    uniform ushort lane)
{
    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;

    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );

    // Get the node idx that was put here in phase1
    const uint innerNodeIdx = global_refit_startpoints[nodeId];

    // Get the qnode and backpointer
    uniform global struct QBVHNodeN* qnode = nodeData + innerNodeIdx;
    uint backPointer = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);

    varying struct AABB childrenAABB; // one child AABB per lane
    AABB_init(&childrenAABB);

    uniform uint numChildren = (backPointer >> 3) & 0x7;
    if(numChildren == 0) return;

    global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
    varying ushort child_idx = (lane < numChildren) ? lane : 0;
    childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

#if MORTON_VERBOSE_LOG
    if(lane == 0)
        printf("REFIT2: index: %d, child_idx: %d\n", innerNodeIdx, child_idx);
#endif

    struct AABB reduce_bounds = AABB_sub_group_reduce_N6( &childrenAABB );
    reduce_bounds = AABB_sub_group_shuffle( &reduce_bounds, 0 );

    subgroup_QBVHNodeN_setBounds(qnode, reduce_bounds, childrenAABB, numChildren, lane);

    uint children_mask = qnode_child[child_idx].instMask;
    qnode->instMask = sub_group_reduce_or_N6(children_mask);

    SUBGROUP_refit_bottom_up( qnode, bvh, reduce_bounds, numChildren, lane, 0 );
}

__attribute__( (reqd_work_group_size( 16, 1, 1 )) ) void kernel
parallel_build_phase2_refit( global char* bvh_mem,
                             global uint* global_refit_startpoints )
{
    refit_bottom_up_global_sync(bvh_mem, global_refit_startpoints, get_group_id(0), get_local_id(0));
}

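/*
    Backpointer layout as used by the refit code in this file (see BackPointer_GetParentIndex /
    BackPointer_GetNumChildren and the shifts below):
        bits [31:6] - index of the parent node
        bits [ 5:3] - total number of children
        bits [ 2:0] - number of children already refitted (incremented atomically during bottom-up refit)
*/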

GRL_INLINE void SUBGROUP_refit_bottom_up_global(
    uniform global struct QBVHNodeN* globalNodeData,
    uniform struct BackPointers* backPointers,
    varying ushort lane,
    varying uint curNodeIndex)
{
    uniform uint backpointer = *InnerNode_GetBackPointer(backPointers, curNodeIndex);

    const uint head_lane = 0;
    uniform struct AABB child_aabb; // this carries the reduced AABB between loop iterations

    while (curNodeIndex != 0)
    {
        global struct QBVHNodeN* qnode = globalNodeData + curNodeIndex;
        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( qnode );
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(qnode, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        qnode->instMask = sub_group_reduce_or_N6(childrenMask);

        uint parentIndex = BackPointer_GetParentIndex(backpointer);

        mem_fence_gpu_invalidate();

        if (lane == 0)
        {
            backpointer = atomic_inc_global((__global uint *)InnerNode_GetBackPointer(backPointers, parentIndex));

            uint globalBackpointer = (parentIndex << 6) | (numChildren << 3);

            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, curNodeIndex) = globalBackpointer;

#if MORTON_VERBOSE_LOG
            printf("BU_INNER: index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d, child_loc_idx: %d reduced_bounds: %f\n",
                curNodeIndex, curNodeIndex + qnode->offset, qnode->offset, backpointer >> 6, numChildren, child_idx, reduced_bounds.lower.x);
#endif
        }

        backpointer = 1 + intel_sub_group_shuffle(backpointer, head_lane);
        curNodeIndex = parentIndex;

        /* continue to the parent only when all of its children have been refitted */
        uniform uint numChildrenRefitted = (backpointer >> 0) & 0x7;
        uniform uint numChildrenTotal = (backpointer >> 3) & 0x7;

        if (numChildrenRefitted != numChildrenTotal)
            return;
    }

    // process root of the treelet
    {

#if MORTON_DEBUG_CHECKS
        if (curNodeIndex != 0) printf("SUBGROUP_refit_bottom_up_global: this should be node index 0\n");
#endif

        global struct QBVHNodeN* qnode_child = (global struct QBVHNodeN*)QBVHNodeN_childrenPointer( globalNodeData );
        uint numChildren = BackPointer_GetNumChildren(backpointer);

        varying ushort child_idx = (lane < numChildren) ? lane : 0;
        child_aabb = getAABB_QBVHNodeN( qnode_child + child_idx );

        struct AABB reduced_bounds = AABB_sub_group_reduce_N6(&child_aabb);
        reduced_bounds = AABB_sub_group_shuffle(&reduced_bounds, head_lane);

        /* get bounds of all children from child nodes directly */
        subgroup_QBVHNodeN_setBounds(globalNodeData, reduced_bounds, child_aabb, numChildren, lane);

        uchar childrenMask = qnode_child[child_idx].instMask;
        globalNodeData->instMask = sub_group_reduce_or_N6(childrenMask);

        /* reset refit counter for next refit */
        if (lane == 0)
        {
            /* set global back pointer */
            *InnerNode_GetBackPointer(backPointers, 0) = backpointer & (~7u);

#if MORTON_VERBOSE_LOG
            printf("BU_ROOT: curNodeIndex: %d, index: %d, first_child_id: %d, offset: %d, parent: %d, numChildren: %d\n",
                curNodeIndex, 0, 0 + globalNodeData->offset, globalNodeData->offset, backpointer >> 6, numChildren);
#endif
        }
    }
}

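/*
    Termination protocol of SUBGROUP_refit_bottom_up_global, worked example: a parent with
    numChildren == 3 starts with a refit counter of 0 in its backpointer. Each subgroup that
    finishes one of its children atomically increments the counter, so the three subgroups
    observe the post-increment values 1, 2 and 3. Only the subgroup that sees the counter
    reach numChildren (the last arriver) continues upward and refits the parent; the others
    return.
*/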

// TODO: Check why 512 wg size has worse performance than 256
__attribute__( (reqd_work_group_size( 512, 1, 1 )) )
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_build_phase2_refit_local( global struct Globals* globals,
                                   global char* bvh_mem,
                                   global struct MortonFlattenedBoxlessNode *boxless_nodes)
{
    // Number of nodes created in P0, to be refitted in this stage
    uint p0_created_num = globals->p0_created_num;

    // Return immediately if the host executed this kernel but there is nothing to do
    if(p0_created_num == 0)
        return;

    global struct BVHBase* bvh = (global struct BVHBase*)bvh_mem;
    BackPointers* backPointers = BVHBase_GetBackPointers( bvh );
    global struct QBVHNodeN* nodeData = BVHBase_nodeData( bvh );
    varying ushort lane = get_sub_group_local_id();

    // Hardcode SLM to max here as we do not know upfront how much mem will be needed
    local union UPerNodeData perNodeData[MORTON_BUILDER_P2_ELEMENTS_IN_SLM]; /* 16kb is max slm for 256 wg_size */

    // Number of allocated nodes in phase0 (p0_created_num + children)
    uint p0_allocated_num = globals->p0_allocated_num;

    // array that will keep 2x8 short indices
    varying uint sg_fatleaf_array = 0x0;
    uniform uint8_t sg_bu_startpoints_cnt = 0;

    // Determine if all the nodes allocated in phase0 fit into SLM.
    // There are two paths here:
    // 1. Copy all needed flattened nodes and bounding boxes to SLM and reuse the bottom-up local path,
    //    which does the refit and creates qnodes in the BVH.
    // 2. If the nodes do not fit into SLM, first create the qnodes in the BVH, then perform a bottom-up
    //    refit synchronized with global atomics. This is not performant; it is kept only as a guardrail.
    //    On the host side we fall back to the old separated refit path, with wg_size 8 and better EU reuse.
    if(p0_allocated_num < MORTON_BUILDER_P2_ELEMENTS_IN_SLM)
    {
        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
        {
            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
            uint current_id = boxless_node.binary_hierarchy_index >> 6;

            // The otherwise unused low 6 bits of binary_hierarchy_index carry the mask
            // of the children that are subtree roots
            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);

            if(lane == 0)
                perNodeData[current_id].boxlessNode = boxless_node;

            // When no children are subtree roots, we are done and skip to the next iteration
            if(children_root_mask == 0x0)
            {
                continue;
            }
            // When all children are subtree roots, put this node into sg_fatleaf_array
            else if(children_root_mask == 0x3F)
            {
                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
            }

            uniform global struct QBVHNodeN* qnode = nodeData + current_id;

            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;
            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);
            varying ushort child_idx = (lane < numChildren) ? lane : 0;

            varying struct AABB childrenAABB; // one child AABB per lane
            AABB_init(&childrenAABB);

            uint lead_child_global_id = current_id + lead_child_offset;

            uniform global struct QBVHNodeN* qnode_child = nodeData + lead_child_global_id;
            childrenAABB = getAABB_QBVHNodeN( qnode_child + child_idx );

            // Get only AABBs of children that are p1 subtree roots
            bool lane_active = boxless_node.binary_hierarchy_index & (1 << child_idx);
            if(lane_active)
            {
                uint child_global_id = lead_child_global_id + child_idx;
                perNodeData[child_global_id].box = childrenAABB;
                perNodeData[child_global_id].box.lower.w = as_float((uint)qnode_child->instMask);
            }

#if MORTON_VERBOSE_LOG
            if(lane == 0)
                printf("P2_LOCAL: ID: %d, lead_child_offset: %d, child_idx: %d, lane_active: %d, boxless_node >> 6: %d, perNodeData[ID].box = %f, qnode->offset: %d\n", ID, lead_child_offset, child_idx, lane_active, boxless_node.backPointer >> 6, perNodeData[ID].box.lower.x, qnode->offset);
#endif
        }

        work_group_barrier(CLK_LOCAL_MEM_FENCE);

        SUBGROUP_refit_bottom_up_local(nodeData, backPointers, 0, 0, lane, perNodeData, sg_fatleaf_array, sg_bu_startpoints_cnt);
    }
    else
    {
        for (uint ID = get_sub_group_id(); ID < p0_created_num; ID += get_num_sub_groups() )
        {
            MortonFlattenedBoxlessNode boxless_node = boxless_nodes[ID];
            uint current_id = boxless_node.binary_hierarchy_index >> 6;

            // The otherwise unused low 6 bits of binary_hierarchy_index carry the mask
            // of the children that are subtree roots
            uchar children_root_mask = (boxless_node.binary_hierarchy_index & 0x3F);
            uniform uint numChildren = (boxless_node.backPointer >> 3) & 0x7;

            uniform global struct QBVHNodeN* qnode = nodeData + current_id;
            uint nodeType = MortonFlattenedBoxlessNode_GetType(boxless_node);
            uint lead_child_offset = MortonFlattenedBoxlessNode_GetChildOffset(boxless_node);

            SUBGROUP_QBVHNodeN_setChildIncr1( qnode );
            if(lane == 0)
            {
                QBVH6Node_set_type( qnode, nodeType );
                qnode->offset = lead_child_offset;
            }

            // When no children are subtree roots, we are done and skip to the next iteration
            if(children_root_mask == 0x0)
            {
                continue;
            }
            // When all children are subtree roots, put this node into sg_fatleaf_array
            else if(children_root_mask == 0x3F)
            {
                set_2xSG_arr_first_write(sg_bu_startpoints_cnt++, &sg_fatleaf_array, current_id, lane);
            }

#if MORTON_VERBOSE_LOG
            if(lane == 0)
                printf("P2_GLOBAL: ID: %d, lead_child_offset: %d, numChildren: %d, parent: %d, qnode->offset: %d\n", ID, lead_child_offset, numChildren, boxless_node.backPointer >> 6, qnode->offset);
#endif
        }
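
        // Drain the collected subtree-root startpoints: each subgroup walks one treelet
        // bottom-up using the global-atomic backpointer protocol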
        while (sg_bu_startpoints_cnt > 0)
        {
            uint curNodeIndex = get_from_2xSG_arr(--sg_bu_startpoints_cnt, sg_fatleaf_array, lane);

            SUBGROUP_refit_bottom_up_global(nodeData, backPointers, lane, curNodeIndex);
        }
    }
}