//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "GRLGen12.h"

#include "bvh_build_refit.h"
#include "bvh_build_treelet_refit.h"


struct RefitScratch
{
    float lower[3];
    uint  mask;
    float upper[3];
    uint  _pad;
};

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
init_refit_scratch(
    global struct BVHBase* bvh,
    global struct RefitScratch* scratch )
{
    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);

    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
    {
        float4 v = (float4)(FLT_MAX, FLT_MAX, FLT_MAX, 0);
        store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 0, as_uint4(v) );
        store_uint4_L1WB_L3WB( (global uint4*) &scratch[tid], 1, as_uint4(v) );
    }
}

bool is_fat_leaf( InternalNode* curNode )
{
    return curNode->nodeType != BVH_INTERNAL_NODE; // TODO: Not sufficient for traversal shaders!! If TS is enabled, the child types need to be checked as well.
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
build_fatleaf_table(
    global struct BVHBase* bvh )
{
    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);

    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
    {
        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;

        if ( is_fat_leaf(curNode) )
        {
            uint offs = atomic_inc_global( &bvh->fatLeafCount );

            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
            uint bp = *InnerNode_GetBackPointer(backPointers, tid);

            LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+offs;
            leaf->backpointer = bp;
            leaf->inner_node_index = tid;
            leaf->leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
        }
    }
}
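
// Note on the encodings used by the fatleaf/inner-node tables in this file
// (inferred from how the fields are packed and unpacked below, not a formal spec):
//
//  * A backpointer packs the parent internal-node index in bits [31:6] and the
//    child count in bits [5:3]; a parent value of 0x03FFFFFF marks the root
//    (no parent), which is what the bottom-up loops test against.
//
//  * 'leaf_index' is the index of the node's first quad leaf: node offsets are
//    expressed in 64-byte blocks, so (BVH_ROOT_NODE_OFFSET/64) + tid + childOffset
//    is the first child's block index, and subtracting bvh->quadLeafStart rebases
//    it to the start of the quad-leaf array.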

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
build_fatleaf_table_new_update(
    global struct Globals *globals,
    global struct BVHBase* bvh )
{
    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);

    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
    {
        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;

        if ( is_fat_leaf(curNode) )
        {
            // This implementation uses the fatleaf table structure, but it is effectively a quad table.
            // An implementation that processes 2 fatleafs per SIMD lane while iterating over the children
            // was also tested, but performance was worse.
            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
            uint bp = *InnerNode_GetBackPointer(backPointers, tid);
            uint fatLeafTableStart = bvh->fatLeafTableStart;

            uint leaf_index = (BVH_ROOT_NODE_OFFSET/64) + tid + curNode->childOffset - bvh->quadLeafStart;
            uint numChildren = (bp >> 3) & 0x7;

            uint quad_leaf_table_index = leaf_index;

            // Check whether the children cross a 256-aligned work group boundary.
            // If so, move the entry to the offset after numQuads and push it into the leftovers part,
            // where fatleaves are stored every 8th position with additional padding.
            // This guarantees that the leftovers table never contains a fatleaf whose
            // children land in 2 separate work groups.

            uint prev_group = leaf_index & 255;
            uint next_group = (leaf_index + (numChildren - 1)) & 255;
            uint slm_pos = prev_group;
            bool is_leftover = prev_group > next_group;

            if(is_leftover)
            {
                LeafTableEntry* leafBase = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));
                uint numQuads_aligned_256 = (globals->numPrimitives + 255) & ~255;

                uint leftovers_offset = atomic_add_global( &bvh->quadLeftoversCountNewAtomicUpdate, 8 );

                for(uint i = 0; i < BVH_NODE_N6; i++)
                {
                    uint pos = (i < numChildren) ? i : 0;
                    LeafTableEntry* leaf_null = &leafBase[pos];
                    leaf_null->leaf_index = -1 << 3;
                }

                quad_leaf_table_index = numQuads_aligned_256 + leftovers_offset;
                slm_pos = leftovers_offset & 255;
            }

            LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * fatLeafTableStart + 12 * quad_leaf_table_index));

            for(uint i = 0; i < BVH_NODE_N6; i++)
            {
                uint pos = (i < numChildren) ? i : 0;
                LeafTableEntry* leafCur = &leaf[pos];
                leafCur->backpointer = bp;
                leafCur->inner_node_index = (tid << 8) | slm_pos;
                leafCur->leaf_index = (leaf_index << 3) | pos;
            }

            // Need to clean the unused area where we pad to 8 for leftovers
            if(is_leftover)
            {
                for(uint i = 1; i < 8; i++)
                {
                    uint pos = (i >= numChildren) ? i : 7;
                    LeafTableEntry* leafCur = &leaf[pos];
                    leafCur->leaf_index = -1 << 3;
                }
            }
        }
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1))) void kernel
build_innernode_table(
    global struct BVHBase* bvh )
{
    uint tid = get_local_id(0) + get_group_id(0)*get_local_size(0);

    if ( tid < BVHBase_GetNumInternalNodes(bvh) )
    {
        InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+tid;

        if ( !is_fat_leaf( curNode ) )
        {
            uint offs = atomic_inc_global( &bvh->innerCount );

            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
            uint bp = *InnerNode_GetBackPointer(backPointers, tid);

            InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh)+offs;
            inner->node_index_and_numchildren = (tid<<3) | ((bp>>3) & 7);
            inner->first_child = tid + curNode->childOffset;
        }
    }
}
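
// The per-quad table entries written by build_fatleaf_table_new_update are 12 bytes each
// and pack several fields (this matches how traverse_aabbs_new_update_func decodes them
// further below):
//
//   inner_node_index = (internal node index << 8) | slm_pos       (slm_pos < 256)
//   leaf_index       = (first quad-leaf index << 3) | child_slot  (child_slot < 8)
//
// Entries with leaf_index == (-1 << 3) are sentinels and are skipped by the update.
//
// Leftover example: leaf_index == 254 with numChildren == 4 gives prev_group == 254 and
// next_group == (254 + 3) & 255 == 1, so prev_group > next_group and the fatleaf is
// relocated to the 8-entry-padded leftover region, keeping all of its children inside a
// single 256-wide work group.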

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1))) void kernel
fixup_quad_table(
    global struct BVHBase* bvh )
{
    // This kernel is launched with 2 work groups that set the magic number for unused entries in
    // the fatleaf table: one work group for the last group of the first part, where quads are packed,
    // and a second one for the last group of the part where quads are stored padded.

    uint numQuads = BVHBase_GetNumQuads(bvh);
    uint numQuadLeftovers = bvh->quadLeftoversCountNewAtomicUpdate;
    uint numQuadLeftovers_aligned_256 = (numQuadLeftovers + 255) & ~255;

    uint numQuads_aligned_256 = (numQuads + 255) & ~255;
    uint quadOffsetEnd = numQuads_aligned_256 + get_group_id(0) * numQuadLeftovers_aligned_256;
    uint quadOffsetStart = quadOffsetEnd - 256;

    uint quads_number_last_group = (get_group_id(0) == 0) ? numQuads : numQuads_aligned_256 + numQuadLeftovers;

    uint leftovers = quadOffsetEnd - quads_number_last_group;

    uint tid = get_local_id(0) > (255 - leftovers) ? get_local_id(0) : 256 - leftovers;

    if(leftovers != 0)
    {
        LeafTableEntry* leafBvh = BVHBase_GetFatLeafTable(bvh);

        LeafTableEntry* leaf = &leafBvh[quadOffsetStart + tid];
        leaf->leaf_index = -1 << 3;
    }

    if(get_group_id(0) == 1 && get_local_id(0) == 0)
        bvh->quadTableSizeNewAtomicUpdate = quadOffsetEnd;
}


// updates one quad leaf and computes the BBOX containing it
GRL_INLINE void refit_bottom_child_quad_WB(
    global struct QuadLeaf* quad,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    struct AABB* childAABB)
{
    /* get the geomID and primID0/1 for both quad triangles */
    const uint geomID = PrimLeaf_GetGeoIndex(&quad->leafDesc);
    const uint primID0 = quad->primIndex0;
    const uint primID1 = primID0 + QuadLeaf_GetPrimIndexDelta(quad);
    ushort fourth_vert = 0;

    if (primID1 != primID0)
    {
        ushort packed_indices = QuadLeaf_GetSecondTriangleIndices(quad);
        fourth_vert = ((packed_indices & 0x0C) == 0x0C) ? 1 : fourth_vert;
        fourth_vert = ((packed_indices & 0x30) == 0x30) ? 2 : fourth_vert;
    }

    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc + geomID;

    uint4 indices = GRL_load_quad_indices(desc, primID0, primID1, fourth_vert);

    // read the indices of the 4 verts we want
    float3 vtx0, vtx1, vtx2, vtx3;
    GRL_load_quad_vertices(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices);

    childAABB->lower.xyz = min( min( vtx0, vtx1 ), min(vtx2, vtx3) );
    childAABB->upper.xyz = max( max( vtx0, vtx1 ), max(vtx2, vtx3) );

    float4 pack0 = (float4)( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
    float4 pack1 = (float4)( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
    float4 pack2 = (float4)( vtx2.z, vtx3.x, vtx3.y, vtx3.z );

    global uint4* dst_verts = (global uint4*) &(quad->v[0][0]);
    store_uint4_L1WB_L3WB( dst_verts, 0, as_uint4(pack0) );
    store_uint4_L1WB_L3WB( dst_verts, 1, as_uint4(pack1) );
    store_uint4_L1WB_L3WB( dst_verts, 2, as_uint4(pack2) );
}

inline uchar4 uchar4_shuffle_down( uchar4 v, uint offs )
{
    uint vi = as_uint(v);
    return as_uchar4(intel_sub_group_shuffle_down(vi, vi, offs));
}

inline uchar4 uchar4_broadcast( uchar4 v, uint offs )
{
    uint vi = as_uint(v);
    return as_uchar4(sub_group_broadcast(vi, offs));
}
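
// sg_InternalNode_setFields (below) writes a quantized internal node: child boxes are
// stored as 8-bit offsets relative to the node origin 'org', scaled by a per-axis
// power-of-two exponent. Roughly (as implied by the math below, not a HW spec):
//
//     child.lower ~= org + lower_uchar * 2^(exp - 8)
//     child.upper ~= org + upper_uchar * 2^(exp - 8)
//
// 'exp' is derived from frexp() of the node extent (bumped by one when the mantissa
// exceeds QUANT_MAX_MANT); lower coordinates are rounded toward -inf and upper
// coordinates toward +inf, so the quantized box always contains the exact one.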
GRL_INLINE void sg_InternalNode_setFields(
    struct InternalNode* node,
    struct AABB reduced_aabb,
    const int offset, const uint nodeType, struct AABB* input_aabb,
    const uint numChildren, const uchar nodeMask )
{
    const float up = 1.0f + ulp;
    const float down = 1.0f - ulp;

    struct AABB conservative_aabb = conservativeAABB(&reduced_aabb);
    const float3 org = conservative_aabb.lower.xyz;

    const float3 len = AABB_size(&conservative_aabb).xyz * up;
    int3 exp;
    const float3 mant = frexp_vec3(len, &exp);
    exp += (mant > (float3)QUANT_MAX_MANT ? (int3)1 : (int3)0);

    uchar4 lower_uchar = 0x80;
    uchar4 upper_uchar = 0;

    ushort lane = get_sub_group_local_id();
    ushort simd8_id = lane/8;
    ushort logical_lane = lane%8;

    if( logical_lane < numChildren )
    {
        struct AABB child_aabb = conservativeAABB( input_aabb ); // conservative ???

        float3 lower = floor( bitShiftLdexp3( (child_aabb.lower.xyz - org) * down, -exp + 8 ) );
        lower = clamp( lower, (float)(QUANT_MIN), (float)(QUANT_MAX) );
        float3 upper = ceil( bitShiftLdexp3( (child_aabb.upper.xyz - org) * up, -exp + 8 ) );
        upper = clamp( upper, (float)(QUANT_MIN), (float)(QUANT_MAX) );
        lower_uchar.xyz = convert_uchar3_rtn( lower );
        upper_uchar.xyz = convert_uchar3_rtp( upper );
    }

    uchar4 lo0 = lower_uchar;
    uchar4 lo1 = uchar4_shuffle_down( lower_uchar, 1 );
    uchar4 lo2 = uchar4_shuffle_down( lower_uchar, 2 );
    uchar4 lo3 = uchar4_shuffle_down( lower_uchar, 3 );
    uchar4 lo4 = uchar4_shuffle_down( lower_uchar, 4 );
    uchar4 lo5 = uchar4_shuffle_down( lower_uchar, 5 );

    uchar4 hi0 = upper_uchar;
    uchar4 hi1 = uchar4_shuffle_down( upper_uchar, 1 );
    uchar4 hi2 = uchar4_shuffle_down( upper_uchar, 2 );
    uchar4 hi3 = uchar4_shuffle_down( upper_uchar, 3 );
    uchar4 hi4 = uchar4_shuffle_down( upper_uchar, 4 );
    uchar4 hi5 = uchar4_shuffle_down( upper_uchar, 5 );

    if( logical_lane == 0 )
    {
        uchar childBlockStride = 0x01 + (uint)(nodeType == NODE_TYPE_INSTANCE);

        uint4 block0 = (uint4)(as_uint(org.x), as_uint(org.y), as_uint(org.z), offset);

        char3 exp_char = (char3)(exp.x, exp.y, exp.z);

        uint4 block1 = (uint4)(
            as_uint((uchar4)(nodeType, 0 /* padding */, exp_char.x, exp_char.y)),
            as_uint((uchar4)(exp_char.z, nodeMask, childBlockStride, childBlockStride)),
            as_uint((uchar4)(childBlockStride, childBlockStride, childBlockStride, childBlockStride)),
            as_uint((uchar4)(lo0.x, lo1.x, lo2.x, lo3.x))
            );

        uint4 block2 = (uint4)(
            as_uint((uchar4)(lo4.x, lo5.x, hi0.x, hi1.x)),
            as_uint((uchar4)(hi2.x, hi3.x, hi4.x, hi5.x)),
            as_uint((uchar4)(lo0.y, lo1.y, lo2.y, lo3.y)),
            as_uint((uchar4)(lo4.y, lo5.y, hi0.y, hi1.y))
            );

        uint4 block3 = (uint4)(
            as_uint((uchar4)(hi2.y, hi3.y, hi4.y, hi5.y)),
            as_uint((uchar4)(lo0.z, lo1.z, lo2.z, lo3.z)),
            as_uint((uchar4)(lo4.z, lo5.z, hi0.z, hi1.z)),
            as_uint((uchar4)(hi2.z, hi3.z, hi4.z, hi5.z))
            );

        global uint4* pNode = (global uint4*)node;

#if 0
        printf(
            "block0 = %08x,%08x,%08x,%08x  %08x,%08x,%08x,%08x \n"
            "block1 = %08x,%08x,%08x,%08x  %08x,%08x,%08x,%08x \n"
            "block2 = %08x,%08x,%08x,%08x  %08x,%08x,%08x,%08x \n"
            "block3 = %08x,%08x,%08x,%08x  %08x,%08x,%08x,%08x \n",
            block0.x, block0.y, block0.z, block0.w,
            pNode[0].x, pNode[0].y, pNode[0].z, pNode[0].w,
            block1.x, block1.y, block1.z, block1.w,
            pNode[1].x, pNode[1].y, pNode[1].z, pNode[1].w,
            block2.x, block2.y, block2.z, block2.w,
            pNode[2].x, pNode[2].y, pNode[2].z, pNode[2].w,
            block3.x, block3.y, block3.z, block3.w,
            pNode[3].x, pNode[3].y, pNode[3].z, pNode[3].w );
#endif

        store_uint4_L1WB_L3WB( pNode, 0, block0 );
        store_uint4_L1WB_L3WB( pNode, 1, block1 );
        store_uint4_L1WB_L3WB( pNode, 2, block2 );
        store_uint4_L1WB_L3WB( pNode, 3, block3 );
    }
}
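
// Processing model for the refit kernels below (as set up by the SIMD8_* arithmetic in
// each kernel): sub-groups are split into 8-lane "SIMD8" slices and each slice owns one
// node. Lanes 0..numChildren-1 of a slice refit one child each (a node has up to 6
// children, BVH_NODE_N6), the slice-wide box is produced with AABB_sub_group_reduce_N6
// and redistributed with AABB_sub_group_broadcast, and logical lane 0 of every slice
// then writes its node via sg_InternalNode_setFields.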
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
void kernel
traverse_aabbs_quad(
    global struct BVHBase* bvh,
    global struct RefitScratch* scratch,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc
    )
{
    uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
    varying ushort lane = get_sub_group_local_id();

    uniform uint num_leaves = bvh->fatLeafCount;

    local struct RefitScratch local_scratch[256];
    if( get_local_id(0) < min(num_nodes, 256u) )
    {
        for( uint i=0; i<3; i++ ){
            local_scratch[get_local_id(0)].lower[i] = FLT_MAX;
            local_scratch[get_local_id(0)].upper[i] = FLT_MAX;
        }
    }

    barrier( CLK_LOCAL_MEM_FENCE );


    ushort SIMD8_PER_SG = get_sub_group_size()/8;
    ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
    ushort simd8_local_id = get_sub_group_local_id()/8;
    ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
    ushort logical_lane = lane%8;

    uniform uint fatleaf_index = simd8_id + get_group_id(0)*SIMD8_PER_WG;


    if ( fatleaf_index < num_leaves )
    {
        LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+fatleaf_index;
        uint innerNodeIdx = leaf->inner_node_index;
        uint bp = leaf->backpointer;
        uint leaf_index = leaf->leaf_index;

        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
        varying QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;

        uint childOffs = (((char*)quad) - ((char*)curNode))/64;

        varying struct AABB childrenBox;
        AABB_init(&childrenBox);

        uint numChildren = (bp >> 3) & 0x7;
        if (logical_lane < numChildren)
        {
            refit_bottom_child_quad_WB(
                (global struct QuadLeaf*) &quad[logical_lane],
                geomDesc,
                &childrenBox );
        }

        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0, 0);
        for (uint i = 1; i < SIMD8_PER_SG; i++)
        {
            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
        }

        sg_InternalNode_setFields(
            curNode,
            reduce_bounds,
            childOffs,
            NODE_TYPE_QUAD,
            &childrenBox,
            numChildren,
            0xff );
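
        // Bottom-up propagation below: each lane publishes one float of the node box
        // into 'scratch' and then walks the backpointer chain toward the root.
        // Ancestors with index >= 256 are accumulated directly in global scratch, while
        // the first 256 nodes (which include the root) are accumulated in local_scratch
        // and flushed to global scratch at the end of the kernel. Storing upper.xyz
        // negated lets a single atomic_min implement both the min over lower and the
        // max over upper.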

        // atomic min operation vectorized across 6 lanes
        //   [ lower.xyz ][-][ upper.xyz ][-]
        //
        // Lanes 3 and 7 are inactive. 'upper' is negated
        bool atomic_mask = (1<<logical_lane) & 0x77;

        uint lmod = logical_lane % 4;
        uint ldiv = logical_lane / 4;
        float vlo = reduce_bounds.lower.x;
        float vhi = reduce_bounds.upper.x;
        vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
        vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
        vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
        vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
        float v = (ldiv == 0) ? vlo : -vhi;


        global float* pv = (global float*) &scratch[innerNodeIdx];

        store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));

        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
        uint parent = (bp >> 6);

        // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= 256
        if(atomic_mask && parent != 0x03FFFFFF)
        {
            while( parent >= 256 )
            {
                innerNodeIdx = parent;
                bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
                atomic_min( ((global float*) &(scratch[innerNodeIdx]))+logical_lane, v );
                parent = bp >> 6;
            }
            while( parent != 0x03FFFFFF )
            {
                innerNodeIdx = parent;
                bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
                atomic_min( ((local float*) &(local_scratch[innerNodeIdx]))+logical_lane, v );
                parent = bp >> 6;
            }
        }
    }


    barrier( CLK_LOCAL_MEM_FENCE );
    num_nodes = min(num_nodes, 256u);

    local float* in = (local float*)&local_scratch[0];
    global float* out = (global float*)&scratch[0];

    for (uint i = get_local_id(0); i < num_nodes*6; i += 256 )
    {
        // since we want to save [ lower.xyz ][-][ upper.xyz ][-], i.e. 0,1,2, 4,5,6 etc., we need to offset +1 for every triplet
        uint idx = i + (i/3);

        float v = in[idx];
        if( v != FLT_MAX )
            atomic_min( out + idx, v );
    }
}
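
// traverse_aabbs_quad above refits the fatleaf nodes in place and leaves per-node bounds
// (lower.xyz and negated upper.xyz) in 'scratch'; write_inner_nodes below consumes that
// scratch to requantize the remaining inner nodes and to write the final scene bounds
// into bvh->Meta.bounds.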
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(64, 1, 1)))
void kernel
write_inner_nodes(
    global struct BVHBase* bvh,
    global struct RefitScratch* scratch
    )
{
    uint SIMD8_PER_SG = get_sub_group_size()/8;
    uniform uint node_id = SIMD8_PER_SG * get_sub_group_global_id() + (get_sub_group_local_id()/8);
    varying ushort lane = get_sub_group_local_id() % 8;
    varying uint num_inners = bvh->innerCount;

    if ( node_id < num_inners )
    {
        InnerNodeTableEntry* entry = BVHBase_GetInnerNodeTable(bvh) + node_id;
        uint node_index = entry->node_index_and_numchildren >> 3;
        uint numChildren = entry->node_index_and_numchildren & 7;
        uint first_child = entry->first_child;

        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+node_index;

        varying struct AABB childAABB;
        AABB_init(&childAABB);

        if( lane < numChildren )
        {
            uint child = first_child + lane;
            childAABB.lower.x = scratch[child].lower[0];
            childAABB.lower.y = scratch[child].lower[1];
            childAABB.lower.z = scratch[child].lower[2];
            childAABB.upper.x = -scratch[child].upper[0];
            childAABB.upper.y = -scratch[child].upper[1];
            childAABB.upper.z = -scratch[child].upper[2];
        }

        varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0, 0);
        for (uint i = 1; i < SIMD8_PER_SG; i++)
        {
            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
            int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
        }

        sg_InternalNode_setFields(
            curNode,
            reduce_bounds,
            first_child - node_index,
            NODE_TYPE_INTERNAL,
            &childAABB,
            numChildren,
            0xff );
    }

    if (node_id == 0 && lane == 0 )
    {
        bvh->Meta.bounds.lower[0] = scratch[0].lower[0];
        bvh->Meta.bounds.lower[1] = scratch[0].lower[1];
        bvh->Meta.bounds.lower[2] = scratch[0].lower[2];
        bvh->Meta.bounds.upper[0] = -scratch[0].upper[0];
        bvh->Meta.bounds.upper[1] = -scratch[0].upper[1];
        bvh->Meta.bounds.upper[2] = -scratch[0].upper[2];
    }
}



#if 1
#define SLM_BOX_COUNT 1024

struct AABB load_box( uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
{
    if( place < SLM_BOX_COUNT )
        return local_boxes[place];
    else
        return extra_boxes[place - SLM_BOX_COUNT];
}

void store_box( struct AABB box, uint place, local struct AABB* local_boxes, global struct AABB* extra_boxes )
{
    if (place < SLM_BOX_COUNT)
    {
        local_boxes[place] = box;
    }
    else
    {
        global uint4* ptr = (global uint4*)&extra_boxes[place - SLM_BOX_COUNT];
        store_uint4_L1WB_L3WB( ptr, 0, as_uint4(box.lower) );
        store_uint4_L1WB_L3WB( ptr+1, 0, as_uint4(box.upper) );
    }
}
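
// update_single_group_quads below performs the whole refit in a single work group:
// per-node boxes for the first SLM_BOX_COUNT (1024) internal nodes are kept in SLM
// (32KB of local_boxes), and any overflow is spilled to the caller-provided
// 'extra_boxes' buffer in global memory via load_box/store_box above. Upper bounds
// are again stored negated so atomic_min can be used for the bottom-up pass.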

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(512, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16)))
void kernel
update_single_group_quads(
    global struct BVHBase* bvh,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    global struct AABB* extra_boxes
)
{
    uniform uint tid = get_sub_group_global_id();
    uniform uint num_nodes = BVHBase_GetNumInternalNodes(bvh);
    uniform uint num_leaves = bvh->fatLeafCount;
    uniform uint num_inners = bvh->innerCount;

    varying ushort lane = get_sub_group_local_id();

    local struct AABB local_boxes[SLM_BOX_COUNT]; // == 32KB

    // initialize nodes
    for (uint i = get_local_id(0); i < num_nodes; i += get_local_size(0))
    {
        struct AABB tmp;
        AABB_init(&tmp);
        tmp.upper = -tmp.upper;
        store_box( tmp, i, local_boxes, extra_boxes );
    }


    if( num_nodes > SLM_BOX_COUNT )
        mem_fence_workgroup_default();

    barrier( CLK_LOCAL_MEM_FENCE );


    ushort SIMD8_PER_SG = get_sub_group_size()/8;
    ushort NUM_SIMD8 = get_num_sub_groups()*SIMD8_PER_SG;
    ushort simd8_local_id = get_sub_group_local_id()/8;
    ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
    ushort logical_lane = lane%8;


    for ( uint i = simd8_id; i < num_leaves; i += NUM_SIMD8 )
    {
        LeafTableEntry* leaf = BVHBase_GetFatLeafTable(bvh)+i;
        uint innerNodeIdx = leaf->inner_node_index;
        uint bp = leaf->backpointer;
        uint leaf_index = leaf->leaf_index;

        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh)+innerNodeIdx;
        QuadLeaf* quad = BVHBase_GetQuadLeaves(bvh) + leaf_index;

        uint childOffs = (((char*)quad) - ((char*)curNode))/64;

        varying struct AABB childrenBox;
        AABB_init(&childrenBox);

        uint numChildren = (bp >> 3) & 0x7;
        if (logical_lane < numChildren)
        {
            refit_bottom_child_quad_WB(
                (global struct QuadLeaf*) &quad[logical_lane],
                geomDesc,
                &childrenBox );
        }

        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox);
        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0, 0);
        for (uint i = 1; i < SIMD8_PER_SG; i++)
        {
            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
        }


        if( logical_lane == 0 )
        {
            struct AABB negated = reduce_bounds;
            negated.upper = -negated.upper;
            store_box( negated, innerNodeIdx, local_boxes, extra_boxes );
        }

        sg_InternalNode_setFields(
            curNode,
            reduce_bounds,
            childOffs,
            NODE_TYPE_QUAD,
            &childrenBox,
            numChildren,
            0xff );


        // atomic min operation vectorized across 6 lanes
        //   [ lower.xyz ][-][ upper.xyz ][-]
        //
        // Lanes 3 and 7 are inactive. 'upper' is negated
        uint lmod = logical_lane % 4;
        uint ldiv = logical_lane / 4;
        float vlo = reduce_bounds.lower.x;
        float vhi = reduce_bounds.upper.x;
        vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
        vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
        vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
        vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
        float v = (ldiv == 0) ? vlo : -vhi;
        bool atomic_mask = (1<<logical_lane) & 0x77;

        BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
        uint parent = (bp >> 6);

        // check for parent != 0x03FFFFFF once to be sure we don't enter parent >= SLM_BOX_COUNT
        if(atomic_mask && parent != 0x03FFFFFF)
        {
            while( parent >= SLM_BOX_COUNT )
            {
                innerNodeIdx = parent;
                bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
                atomic_min( ((global float*) &(extra_boxes[innerNodeIdx - SLM_BOX_COUNT]))+logical_lane, v );
                parent = bp >> 6;
            }
            while( parent != 0x03FFFFFF )
            {
                innerNodeIdx = parent;
                bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
                atomic_min( ((local float*) &(local_boxes[innerNodeIdx]))+logical_lane, v );
                parent = bp >> 6;
            }
        }
    }

    if( num_nodes > SLM_BOX_COUNT )
        mem_fence_workgroup_default();

    barrier( CLK_LOCAL_MEM_FENCE );

    for ( uint i = simd8_id; i < num_inners; i += NUM_SIMD8 )
    {
        InnerNodeTableEntry* inner = BVHBase_GetInnerNodeTable(bvh) + i;
        uint node_index = inner->node_index_and_numchildren >> 3;
        uint numChildren = inner->node_index_and_numchildren & 7;
        uint first_child = inner->first_child;

        varying InternalNode* curNode = BVHBase_GetInternalNodes(bvh) + node_index;

        //if (curNode->nodeType == BVH_INTERNAL_NODE) // TODO: Needs updating for traversal shaders
        {   // TODO: Consider using an inner node table or UC load to avoid polluting LSC with these reads
            uint child = first_child + logical_lane;

            bool child_valid = (logical_lane < numChildren);

            struct AABB childAABB;
            AABB_init(&childAABB);
            if (child_valid)
            {
                childAABB = load_box( child, local_boxes, extra_boxes );
                childAABB.upper = -childAABB.upper;
            }

            varying struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childAABB);
            struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0, 0);
            for (uint i = 1; i < SIMD8_PER_SG; i++)
            {
                struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
                int3 is_upper_lane = ((uint3)(i)) == (get_sub_group_local_id()/8);
                reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
                reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
            }

            sg_InternalNode_setFields(
                curNode,
                reduce_bounds,
                first_child - node_index,
                NODE_TYPE_INTERNAL,
                &childAABB,
                numChildren,
                0xff );
        }
    }
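
    // At this point local_boxes[0] holds the refitted box of the root node (with the
    // upper bounds negated); copy it into the BVH metadata as the scene bounds.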
    if (get_sub_group_id() == 0 && lane == 0 )
    {
        bvh->Meta.bounds.lower[0] = local_boxes[0].lower.x;
        bvh->Meta.bounds.lower[1] = local_boxes[0].lower.y;
        bvh->Meta.bounds.lower[2] = local_boxes[0].lower.z;
        bvh->Meta.bounds.upper[0] = -local_boxes[0].upper.x;
        bvh->Meta.bounds.upper[1] = -local_boxes[0].upper.y;
        bvh->Meta.bounds.upper[2] = -local_boxes[0].upper.z;
    }

}
#endif

GRL_INLINE void traverse_aabbs_new_update_func(
    global struct BVHBase* bvh,
    global char* vertices,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    global struct RefitScratch* scratch,
    uint vertex_format,
    local struct AABB3f* children_AABBs,
    local uint* num_fat_leaves,
    local struct LeafTableEntry* leafTable_local,
    const bool single_geo
    )
{
    // The first part of the kernel, with the vertex loads/stores, is executed with one quad per work item,
    // using the previously prepared QuadDataIndices to get the quad data and vertex indices.
    // The second part of the kernel, which does the reduction, updates the fatleaf in the bvh and runs
    // the bottom-up pass, is executed per SIMD.
    // For the bottom-up pass a variant with a local part (using local scratch) was also tested, but since
    // there is not enough SLM, additional barriers were needed to clean and reuse the SLM, which currently
    // kills performance. Could be worth revisiting on future gens.

    varying uint lid = get_local_id(0);
    varying uint tid = lid + get_group_id(0)*get_local_size(0);

    num_fat_leaves[0] = 0;
    leafTable_local[lid].leaf_index = -1 << 3;

    LeafTableEntry* leaf = (LeafTableEntry*)(((char*)bvh) + (64u * bvh->fatLeafTableStart + 12 * tid));
    uint innerNodeIdx_mem = leaf->inner_node_index;
    uint bp = leaf->backpointer;
    uint leaf_index_mem = leaf->leaf_index;

    uint numChildren = (bp >> 3) & 0x7;

    uint leaf_index = leaf_index_mem >> 3;
    uint slm_child_offset = leaf_index_mem & 0x7;

    uint innerNodeIdx = innerNodeIdx_mem >> 8;
    uint slm_pos_main = innerNodeIdx_mem & 0xFF;

    uint first_el_of_group = get_group_id(0)*get_local_size(0);
    uint quadsNum = BVHBase_GetNumQuads(bvh);
    uint expected_tid = first_el_of_group < quadsNum ? first_el_of_group : quadsNum - 1;

    // Skip writes when not all children for a single fatleaf are present in this work group
    bool skip_tid = leaf_index == 0x1FFFFFFF;
    leaf_index = skip_tid ? expected_tid : leaf_index;

    // Compute bounding box for quads
    varying struct AABB3f childrenBox;

    tid = leaf_index + slm_child_offset;
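
    // QuadDataIndices entries (as consumed here) are 32 bytes per quad: four header
    // dwords that are copied verbatim as the first 16 bytes of the QuadLeaf, and four
    // vertex indices used to refetch the positions; the low 24 bits of header_data[0]
    // carry the geomID for the multi-geometry path.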
    // Read vertex indices and quad header from separate buffer
    uint quadIndicesStart = bvh->quadIndicesDataStart;
    varying struct QuadDataIndices* vertex_indice_ptr = (QuadDataIndices*)(((char*)bvh) + (64u * quadIndicesStart + 32 * tid));
    QuadDataIndices vertexMap = vertex_indice_ptr[0];

    varying global uint4* bounds = (global uint4*)((char*)bvh + (64*bvh->quadLeafStart + 64*tid) );
    uint4 quad_data = (uint4)(vertexMap.header_data[0], vertexMap.header_data[1], vertexMap.header_data[2], vertexMap.header_data[3]);
    uint4 indices = (uint4)(vertexMap.vert_idx[0], vertexMap.vert_idx[1], vertexMap.vert_idx[2], vertexMap.vert_idx[3]);

    global GRL_RAYTRACING_GEOMETRY_DESC* desc = geomDesc;

    if(!single_geo)
    {
        uint geomID = vertexMap.header_data[0] & 0xFFFFFF;
        desc += geomID;
        vertices = (global char*)desc->Desc.Triangles.pVertexBuffer;
        vertex_format = desc->Desc.Triangles.VertexFormat;
    }

    float3 vtx0, vtx1, vtx2, vtx3;
    GRL_load_quad_vertices_no_stride(desc, &vtx0, &vtx1, &vtx2, &vtx3, indices, vertex_format, vertices);

    for(uint i = 0; i < 3; i++)
        childrenBox.lower[i] = min( min( vtx0[i], vtx1[i] ), min(vtx2[i], vtx3[i]) );

    for(uint i = 0; i < 3; i++)
        childrenBox.upper[i] = max( max( vtx0[i], vtx1[i] ), max(vtx2[i], vtx3[i]) );

    float4 pack0 = (float4)( vtx0.x, vtx0.y, vtx0.z, vtx1.x );
    float4 pack1 = (float4)( vtx1.y, vtx1.z, vtx2.x, vtx2.y );
    float4 pack2 = (float4)( vtx2.z, vtx3.x, vtx3.y, vtx3.z );

    // Store quad data in bvh
    // Make sure this goes without partial writes to get best perf
    store_uint4_L1WB_L3WB( bounds, 0, quad_data );
    store_uint4_L1WB_L3WB( bounds, 1, as_uint4(pack0) );
    store_uint4_L1WB_L3WB( bounds, 2, as_uint4(pack1) );
    store_uint4_L1WB_L3WB( bounds, 3, as_uint4(pack2) );

    barrier( CLK_LOCAL_MEM_FENCE );

    struct AABB reduce_bounds;

    if(!skip_tid)
    {
        // Store AABB in SLM, to be used later for children quantization in fatleaf
        children_AABBs[slm_pos_main + slm_child_offset] = childrenBox;

        if(slm_child_offset == 0)
        {
            uint offset = atomic_inc_local(&num_fat_leaves[0]);
            leafTable_local[offset].inner_node_index = innerNodeIdx_mem;
            leafTable_local[offset].backpointer = bp;
            leafTable_local[offset].leaf_index = leaf_index_mem;
        }
    }

    barrier( CLK_LOCAL_MEM_FENCE );
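
    // Second phase: each SIMD8 slice picks one of the fatleaf entries that were
    // compacted into leafTable_local above (one entry per fatleaf, published by the
    // lane holding child 0), re-reads the children boxes from children_AABBs, and
    // requantizes the fatleaf node before propagating its bounds bottom-up.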

    varying ushort lane = get_sub_group_local_id();
    ushort SIMD8_PER_SG = get_sub_group_size()/8;
    ushort SIMD8_PER_WG = get_num_sub_groups()*SIMD8_PER_SG;
    ushort simd8_local_id = get_sub_group_local_id()/8;
    ushort simd8_id = get_sub_group_id()*SIMD8_PER_SG + simd8_local_id;
    ushort logical_lane = lane%8;

    uint fatleaves_aligned_32 = (num_fat_leaves[0] + 31) & ~31;

    for(uint offset = 0; offset < fatleaves_aligned_32; offset += 32)
    {
        uniform uint fatleaf_index = simd8_id + offset;
        uint innerNodeIdx_mem = leafTable_local[fatleaf_index].inner_node_index;
        uint bp = leafTable_local[fatleaf_index].backpointer;
        uint leaf_index_mem = leafTable_local[fatleaf_index].leaf_index;

        uint numChildren = (bp >> 3) & 0x7;

        uint leaf_index = leaf_index_mem >> 3;
        uint slm_child_offset = leaf_index_mem & 0x7;

        uint innerNodeIdx = innerNodeIdx_mem >> 8;
        uint slm_pos_main = innerNodeIdx_mem & 0xFF;

        bool skip_tid = leaf_index == 0x1FFFFFFF;
        bool active_lane = (logical_lane < numChildren);
        uint lane_children = active_lane ? logical_lane : 0;

        fatleaf_index = leaf_index;

        varying InternalNode* curNode = (InternalNode*)(((char*)bvh) + (BVH_ROOT_NODE_OFFSET + 64 * innerNodeIdx));

        global struct Quad *quads = (global struct Quad *)((char*)bvh + 64*bvh->quadLeafStart );

        varying struct AABB childrenBox_bu;
        AABB_init(&childrenBox_bu);

        if(!skip_tid)
            childrenBox_bu = AABBfromAABB3f(children_AABBs[slm_pos_main + lane_children]);

        struct AABB reduce_bounds0 = AABB_sub_group_reduce_N6(&childrenBox_bu);
        struct AABB reduce_bounds = AABB_sub_group_broadcast(&reduce_bounds0, 0);

        for (uint i = 1; i < SIMD8_PER_SG; i++)
        {
            struct AABB reduce_bounds1 = AABB_sub_group_broadcast(&reduce_bounds0, 8*i);
            int3 is_upper_lane = ((uint3)(i)) == simd8_local_id;
            reduce_bounds.lower.xyz = select( reduce_bounds.lower.xyz, reduce_bounds1.lower.xyz, is_upper_lane );
            reduce_bounds.upper.xyz = select( reduce_bounds.upper.xyz, reduce_bounds1.upper.xyz, is_upper_lane );
        }

        if(!skip_tid)
        {
            uint quad_offset = 64u * bvh->quadLeafStart + 64 * fatleaf_index;
            varying QuadLeaf* quad = (QuadLeaf*)(((char*)bvh) + quad_offset);
            uint childOffs = (((char*)quad) - ((char*)curNode))/64;

            sg_InternalNode_setFields(
                curNode,
                reduce_bounds,
                childOffs,
                NODE_TYPE_QUAD,
                &childrenBox_bu,
                numChildren,
                0xff );

            bool atomic_mask = (1<<logical_lane) & 0x77;

            uint lmod = logical_lane % 4;
            uint ldiv = logical_lane / 4;
            float vlo = reduce_bounds.lower.x;
            float vhi = reduce_bounds.upper.x;
            vlo = (lmod == 1) ? reduce_bounds.lower.y : vlo;
            vhi = (lmod == 1) ? reduce_bounds.upper.y : vhi;
            vlo = (lmod == 2) ? reduce_bounds.lower.z : vlo;
            vhi = (lmod == 2) ? reduce_bounds.upper.z : vhi;
            float v = (ldiv == 0) ? vlo : -vhi;

            global float* pv = (global float*) &scratch[innerNodeIdx];

            store_uint_L1WB_L3WB( (global uint*)(pv+logical_lane), 0, as_uint(v));
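
            // Bottom-up walk with an early-out: unlike traverse_aabbs_quad, the parent
            // value in scratch is read first and the atomic_min is only issued (and the
            // walk only continued) while the published value can still shrink the
            // ancestor's entry; once an ancestor already covers 'v' the chain stops.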
            BackPointers* backPointers = BVHBase_GetBackPointers(bvh);
            uint parent = (bp >> 6);

            global float* parent_v = (global float*) &(scratch[parent]) + logical_lane;

            if(atomic_mask && (*parent_v >= v) && (parent != 0x03FFFFFF))
            {
                innerNodeIdx = parent;
                bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);
                atomic_min( parent_v, v );
                parent = bp >> 6;

                if(parent != 0x03FFFFFF)
                {
                    while( parent != 0x03FFFFFF )
                    {
                        innerNodeIdx = parent;
                        bp = *InnerNode_GetBackPointer(backPointers, innerNodeIdx);

                        global float* parent_v_global = (global float*) &(scratch[innerNodeIdx]) + logical_lane;
                        if(*parent_v_global >= v)
                            atomic_min( parent_v_global, v );
                        else
                            break;

                        parent = bp >> 6;
                    }
                }
            }
        }
    }
}

GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__( (intel_reqd_sub_group_size( 16 )) )
void kernel
traverse_aabbs_new_update(
    global struct BVHBase* bvh,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    global struct RefitScratch* scratch
    )
{
    varying uint lid = get_local_id(0);
    varying uint tid = lid + get_group_id(0)*get_local_size(0);

    local struct AABB3f children_AABBs[256];
    local struct LeafTableEntry leafTable_local[256];
    local uint num_fat_leaves;

    traverse_aabbs_new_update_func(bvh, (global char*)geomDesc /* not used */, geomDesc, scratch, (uint)-1 /* not used */,
                                   &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], false);
}
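
// The single-geometry kernel below receives the vertex format as a kernel argument and
// calls traverse_aabbs_new_update_func with a compile-time-constant format for every
// supported value, so the vertex fetch in GRL_load_quad_vertices_no_stride can be
// specialized per format; the generic kernel above instead reads the format (and the
// vertex buffer) per geometry inside the function.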
GRL_ANNOTATE_IGC_DO_NOT_SPILL
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__( (intel_reqd_sub_group_size( 16 )) )
void kernel
traverse_aabbs_new_update_single_geo(
    global struct BVHBase* bvh,
    global char* vertices,
    global GRL_RAYTRACING_GEOMETRY_DESC* geomDesc,
    global struct RefitScratch* scratch,
    const uint vertex_format
    )
{
    varying uint lid = get_local_id(0);
    varying uint tid = lid + get_group_id(0)*get_local_size(0);

    local struct AABB3f children_AABBs[256];
    local struct LeafTableEntry leafTable_local[256];
    local uint num_fat_leaves;

    if(vertex_format == VERTEX_FORMAT_R32G32B32_FLOAT)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32B32_FLOAT,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R32G32_FLOAT)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R32G32_FLOAT,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_FLOAT)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_FLOAT,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16_FLOAT)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_FLOAT,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_SNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_SNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16_SNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_SNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16B16A16_UNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16B16A16_UNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R16G16_UNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R16G16_UNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R10G10B10A2_UNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R10G10B10A2_UNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_UNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_UNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R8G8_UNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_UNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R8G8B8A8_SNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8B8A8_SNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else if(vertex_format == VERTEX_FORMAT_R8G8_SNORM)
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, VERTEX_FORMAT_R8G8_SNORM,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
    else
        traverse_aabbs_new_update_func(bvh, vertices, geomDesc, scratch, (uint)-1,
                                       &children_AABBs[0], &num_fat_leaves, &leafTable_local[0], true);
}