//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "input_client_structs.h"
#include "common.h"
#include "instance.h"

#define DBG(x)
#define ENABLE_CHECKS 0

/*

  This kernel implements an exclusive scan addition operation. The
  implementation currently uses only a single DSS. A sequential
  reference sketch of the scan semantics follows after the two-phase
  variant below.

 */
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add(global uint *input,
                            global uint *output,
                            const uint N)
{
    const uint j = get_local_id(0);
    const uint J = get_local_size(0);
    const uint BLOCKSIZE = (N + J - 1) / J;
    const uint start = min((j + 0) * BLOCKSIZE, N);
    const uint end = min((j + 1) * BLOCKSIZE, N);

    /* each work item reduces its block of the input */
    uint base = 0;
    for (uint i = start; i < end; i++)
        base += input[i];

    /* exclusive scan over the per-work-item block sums */
    base = work_group_scan_exclusive_add(base);

    /* write out the scanned values for this work item's block */
    uint accu = 0;
    for (uint i = start; i < end; i++)
    {
        output[i] = base + accu;
        accu += input[i];
    }
}

/*

  This kernel pair implements an exclusive scan addition operation that
  can use the entire GPU: phase 0 computes one partial sum per work
  group, and phase 1 combines those partial sums with a local scan.

 */
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase0(global uint *input,
                                   global uint *output,
                                   global uint *prefix_sums,
                                   const uint N)
{
    const uint local_size = get_local_size(0);
    const uint numTasks = get_num_groups(0);
    const uint groupID = get_group_id(0);
    const uint localID = get_local_id(0);
    const uint global_startID = (groupID + 0) * N / numTasks;
    const uint global_endID = (groupID + 1) * N / numTasks;

    /* reduce this group's sub-range of the input */
    uint base = 0;
    for (uint i = global_startID + localID; i < global_endID; i += local_size)
        base += input[i];

    base = work_group_reduce_add(base);

    if (localID == 0)
    {
        prefix_sums[groupID] = base;
        DBG(printf("%d -> %d \n", groupID, base));
    }
}

__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase1(global uint *input,
                                   global uint *output,
                                   global uint *prefix_sums,
                                   const uint N)
{
    const uint numTasks = get_num_groups(0);
    const uint groupID = get_group_id(0);
    const uint global_startID = (groupID + 0) * N / numTasks;
    const uint global_endID = (groupID + 1) * N / numTasks;
    const uint local_range = global_endID - global_startID;

    /* sum of the partial sums of all preceding groups (written in phase 0) */
    uint global_base = 0;
    for (uint i = 0; i < groupID; i++)
        global_base += prefix_sums[i];

    const uint j = get_local_id(0);
    const uint J = get_local_size(0);
    const uint startID = (j + 0) * local_range / J + global_startID;
    const uint endID = (j + 1) * local_range / J + global_startID;

    uint base = 0;
    for (uint i = startID; i < endID; i++)
        base += input[i];

    base = work_group_scan_exclusive_add(base);

    uint accu = 0;
    for (uint i = startID; i < endID; i++)
    {
        output[i] = global_base + base + accu;
        accu += input[i];
    }
}
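
/*

  For reference, the semantics both scan variants above implement, as a
  minimal sequential sketch (illustration only, not compiled; the helper
  name reference_exclusive_scan_add is hypothetical):

    output[0] = 0, output[i] = input[0] + ... + input[i-1]

 */
#if 0
void reference_exclusive_scan_add(const uint *input, uint *output, const uint N)
{
    uint sum = 0;
    for (uint i = 0; i < N; i++)
    {
        output[i] = sum; /* exclusive: element i itself is not included */
        sum += input[i];
    }
}
#endif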
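
/*

  The two-phase variant applies the same pattern across work groups:
  phase 0 writes one partial sum per group into prefix_sums, and phase 1
  offsets each group's local exclusive scan by the partial sums of all
  preceding groups. A sequential model of that flow (illustration only,
  not compiled; reference_two_phase_scan is a hypothetical name):

 */
#if 0
void reference_two_phase_scan(const uint *input, uint *output,
                              uint *prefix_sums, const uint N, const uint numTasks)
{
    /* phase 0: per-group reduction over the group's sub-range */
    for (uint g = 0; g < numTasks; g++)
    {
        uint sum = 0;
        for (uint i = g * N / numTasks; i < (g + 1) * N / numTasks; i++)
            sum += input[i];
        prefix_sums[g] = sum;
    }

    /* phase 1: exclusive scan within each sub-range, offset by the
       partial sums of all preceding groups */
    for (uint g = 0; g < numTasks; g++)
    {
        uint base = 0;
        for (uint p = 0; p < g; p++)
            base += prefix_sums[p];
        for (uint i = g * N / numTasks; i < (g + 1) * N / numTasks; i++)
        {
            output[i] = base;
            base += input[i];
        }
    }
}
#endif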
/* ========================================================================= */
/* ============================== STATISTICS =============================== */
/* ========================================================================= */

/* ====== STATS config ====== */

#define ENABLE_STAT_CHECKS 1
#define DBG_STATS(x)

/*

  This kernel gathers BVH statistics (SAH costs, node/leaf counts, max
  depth, child and leaf utilization) by traversing the tree
  breadth-first, using global_stack0 and global_stack1 as ping-pong
  buffers (see the sketch at the end of this file).

 */
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
printBVHStatistics(global struct Globals *globals,
                   global char *bvh_mem,
                   global struct StatStackEntry *global_stack0,
                   global struct StatStackEntry *global_stack1,
                   const uint presplit)
{
    const uint globalID = get_global_id(0);
    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);

    struct BVHBase *base = (struct BVHBase *)bvh_mem;
    const uint root = base->rootNodeOffset;

    local uint stack_items[2];
    local uint iterations;

    struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
    root_aabb = conservativeAABB(&root_aabb);
    const float root_area = AABB_halfArea(&root_aabb);

    global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);

    /* seed the first stack with either the root's children (leaf root)
       or the root node itself (internal root) */
    if (root_node->type != BVH_INTERNAL_NODE)
    {
        const uint numChildren = getNumChildren_QBVHNodeN(root_node);
        const uint current = root;
        for (uint i = 0; i < numChildren; i++)
        {
            struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
            const float area = AABB_halfArea(&aabb);

            global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
            global_stack0[i].type = root_node->type;
            global_stack0[i].area = area;
            global_stack0[i].aabb = aabb;
            global_stack0[i].depth = 0;
        }
        stack_items[0] = numChildren;
        stack_items[1] = 0;
    }
    else
    {
        global_stack0[0].node = root;
        global_stack0[0].type = root_node->type;
        global_stack0[0].area = root_area;
        global_stack0[0].aabb = root_aabb;
        global_stack0[0].depth = 1;
        stack_items[0] = 1;
        stack_items[1] = 0;
    }

    const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
    const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;

    DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));

    iterations = 0;

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    float sah_nodes = 0.0f;
    float sah_leaves = 0.0f;
    uint leaves = 0;
    uint inner_nodes = 0;
    uint max_depth = 0;
    uint leaf_items = 0;
    uint inner_nodes_valid_children = 0;

    /* breadth-first traversal: each iteration consumes one tree level
       from the input stack and appends the next level to the output stack */
    while (1)
    {
        work_group_barrier(CLK_GLOBAL_MEM_FENCE);
        const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
        global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
        global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;

        const uint local_stack_items = stack_items[buffer_index];
        stack_items[1 - buffer_index] = 0;

        DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));

        if (local_stack_items == 0)
            break;
        //if (iterations == 5) break;

        work_group_barrier(CLK_GLOBAL_MEM_FENCE);

        if (globalID == 0)
            iterations++;

        for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
        {
            uint current = input_global_stack[sindex].node;
            uint type = input_global_stack[sindex].type;
            float current_area = input_global_stack[sindex].area;
            struct AABB current_aabb = input_global_stack[sindex].aabb;
            uint current_depth = input_global_stack[sindex].depth;

            //printf("localID %d sindex %d current %d type %d local_stack_items %d \n",localID,sindex,current,type,local_stack_items);

            max_depth = max(max_depth, current_depth);

            if (type == BVH_QUAD_NODE)
            {
                unsigned int prims = 1; //getNumLeafPrims(current);
                if (prims > BVH_LEAF_N_MAX)
                    printf("too many items in leaf %d \n", prims);
                unsigned int prims_offset = current; //getLeafOffset(current);

                leaf_items += prims;
                sah_leaves += current_area;
                leaves++;
#if ENABLE_STAT_CHECKS == 1
                struct AABB leafAABB;
                AABB_init(&leafAABB);

                global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);

                for (uint i = 0; i < prims; i++)
                {
                    struct AABB quadAABB = getAABB_Quad(&quads[i]);
                    AABB_extend(&leafAABB, &quadAABB);
                }

                if (!presplit && !AABB_subset(&leafAABB, &current_aabb))
                {
                    printf("leaf error: current %d depth %d \n", current, current_depth);
                    AABB_print(&current_aabb);
                    printf("leaf bounds: \n");
                    AABB_print(&leafAABB);
                }
#endif
            }
            else if (type == BVH_INTERNAL_NODE)
            {
                inner_nodes++;
                sah_nodes += current_area;
                global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);

                /* count valid children; invalid slots store inverted bounds */
                uint children = 0;
                for (uint i = 0; i < BVH_NODE_N6; i++)
                {
                    if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
                        break;
                    children++;
                }

#if ENABLE_STAT_CHECKS == 1
                if (children > BVH_NODE_N6 || children == 0)
                {
                    printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
                    printQBVHNodeN(nodeN);
                }

                if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
                {
                    printf("offset error %d \n", nodeN->offset);
                }
#endif

                /* reserve space for the children on the output stack */
                uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);

                for (uint i = 0; i < children; i++)
                {
                    inner_nodes_valid_children++;

                    struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
                    const float area = AABB_halfArea(&aabb);

                    aabb = conservativeAABB(&aabb);

#if 0 // ENABLE_STAT_CHECKS == 1 // FIXME: not clear whether the parent-child property still holds!

                    // if (aabb.lower.x == (float)(INFINITY))
                    // {
                    //     printf("aabb inf error %d current %d nodeN %d \n", i, current, children);
                    //     break;
                    // }

                    if (!presplit && !AABB_subset(&aabb, &current_aabb))
                    {
                        printf("Parent: current %d depth %d children %d \n", current, current_depth, children);
                        AABB_print(&current_aabb);
                        printf("Child %d: \n", i);
                        AABB_print(&aabb);
                    }
#endif

                    uint dest_index = children_offset + i;
                    if (nodeN->type == BVH_QUAD_NODE)
                    {
                        output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
                        if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
                        {
                            printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
                        }
                    }
                    else if (nodeN->type == BVH_INTERNAL_NODE)
                    {
                        output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
                        if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
                        {
                            printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
                        }
                    }

                    output_global_stack[dest_index].type = nodeN->type;
                    output_global_stack[dest_index].area = area;
                    output_global_stack[dest_index].aabb = aabb;
                    output_global_stack[dest_index].depth = current_depth + 1;
                    //printf("output_global_stack[dest_index].node %d output_global_stack[dest_index].type %d \n",output_global_stack[dest_index].node,output_global_stack[dest_index].type);
                }
            }
        }
    }

    /* reduce per-work-item statistics across the work group */
    sah_nodes = work_group_reduce_add(sah_nodes);
    sah_leaves = work_group_reduce_add(sah_leaves);
    leaves = work_group_reduce_add(leaves);
    inner_nodes = work_group_reduce_add(inner_nodes);
    max_depth = work_group_reduce_max(max_depth);
    leaf_items = work_group_reduce_add(leaf_items);
    inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);

    if (globalID == 0)
    {
        /*
        sah_nodes *= 1.0f / root_area;
        sah_leaves *= 1.0f / root_area;
        float sah = sah_nodes + sah_leaves;

        const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
        const uint totalAllocatedMem = globals->totalAllocatedMem;

        printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n", BVH_NODE_N6, BVH_LEAF_N_MIN, BVH_LEAF_N_MAX);
        float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
        float leaf_util = 100.0f * (float)leaf_items / (leaves);
        printf("allocators: node %d -> %d ; leaf %d -> %d \n", globals->node_mem_allocator_cur, globals->node_mem_allocator_start, globals->leaf_mem_allocator_cur, globals->leaf_mem_allocator_start);
        printf("inner nodes %d leaves %d sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n", inner_nodes, leaves, sah, sah_nodes, sah_leaves, max_depth, leaf_items, node_util, leaf_util, (float)leaf_items / leaves);
        uint node_mem = globals->node_mem_allocator_cur;
        uint max_node_mem = globalLeafMemAllocatorOffset;
        float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;

        uint leaf_mem = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
        uint max_leaf_mem = totalAllocatedMem - globalLeafMemAllocatorOffset;
        float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;

        uint total_mem = node_mem + leaf_mem;
        float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;

        printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n", node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
        */
    }
}
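
/*

  The traversal in printBVHStatistics is breadth-first: the two global
  stacks act as ping-pong buffers, and each iteration consumes one tree
  level from the input stack while work items append the next level to
  the output stack through an atomic counter. A minimal sequential model
  of that scheme (illustration only, not compiled; the function and
  variable names below are hypothetical):

 */
#if 0
void reference_pingpong_traversal(struct StatStackEntry *stack0,
                                  struct StatStackEntry *stack1,
                                  uint root_entries)
{
    struct StatStackEntry *stacks[2] = { stack0, stack1 };
    uint count[2] = { root_entries, 0 }; /* stack0 initially holds the root level */
    uint buffer_index = 0;

    while (count[buffer_index] > 0)
    {
        count[1 - buffer_index] = 0; /* reset the output level */
        for (uint s = 0; s < count[buffer_index]; s++)
        {
            struct StatStackEntry entry = stacks[buffer_index][s];
            /* visit 'entry'; for an inner node, append its children to
               stacks[1 - buffer_index] and bump count[1 - buffer_index]
               (the kernel does this with atomic_add across work items) */
            (void)entry;
        }
        buffer_index = 1 - buffer_index; /* swap input and output stacks */
    }
}
#endif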