//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "input_client_structs.h"
#include "common.h"
#include "instance.h"

#define DBG(x)
#define ENABLE_CHECKS 0

/*

  This kernel implements an exclusive scan addition operation. The
  implementation currently uses only a single work-group, i.e. one DSS.

 */
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add(global uint *input,
                            global uint *output,
                            const uint N)
{
    const uint j = get_local_id(0);
    const uint J = get_local_size(0);
    const uint BLOCKSIZE = (N + J - 1) / J;
    const uint start = min((j + 0) * BLOCKSIZE, N);
    const uint end = min((j + 1) * BLOCKSIZE, N);

    /* each work-item sums its contiguous block of the input */
    uint base = 0;
    for (uint i = start; i < end; i++)
        base += input[i];

    /* exclusive scan over the per-work-item block sums */
    base = work_group_scan_exclusive_add(base);

    /* second pass: emit the exclusive prefix for every element of the block */
    uint accu = 0;
    for (uint i = start; i < end; i++)
    {
        output[i] = base + accu;
        accu += input[i];
    }
}

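/*
  Worked example of the exclusive-scan semantics: with input = { 3, 1, 7, 0, 4 }
  and N = 5, the kernel writes output = { 0, 3, 4, 11, 11 }; output[i] is the
  sum of input[0..i-1], and output[0] is 0.
*/
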
/*

  These two kernels (phase0/phase1) implement an exclusive scan addition
  operation that can use the entire GPU.

 */
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase0(global uint *input,
                                   global uint *output,
                                   global uint *prefix_sums,
                                   const uint N)
{
    const uint local_size = get_local_size(0);
    const uint numTasks = get_num_groups(0);
    const uint groupID = get_group_id(0);
    const uint localID = get_local_id(0);
    const uint global_startID = (groupID + 0) * N / numTasks;
    const uint global_endID = (groupID + 1) * N / numTasks;

    /* strided sum over this work-group's range of the input */
    uint base = 0;
    for (uint i = global_startID + localID; i < global_endID; i += local_size)
        base += input[i];

    base = work_group_reduce_add(base);

    /* store the per-group total for phase1 */
    if (localID == 0)
    {
        prefix_sums[groupID] = base;
        DBG(printf("%d -> %d \n", groupID, base));
    }
}

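/*
  Example of the phase0/phase1 split: with N = 8 and numTasks = 2, phase0's
  group 0 reduces input[0..3] into prefix_sums[0] and group 1 reduces
  input[4..7] into prefix_sums[1]; in phase1, group 1 then starts from
  global_base = prefix_sums[0] while scanning its own range locally.
*/
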
__attribute__((reqd_work_group_size(16, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
parallel_scan_exclusive_add_phase1(global uint *input,
                                   global uint *output,
                                   global uint *prefix_sums,
                                   const uint N)
{
    const uint local_size = get_local_size(0);
    const uint numTasks = get_num_groups(0);
    const uint groupID = get_group_id(0);
    const uint localID = get_local_id(0);
    const uint global_startID = (groupID + 0) * N / numTasks;
    const uint global_endID = (groupID + 1) * N / numTasks;
    const uint local_range = global_endID - global_startID;

    /* exclusive prefix over the per-group sums produced by phase0 */
    uint global_base = 0;
    for (uint i = 0; i < groupID; i++)
        global_base += prefix_sums[i];

    const uint j = get_local_id(0);
    const uint J = get_local_size(0);
    const uint startID = (j + 0) * local_range / J + global_startID;
    const uint endID = (j + 1) * local_range / J + global_startID;

    /* sum this work-item's sub-range */
    uint base = 0;
    for (uint i = startID; i < endID; i++)
        base += input[i];

    base = work_group_scan_exclusive_add(base);

    /* final result: group offset + work-item offset + running sum */
    uint accu = 0;
    for (uint i = startID; i < endID; i++)
    {
        output[i] = global_base + base + accu;
        accu += input[i];
    }
}

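/*
  Hypothetical host-side dispatch sketch (not part of this file): phase0 must
  complete for every work-group before phase1 runs, and both launches must use
  the same number of work-groups so the global_startID/global_endID ranges
  match. Assuming an in-order queue and kernels "phase0"/"phase1" with their
  arguments already set:

      size_t local = 16, global = 16 * numTasks;
      // phase0: writes the per-group totals into prefix_sums
      clEnqueueNDRangeKernel(queue, phase0, 1, NULL, &global, &local, 0, NULL, NULL);
      // phase1: consumes prefix_sums and writes the scanned output
      clEnqueueNDRangeKernel(queue, phase1, 1, NULL, &global, &local, 0, NULL, NULL);
*/
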
/* ========================================================================= */
/* ============================== STATISTICS =============================== */
/* ========================================================================= */

/* ====== STATS config ====== */

#define ENABLE_STAT_CHECKS 1
#define DBG_STATS(x)

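/*
  printBVHStatistics walks the BVH breadth-first: each iteration consumes one
  of the two global scratch stacks (global_stack0/global_stack1) and appends
  the children of all internal nodes to the other, until a level produces no
  entries. The per-level counters live in local memory, so the kernel assumes
  a single work-group launch, and the caller must size both stack arrays to
  hold the widest BVH level.
*/
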
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(16))) void kernel
printBVHStatistics(global struct Globals *globals,
                   global char *bvh_mem,
                   global struct StatStackEntry *global_stack0,
                   global struct StatStackEntry *global_stack1,
                   const uint presplit)
{
    const uint globalID = get_global_id(0);
    const uint localID = get_local_id(0);
    const uint local_size = get_local_size(0);

    struct BVHBase *base = (struct BVHBase *)bvh_mem;
    const uint root = base->rootNodeOffset;

    local uint stack_items[2];
    local uint iterations;

    struct AABB root_aabb = getAABB_QBVHNodeN((global struct QBVHNodeN *)(bvh_mem + root));
    root_aabb = conservativeAABB(&root_aabb);
    const float root_area = AABB_halfArea(&root_aabb);

    global struct QBVHNodeN *root_node = (global struct QBVHNodeN *)(bvh_mem + base->rootNodeOffset);

    /* seed global_stack0 with either the root's leaf children or the root node itself */
    if (root_node->type != BVH_INTERNAL_NODE)
    {
        const uint numChildren = getNumChildren_QBVHNodeN(root_node);
        const uint current = root;
        for (uint i = 0; i < numChildren; i++)
        {
            struct AABB aabb = extractAABB_QBVHNodeN(root_node, i);
            const float area = AABB_halfArea(&aabb);

            global_stack0[i].node = current + root_node->offset * 64 + i * sizeof(struct Quad);
            global_stack0[i].type = root_node->type;
            global_stack0[i].area = area;
            global_stack0[i].aabb = aabb;
            global_stack0[i].depth = 0;
        }
        stack_items[0] = numChildren;
        stack_items[1] = 0;
    }
    else
    {
        global_stack0[0].node = root;
        global_stack0[0].type = root_node->type;
        global_stack0[0].area = root_area;
        global_stack0[0].aabb = root_aabb;
        global_stack0[0].depth = 1;
        stack_items[0] = 1;
        stack_items[1] = 0;
    }

    const uint maxInnerNodeOffset = globals->node_mem_allocator.cur;
    const uint maxLeafNodeOffset = globals->quad_mem_allocator.cur;

    DBG_STATS(if (localID == 0) printf("diff %d \n", (globals->node_mem_allocator_cur - globals->node_mem_allocator_start) / 64));

    iterations = 0;

    work_group_barrier(CLK_LOCAL_MEM_FENCE);

    float sah_nodes = 0.0f;
    float sah_leaves = 0.0f;
    uint leaves = 0;
    uint inner_nodes = 0;
    uint max_depth = 0;
    uint leaf_items = 0;
    uint inner_nodes_valid_children = 0;

    /* breadth-first traversal, ping-ponging between the two global stacks */
    while (1)
    {
        /* stack_items and iterations live in local memory, so the barrier
           must fence local as well as global memory */
        work_group_barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
        const uint buffer_index = (iterations % 2) == 0 ? 0 : 1;
        global struct StatStackEntry *input_global_stack = buffer_index == 0 ? global_stack0 : global_stack1;
        global struct StatStackEntry *output_global_stack = buffer_index == 0 ? global_stack1 : global_stack0;

        const uint local_stack_items = stack_items[buffer_index];
        stack_items[1 - buffer_index] = 0;

        DBG_STATS(if (globalID == 0) printf("iterations %d local_stack_items %d \n", iterations, local_stack_items));

        if (local_stack_items == 0)
            break;

        work_group_barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);

        if (globalID == 0)
            iterations++;

        /* each work-item processes a strided share of this level's stack entries */
        for (uint sindex = localID; sindex < local_stack_items; sindex += local_size)
        {
            uint current = input_global_stack[sindex].node;
            uint type = input_global_stack[sindex].type;
            float current_area = input_global_stack[sindex].area;
            struct AABB current_aabb = input_global_stack[sindex].aabb;
            uint current_depth = input_global_stack[sindex].depth;

            max_depth = max(max_depth, current_depth);

            if (type == BVH_QUAD_NODE)
            {
                unsigned int prims = 1; //getNumLeafPrims(current);
                if (prims > BVH_LEAF_N_MAX)
                    printf("too many items in leaf %d \n", prims);
                unsigned int prims_offset = current; //getLeafOffset(current);

                leaf_items += prims;
                sah_leaves += current_area;
                leaves++;
#if ENABLE_STAT_CHECKS == 1
                /* recompute the leaf bounds from its quads and check containment */
                struct AABB leafAABB;
                AABB_init(&leafAABB);

                global struct Quad *quads = (global struct Quad *)(bvh_mem + prims_offset);

                for (uint i = 0; i < prims; i++)
                {
                    struct AABB quadAABB = getAABB_Quad(&quads[i]);
                    AABB_extend(&leafAABB, &quadAABB);
                }

                if (!presplit && !AABB_subset(&leafAABB, &current_aabb))
                {
                    printf("leaf error: current %d depth %d \n", current, current_depth);
                    AABB_print(&current_aabb);
                    printf("leaf bounds: \n");
                    AABB_print(&leafAABB);
                }
#endif
            }
            else if (type == BVH_INTERNAL_NODE)
            {
                inner_nodes++;
                sah_nodes += current_area;
                global struct QBVHNodeN *nodeN = (global struct QBVHNodeN *)(bvh_mem + current);

                /* a child slot is unused when its quantized bounds are inverted */
                uint children = 0;
                for (uint i = 0; i < BVH_NODE_N6; i++)
                {
                    if (nodeN->qbounds.lower_x[i] > nodeN->qbounds.upper_x[i])
                        break;
                    children++;
                }

#if ENABLE_STAT_CHECKS == 1
                if (children > BVH_NODE_N6 || children == 0)
                {
                    printf("#children not in valid range: %d offset %d localID %d \n", children, current, localID);
                    printQBVHNodeN(nodeN);
                }

                if (nodeN->offset > globals->totalAllocatedMem || (int)nodeN->offset < 0)
                {
                    printf("offset error %d \n", nodeN->offset);
                }
#endif

                /* reserve slots for the children on the output stack */
                uint children_offset = atomic_add(&stack_items[1 - buffer_index], children);

                for (uint i = 0; i < children; i++)
                {
                    inner_nodes_valid_children++;

                    struct AABB aabb = extractAABB_QBVHNodeN(nodeN, i);
                    const float area = AABB_halfArea(&aabb);

                    aabb = conservativeAABB(&aabb);

#if 0 // ENABLE_STAT_CHECKS == 1  // FIXME: not clear whether the parent/child containment property still holds
                    if (!presplit && !AABB_subset(&aabb, &current_aabb))
                    {
                        printf("Parent: current %d depth %d children %d \n", current, current_depth, children);
                        AABB_print(&current_aabb);
                        printf("Child %d: \n", i);
                        AABB_print(&aabb);
                    }
#endif

                    uint dest_index = children_offset + i;
                    if (nodeN->type == BVH_QUAD_NODE)
                    {
                        output_global_stack[dest_index].node = current + nodeN->offset * 64 + i * sizeof(struct Quad);
                        if (output_global_stack[dest_index].node >= maxLeafNodeOffset)
                        {
                            printf("stack leaf offset error %d %d current %d %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64);
                        }
                    }
                    else if (nodeN->type == BVH_INTERNAL_NODE)
                    {
                        output_global_stack[dest_index].node = (current + nodeN->offset * 64 + i * sizeof(struct QBVHNodeN));
                        if (output_global_stack[dest_index].node >= maxInnerNodeOffset)
                        {
                            printf("stack inner node offset error %d %d current %d %d maxInnerNodeOffset %d \n", output_global_stack[dest_index].node, output_global_stack[dest_index].node / 64, current, current / 64, maxInnerNodeOffset);
                        }
                    }

                    output_global_stack[dest_index].type = nodeN->type;
                    output_global_stack[dest_index].area = area;
                    output_global_stack[dest_index].aabb = aabb;
                    output_global_stack[dest_index].depth = current_depth + 1;
                }
            }
        }
    }

    /* combine per-work-item statistics across the work-group */
    sah_nodes = work_group_reduce_add(sah_nodes);
    sah_leaves = work_group_reduce_add(sah_leaves);
    leaves = work_group_reduce_add(leaves);
    inner_nodes = work_group_reduce_add(inner_nodes);
    max_depth = work_group_reduce_max(max_depth);
    leaf_items = work_group_reduce_add(leaf_items);
    inner_nodes_valid_children = work_group_reduce_add(inner_nodes_valid_children);

    if (globalID == 0)
    {
        /*
        sah_nodes  *= 1.0f / root_area;
        sah_leaves *= 1.0f / root_area;
        float sah = sah_nodes + sah_leaves;

        const uint globalLeafMemAllocatorOffset = globals->quad_mem_allocator.start;
        const uint totalAllocatedMem = globals->totalAllocatedMem;

        printf("BVH_NODE_N6 %d BVH_LEAF_N_MIN %d BVH_LEAF_N_MAX %d \n",BVH_NODE_N6,BVH_LEAF_N_MIN,BVH_LEAF_N_MAX);
        float node_util = 100.0f * (float)inner_nodes_valid_children / (inner_nodes * BVH_NODE_N6);
        float leaf_util = 100.0f * (float)leaf_items / (leaves);
        printf("allocators: node  %d -> %d ; leaf %d -> %d \n",globals->node_mem_allocator_cur,globals->node_mem_allocator_start,globals->leaf_mem_allocator_cur,globals->leaf_mem_allocator_start);
        printf("inner nodes %d leaves %d  sah %f sah_node %f sah_leaves %f max_depth %d leaf_items %d node util %f leaf util %f (%f) \n",inner_nodes,leaves,sah,sah_nodes,sah_leaves,max_depth,leaf_items,node_util,leaf_util,(float)leaf_items / leaves);
        uint node_mem     = globals->node_mem_allocator_cur;
        uint max_node_mem = globalLeafMemAllocatorOffset;
        float node_mem_ratio = 100.0f * (float)node_mem / max_node_mem;

        uint leaf_mem        = globals->leaf_mem_allocator.cur - globalLeafMemAllocatorOffset;
        uint max_leaf_mem    = totalAllocatedMem - globalLeafMemAllocatorOffset;
        float leaf_mem_ratio = 100.0f * (float)leaf_mem / max_leaf_mem;

        uint total_mem = node_mem + leaf_mem;
        float total_mem_ratio = 100.0f * (float)total_mem / totalAllocatedMem;

        printf("used node memory %d (%f) / used leaf memory %d (%f) / total memory used %d (%f) / total memory allocated %d \n",node_mem, node_mem_ratio, leaf_mem, leaf_mem_ratio, total_mem, total_mem_ratio, totalAllocatedMem);
        */
    }
}