xref: /aosp_15_r20/external/clpeak/src/kernels/global_bandwidth_kernels.cl (revision 1cd03ba3888297bc945f2c84574e105e3ced3e34)
1MSTRINGIFY(
2
3\n#undef FETCH_2
4\n#undef FETCH_8
5\n
6\n#define FETCH_2(sum, id, A, jumpBy)      sum += A[id];   id += jumpBy;   sum += A[id];   id += jumpBy;
7\n#define FETCH_4(sum, id, A, jumpBy)      FETCH_2(sum, id, A, jumpBy);   FETCH_2(sum, id, A, jumpBy);
8\n#define FETCH_8(sum, id, A, jumpBy)      FETCH_4(sum, id, A, jumpBy);   FETCH_4(sum, id, A, jumpBy);
9\n
10\n
11\n#define FETCH_PER_WI  16
12\n
13
14// Kernels fetching by local_size offset
15__kernel void global_bandwidth_v1_local_offset(__global float *A, __global float *B)
16{
17    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
18    float sum = 0;
19
20    for(int i=0; i<4; i++)
21    {
22        FETCH_4(sum, id, A, get_local_size(0));
23    }
24
25    B[get_global_id(0)] = sum;
26}
27
28
29__kernel void global_bandwidth_v2_local_offset(__global float2 *A, __global float *B)
30{
31    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
32    float2 sum = 0;
33
34    for(int i=0; i<4; i++)
35    {
36        FETCH_4(sum, id, A, get_local_size(0));
37    }
38
39    B[get_global_id(0)] = (sum.S0) + (sum.S1);
40}
41
42
43__kernel void global_bandwidth_v4_local_offset(__global float4 *A, __global float *B)
44{
45    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
46    float4 sum = 0;
47
48    for(int i=0; i<4; i++)
49    {
50        FETCH_4(sum, id, A, get_local_size(0));
51    }
52
53    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3);
54}
55
56
57__kernel void global_bandwidth_v8_local_offset(__global float8 *A, __global float *B)
58{
59    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
60    float8 sum = 0;
61
62    for(int i=0; i<4; i++)
63    {
64        FETCH_4(sum, id, A, get_local_size(0));
65    }
66
67    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
68}
69
70__kernel void global_bandwidth_v16_local_offset(__global float16 *A, __global float *B)
71{
72    int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0);
73    float16 sum = 0;
74
75    for(int i=0; i<4; i++)
76    {
77        FETCH_4(sum, id, A, get_local_size(0));
78    }
79
80    float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
81    t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF);
82    B[get_global_id(0)] = t;
83}
84
85
86// Kernels fetching by global_size offset
87__kernel void global_bandwidth_v1_global_offset(__global float *A, __global float *B)
88{
89    int id = get_global_id(0);
90    float sum = 0;
91
92    for(int i=0; i<4; i++)
93    {
94        FETCH_4(sum, id, A, get_global_size(0));
95    }
96
97    B[get_global_id(0)] = sum;
98}
99
100
101__kernel void global_bandwidth_v2_global_offset(__global float2 *A, __global float *B)
102{
103    int id = get_global_id(0);
104    float2 sum = 0;
105
106    for(int i=0; i<4; i++)
107    {
108        FETCH_4(sum, id, A, get_global_size(0));
109    }
110
111    B[get_global_id(0)] = (sum.S0) + (sum.S1);
112}
113
114
115__kernel void global_bandwidth_v4_global_offset(__global float4 *A, __global float *B)
116{
117    int id = get_global_id(0);
118    float4 sum = 0;
119
120    for(int i=0; i<4; i++)
121    {
122        FETCH_4(sum, id, A, get_global_size(0));
123    }
124
125    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3);
126}
127
128
129__kernel void global_bandwidth_v8_global_offset(__global float8 *A, __global float *B)
130{
131    int id = get_global_id(0);
132    float8 sum = 0;
133
134    for(int i=0; i<4; i++)
135    {
136        FETCH_4(sum, id, A, get_global_size(0));
137    }
138
139    B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
140}
141
142__kernel void global_bandwidth_v16_global_offset(__global float16 *A, __global float *B)
143{
144    int id = get_global_id(0);
145    float16 sum = 0;
146
147    for(int i=0; i<4; i++)
148    {
149        FETCH_4(sum, id, A, get_global_size(0));
150    }
151
152    float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7);
153    t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF);
154    B[get_global_id(0)] = t;
155}
156
157
158)
159