xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/helpers.hembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2#ifndef ARM_COMPUTE_HELPER_H
3#define ARM_COMPUTE_HELPER_H
4
5
6
7
8#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
9    VSTORE(N0)                                                 \
10    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
11
12#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
13    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
14    VSTORE(N0)                                                 \
15    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
16
17#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
18    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
19    VSTORE(N0)                                                 \
20    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
21
22#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
23    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
24    VSTORE(N0)                                                 \
25    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
26
27#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
28    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
29    VSTORE(N0)                                                 \
30    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
31
32#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
33    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
34    VSTORE(N0)                                                 \
35    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
36
37#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
38    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
39    VSTORE(N0)                                                 \
40    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
41
42#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
43    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
44    VSTORE(N0)                                                 \
45    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
46
47#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
48    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
49    VSTORE(N0)                                                 \
50    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
51
52#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
53    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
54    VSTORE(N0)                                                  \
55    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
56
57#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
58    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
59    VSTORE(N0)                                                  \
60    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
61
62#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
63    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
64    VSTORE(N0)                                                  \
65    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
66
67#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
68    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
69    VSTORE(N0)                                                  \
70    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
71
72#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
73    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
74    VSTORE(N0)                                                  \
75    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
76
77#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
78    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
79    VSTORE(N0)                                                  \
80    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
81
82#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
83    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
84    VSTORE(N0)                                                  \
85    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
86
87
88
89#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
90    VSTORE(N0)                                                         \
91    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
92
93#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
94    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
95    VSTORE(N0)                                                         \
96    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
97
98#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
99    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
100    VSTORE(N0)                                                         \
101    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
102
103#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
104    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
105    VSTORE(N0)                                                         \
106    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
107
108#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
109    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
110    VSTORE(N0)                                                         \
111    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
112
113#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
114    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
115    VSTORE(N0)                                                         \
116    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
117
118#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
119    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
120    VSTORE(N0)                                                         \
121    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
122
123#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
124    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
125    VSTORE(N0)                                                         \
126    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
127
128#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
129    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
130    VSTORE(N0)                                                         \
131    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
132
133#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \
134    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
135    VSTORE(N0)                                                     \
136    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
137
138#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
139    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
140    VSTORE(N0)                                                          \
141    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
142
143#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
144    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
145    VSTORE(N0)                                                          \
146    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
147
148#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
149    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
150    VSTORE(N0)                                                          \
151    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
152
153#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
154    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
155    VSTORE(N0)                                                          \
156    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
157
158#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
159    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
160    VSTORE(N0)                                                          \
161    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
162
163#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
164    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
165    VSTORE(N0)                                                          \
166    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
167
168
169
170
171#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
172#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
173
174
175
176#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
178
179
180
181#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
182    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
183    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
184
185#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
186    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
187    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
188    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
189
190#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
191    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
192    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
193    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
194
195#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
196    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
197    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
198    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
199
200#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
201    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
202    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
203    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
204
205#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
206    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
207    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
208    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
209
210#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
211    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
212    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
213    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
214
215#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
216    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
217    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
218    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
219
220#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
221    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
222    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
223    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
224
225#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
226    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
227    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
228    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
229
230#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
231    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
232    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
233    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
234
235#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
236    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
237    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
238    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
239
240#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
241    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
242    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
243    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
244
245#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
246    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
247    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
248    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
249
250#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
251    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
252    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
253    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
254
255#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
256    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
257    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
258    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
259
260
261
262#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
263#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
264
265#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
266    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
267    {                                                                                                                                                     \
268        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
269    }                                                                                                                                                     \
270    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
271    {                                                                                                                                                     \
272        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
273    }                                                                                                                                                     \
274    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
275    {                                                                                                                                                     \
276        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
277    }                                                                                                                                                     \
278    else                                                                                                                                                  \
279    {                                                                                                                                                     \
280        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
281    }
282
283#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
284    if(!(PARTIAL_COND_X))                                                                                         \
285    {                                                                                                             \
286        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
287    }                                                                                                             \
288    else                                                                                                          \
289    {                                                                                                             \
290        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
291    }
292
293#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
294    if(!(PARTIAL_COND_Y))                                                                                         \
295    {                                                                                                             \
296        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
297    }                                                                                                             \
298    else                                                                                                          \
299    {                                                                                                             \
300        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
301    }
302
303
304#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
305
306
307#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
308
309#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
310    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
311
312#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
313
314#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
315    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
316
317#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
318
319#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
320    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
321
322#else
323
324#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
325    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
326
327#endif
328
329#endif
330
331
332#if defined(PARTIAL_STORE_M0)
333
334#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
335    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
336#else
337#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
338    ((uint)(y * M0))
339#endif
340
341
342
343#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
344    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
345
346
347#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
348#pragma OPENCL EXTENSION cl_khr_fp16 : enable
349#endif
350
351#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
352#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
353#endif
354
355#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
357#endif
358
359#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
360#pragma OPENCL EXTENSION cl_arm_printf : enable
361#endif
362
363#define GPU_ARCH_MIDGARD 0x100
364#define GPU_ARCH_BIFROST 0x200
365#define GPU_ARCH_VALHALL 0x300
366
367
368#define CONCAT(a, b) a##b
369
370
371#define EXPAND(x) x
372
373
374#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
375
376
377#define REV1(x) ((x))
378#define REV2(x) ((x).s10)
379#define REV3(x) ((x).s210)
380#define REV4(x) ((x).s3210)
381#define REV8(x) ((x).s76543210)
382#define REV16(x) ((x).sFEDCBA9876543210)
383
384
385
386#define REVERSE_STR(x, s) REV##s((x))
387#define REVERSE(x, s) REVERSE_STR(x, s)
388
389
390
391#define ROT1_0(x) ((x))
392#define ROT1_1(x) ((x))
393
394#define ROT2_0(x) ((x))
395#define ROT2_1(x) ((x).s10)
396#define ROT2_2(x) ((x))
397
398#define ROT3_0(x) ((x))
399#define ROT3_1(x) ((x).s201)
400#define ROT3_2(x) ((x).s120)
401#define ROT3_3(x) ((x))
402
403#define ROT4_0(x) ((x))
404#define ROT4_1(x) ((x).s3012)
405#define ROT4_2(x) ((x).s2301)
406#define ROT4_3(x) ((x).s1230)
407#define ROT4_4(x) ((x))
408
409#define ROT8_0(x) ((x))
410#define ROT8_1(x) ((x).s70123456)
411#define ROT8_2(x) ((x).s67012345)
412#define ROT8_3(x) ((x).s56701234)
413#define ROT8_4(x) ((x).s45670123)
414#define ROT8_5(x) ((x).s34567012)
415#define ROT8_6(x) ((x).s23456701)
416#define ROT8_7(x) ((x).s12345670)
417#define ROT8_8(x) ((x))
418
419#define ROT16_0(x) ((x))
420#define ROT16_1(x) ((x).sF0123456789ABCDE)
421#define ROT16_2(x) ((x).sEF0123456789ABCD)
422#define ROT16_3(x) ((x).sDEF0123456789ABC)
423#define ROT16_4(x) ((x).sCDEF0123456789AB)
424#define ROT16_5(x) ((x).sBCDEF0123456789A)
425#define ROT16_6(x) ((x).sABCDEF0123456789)
426#define ROT16_7(x) ((x).s9ABCDEF012345678)
427#define ROT16_8(x) ((x).s89ABCDEF01234567)
428#define ROT16_9(x) ((x).s789ABCDEF0123456)
429#define ROT16_10(x) ((x).s6789ABCDEF012345)
430#define ROT16_11(x) ((x).s56789ABCDEF01234)
431#define ROT16_12(x) ((x).s456789ABCDEF0123)
432#define ROT16_13(x) ((x).s3456789ABCDEF012)
433#define ROT16_14(x) ((x).s23456789ABCDEF01)
434#define ROT16_15(x) ((x).s123456789ABCDEF0)
435#define ROT16_16(x) ((x))
436
437
438
439#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
440#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
441
442
443
444#define V_OFFS1(dt) (dt##1)(0)
445#define V_OFFS2(dt) (dt##2)(0, 1)
446#define V_OFFS3(dt) (dt##3)(0, 1, 2)
447#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
448#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
449#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
450
451
452
453#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
454#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
455
456
457#define VLOAD_STR(size) vload##size
458#define VLOAD(size) VLOAD_STR(size)
459
460
461#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
462#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
463
464#define NO_LOAD(data, offs, ptr) \
465    {                            \
466    }
467
468
469#define vload_partial_1_0 NO_LOAD
470#define vload_partial_1_1 vload1
471#define vload_partial_1_2 NO_LOAD
472#define vload_partial_1_3 NO_LOAD
473#define vload_partial_1_4 NO_LOAD
474#define vload_partial_1_5 NO_LOAD
475#define vload_partial_1_6 NO_LOAD
476#define vload_partial_1_7 NO_LOAD
477#define vload_partial_1_8 NO_LOAD
478#define vload_partial_1_9 NO_LOAD
479#define vload_partial_1_10 NO_LOAD
480#define vload_partial_1_11 NO_LOAD
481#define vload_partial_1_12 NO_LOAD
482#define vload_partial_1_13 NO_LOAD
483#define vload_partial_1_14 NO_LOAD
484#define vload_partial_1_15 NO_LOAD
485#define vload_partial_1_16 NO_LOAD
486
487#define vload_partial_2_0 NO_LOAD
488#define vload_partial_2_1 vload_partial_1
489#define vload_partial_2_2 vload_partial_2
490#define vload_partial_2_3 NO_LOAD
491#define vload_partial_2_4 NO_LOAD
492#define vload_partial_2_5 NO_LOAD
493#define vload_partial_2_6 NO_LOAD
494#define vload_partial_2_7 NO_LOAD
495#define vload_partial_2_8 NO_LOAD
496#define vload_partial_2_9 NO_LOAD
497#define vload_partial_2_10 NO_LOAD
498#define vload_partial_2_11 NO_LOAD
499#define vload_partial_2_12 NO_LOAD
500#define vload_partial_2_13 NO_LOAD
501#define vload_partial_2_14 NO_LOAD
502#define vload_partial_2_15 NO_LOAD
503#define vload_partial_2_16 NO_LOAD
504
505#define vload_partial_3_0 NO_LOAD
506#define vload_partial_3_1 vload_partial_1
507#define vload_partial_3_2 vload_partial_2
508#define vload_partial_3_3 vload_partial_3
509#define vload_partial_3_4 NO_LOAD
510#define vload_partial_3_5 NO_LOAD
511#define vload_partial_3_6 NO_LOAD
512#define vload_partial_3_7 NO_LOAD
513#define vload_partial_3_8 NO_LOAD
514#define vload_partial_3_9 NO_LOAD
515#define vload_partial_3_10 NO_LOAD
516#define vload_partial_3_11 NO_LOAD
517#define vload_partial_3_12 NO_LOAD
518#define vload_partial_3_13 NO_LOAD
519#define vload_partial_3_14 NO_LOAD
520#define vload_partial_3_15 NO_LOAD
521#define vload_partial_3_16 NO_LOAD
522
523#define vload_partial_4_0 NO_LOAD
524#define vload_partial_4_1 vload_partial_1
525#define vload_partial_4_2 vload_partial_2
526#define vload_partial_4_3 vload_partial_3
527#define vload_partial_4_4 vload_partial_4
528#define vload_partial_4_5 NO_LOAD
529#define vload_partial_4_6 NO_LOAD
530#define vload_partial_4_7 NO_LOAD
531#define vload_partial_4_8 NO_LOAD
532#define vload_partial_4_9 NO_LOAD
533#define vload_partial_4_10 NO_LOAD
534#define vload_partial_4_11 NO_LOAD
535#define vload_partial_4_12 NO_LOAD
536#define vload_partial_4_13 NO_LOAD
537#define vload_partial_4_14 NO_LOAD
538#define vload_partial_4_15 NO_LOAD
539#define vload_partial_4_16 NO_LOAD
540
541#define vload_partial_8_0 NO_LOAD
542#define vload_partial_8_1 vload_partial_1
543#define vload_partial_8_2 vload_partial_2
544#define vload_partial_8_3 vload_partial_3
545#define vload_partial_8_4 vload_partial_4
546#define vload_partial_8_5 vload_partial_5
547#define vload_partial_8_6 vload_partial_6
548#define vload_partial_8_7 vload_partial_7
549#define vload_partial_8_8 vload_partial_8
550#define vload_partial_8_9 NO_LOAD
551#define vload_partial_8_10 NO_LOAD
552#define vload_partial_8_11 NO_LOAD
553#define vload_partial_8_12 NO_LOAD
554#define vload_partial_8_13 NO_LOAD
555#define vload_partial_8_14 NO_LOAD
556#define vload_partial_8_15 NO_LOAD
557#define vload_partial_8_16 NO_LOAD
558
559#define vload_partial_16_0 NO_LOAD
560#define vload_partial_16_1 vload_partial_1
561#define vload_partial_16_2 vload_partial_2
562#define vload_partial_16_3 vload_partial_3
563#define vload_partial_16_4 vload_partial_4
564#define vload_partial_16_5 vload_partial_5
565#define vload_partial_16_6 vload_partial_6
566#define vload_partial_16_7 vload_partial_7
567#define vload_partial_16_8 vload_partial_8
568#define vload_partial_16_9 vload_partial_9
569#define vload_partial_16_10 vload_partial_10
570#define vload_partial_16_11 vload_partial_11
571#define vload_partial_16_12 vload_partial_12
572#define vload_partial_16_13 vload_partial_13
573#define vload_partial_16_14 vload_partial_14
574#define vload_partial_16_15 vload_partial_15
575#define vload_partial_16_16 vload_partial_16
576
577
578#define vload_partial_1(DATA, OFFSET, PTR) \
579    DATA.s0 = vload1(OFFSET, PTR);
580
581#define vload_partial_2(DATA, OFFSET, PTR) \
582    DATA.s01 = vload2(OFFSET, PTR);
583
584#define vload_partial_3(DATA, OFFSET, PTR) \
585    DATA.s012 = vload3(OFFSET, PTR);
586
587#define vload_partial_4(DATA, OFFSET, PTR) \
588    DATA.s0123 = vload4(OFFSET, PTR);
589
590#define vload_partial_5(DATA, OFFSET, PTR)    \
591    vload_partial_4(DATA.s0123, OFFSET, PTR); \
592    DATA.s4 = vload1(OFFSET, PTR + 4);
593
594#define vload_partial_6(DATA, OFFSET, PTR)    \
595    vload_partial_4(DATA.s0123, OFFSET, PTR); \
596    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
597
598#define vload_partial_7(DATA, OFFSET, PTR)    \
599    vload_partial_4(DATA.s0123, OFFSET, PTR); \
600    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
601
602#define vload_partial_8(DATA, OFFSET, PTR) \
603    DATA.s01234567 = vload8(OFFSET, PTR);
604
605#define vload_partial_9(DATA, OFFSET, PTR)        \
606    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
607    DATA.s8 = vload1(OFFSET, PTR + 8);
608
609#define vload_partial_10(DATA, OFFSET, PTR)       \
610    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
611    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
612
613#define vload_partial_11(DATA, OFFSET, PTR)       \
614    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
615    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
616
617#define vload_partial_12(DATA, OFFSET, PTR)       \
618    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
619    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
620
621#define vload_partial_13(DATA, OFFSET, PTR)       \
622    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
623    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
624
625#define vload_partial_14(DATA, OFFSET, PTR)       \
626    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
627    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
628
629#define vload_partial_15(DATA, OFFSET, PTR)       \
630    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
631    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
632
633#define vload_partial_16(DATA, OFFSET, PTR) \
634    DATA = vload16(OFFSET, PTR);
635
636
637
638#define PIXEL_UNIT4 1
639#define PIXEL_UNIT8 2
640#define PIXEL_UNIT16 4
641
642
643#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
644#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
645
646
647#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
648#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
649#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
650
651#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
652#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
653#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
654#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
655#endif
656
657#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
658#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
659#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
660
661#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
662#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
663#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
664#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
665#endif
666
667
668#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
669#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
670
671
672#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
673#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
674
675#define VSTORE_STR(size) vstore##size
676#define VSTORE(size) VSTORE_STR(size)
677
678#define float1 float
679#define half1 half
680#define char1 char
681#define uchar1 uchar
682#define short1 short
683#define ushort1 ushort
684#define int1 int
685#define uint1 uint
686#define long1 long
687#define ulong1 ulong
688#define double1 double
689
690#define vload1(OFFSET, PTR) *(OFFSET + PTR)
691#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
692
693
694#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
695#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
696
697#define NO_STORE(data, offs, ptr) \
698    {                             \
699    }
700
701
702#define vstore_partial_1_0 NO_STORE
703#define vstore_partial_1_1 vstore1
704#define vstore_partial_1_2 NO_STORE
705#define vstore_partial_1_3 NO_STORE
706#define vstore_partial_1_4 NO_STORE
707#define vstore_partial_1_5 NO_STORE
708#define vstore_partial_1_6 NO_STORE
709#define vstore_partial_1_7 NO_STORE
710#define vstore_partial_1_8 NO_STORE
711#define vstore_partial_1_9 NO_STORE
712#define vstore_partial_1_10 NO_STORE
713#define vstore_partial_1_11 NO_STORE
714#define vstore_partial_1_12 NO_STORE
715#define vstore_partial_1_13 NO_STORE
716#define vstore_partial_1_14 NO_STORE
717#define vstore_partial_1_15 NO_STORE
718#define vstore_partial_1_16 NO_STORE
719
720#define vstore_partial_2_0 NO_STORE
721#define vstore_partial_2_1 vstore_partial_1
722#define vstore_partial_2_2 vstore_partial_2
723#define vstore_partial_2_3 NO_STORE
724#define vstore_partial_2_4 NO_STORE
725#define vstore_partial_2_5 NO_STORE
726#define vstore_partial_2_6 NO_STORE
727#define vstore_partial_2_7 NO_STORE
728#define vstore_partial_2_8 NO_STORE
729#define vstore_partial_2_9 NO_STORE
730#define vstore_partial_2_10 NO_STORE
731#define vstore_partial_2_11 NO_STORE
732#define vstore_partial_2_12 NO_STORE
733#define vstore_partial_2_13 NO_STORE
734#define vstore_partial_2_14 NO_STORE
735#define vstore_partial_2_15 NO_STORE
736#define vstore_partial_2_16 NO_STORE
737
738#define vstore_partial_3_0 NO_STORE
739#define vstore_partial_3_1 vstore_partial_1
740#define vstore_partial_3_2 vstore_partial_2
741#define vstore_partial_3_3 vstore_partial_3
742#define vstore_partial_3_4 NO_STORE
743#define vstore_partial_3_5 NO_STORE
744#define vstore_partial_3_6 NO_STORE
745#define vstore_partial_3_7 NO_STORE
746#define vstore_partial_3_8 NO_STORE
747#define vstore_partial_3_9 NO_STORE
748#define vstore_partial_3_10 NO_STORE
749#define vstore_partial_3_11 NO_STORE
750#define vstore_partial_3_12 NO_STORE
751#define vstore_partial_3_13 NO_STORE
752#define vstore_partial_3_14 NO_STORE
753#define vstore_partial_3_15 NO_STORE
754#define vstore_partial_3_16 NO_STORE
755
756#define vstore_partial_4_0 NO_STORE
757#define vstore_partial_4_1 vstore_partial_1
758#define vstore_partial_4_2 vstore_partial_2
759#define vstore_partial_4_3 vstore_partial_3
760#define vstore_partial_4_4 vstore_partial_4
761#define vstore_partial_4_5 NO_STORE
762#define vstore_partial_4_6 NO_STORE
763#define vstore_partial_4_7 NO_STORE
764#define vstore_partial_4_8 NO_STORE
765#define vstore_partial_4_9 NO_STORE
766#define vstore_partial_4_10 NO_STORE
767#define vstore_partial_4_11 NO_STORE
768#define vstore_partial_4_12 NO_STORE
769#define vstore_partial_4_13 NO_STORE
770#define vstore_partial_4_14 NO_STORE
771#define vstore_partial_4_15 NO_STORE
772#define vstore_partial_4_16 NO_STORE
773
774#define vstore_partial_8_0 NO_STORE
775#define vstore_partial_8_1 vstore_partial_1
776#define vstore_partial_8_2 vstore_partial_2
777#define vstore_partial_8_3 vstore_partial_3
778#define vstore_partial_8_4 vstore_partial_4
779#define vstore_partial_8_5 vstore_partial_5
780#define vstore_partial_8_6 vstore_partial_6
781#define vstore_partial_8_7 vstore_partial_7
782#define vstore_partial_8_8 vstore_partial_8
783#define vstore_partial_8_9 NO_STORE
784#define vstore_partial_8_10 NO_STORE
785#define vstore_partial_8_11 NO_STORE
786#define vstore_partial_8_12 NO_STORE
787#define vstore_partial_8_13 NO_STORE
788#define vstore_partial_8_14 NO_STORE
789#define vstore_partial_8_15 NO_STORE
790#define vstore_partial_8_16 NO_STORE
791
792#define vstore_partial_16_0 NO_STORE
793#define vstore_partial_16_1 vstore_partial_1
794#define vstore_partial_16_2 vstore_partial_2
795#define vstore_partial_16_3 vstore_partial_3
796#define vstore_partial_16_4 vstore_partial_4
797#define vstore_partial_16_5 vstore_partial_5
798#define vstore_partial_16_6 vstore_partial_6
799#define vstore_partial_16_7 vstore_partial_7
800#define vstore_partial_16_8 vstore_partial_8
801#define vstore_partial_16_9 vstore_partial_9
802#define vstore_partial_16_10 vstore_partial_10
803#define vstore_partial_16_11 vstore_partial_11
804#define vstore_partial_16_12 vstore_partial_12
805#define vstore_partial_16_13 vstore_partial_13
806#define vstore_partial_16_14 vstore_partial_14
807#define vstore_partial_16_15 vstore_partial_15
808#define vstore_partial_16_16 vstore_partial_16
809
810
811#define vstore_partial_1(DATA, OFFSET, PTR) \
812    vstore1(DATA.s0, OFFSET, PTR);
813
814#define vstore_partial_2(DATA, OFFSET, PTR) \
815    vstore2(DATA.s01, OFFSET, PTR);
816
817#define vstore_partial_3(DATA, OFFSET, PTR) \
818    vstore3(DATA.s012, OFFSET, PTR);
819
820#define vstore_partial_4(DATA, OFFSET, PTR) \
821    vstore4(DATA.s0123, OFFSET, PTR);
822
823#define vstore_partial_5(DATA, OFFSET, PTR)    \
824    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
825    vstore1(DATA.s4, OFFSET, PTR + 4);
826
827#define vstore_partial_6(DATA, OFFSET, PTR)    \
828    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
829    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
830
831#define vstore_partial_7(DATA, OFFSET, PTR)    \
832    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
833    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
834
835#define vstore_partial_8(DATA, OFFSET, PTR) \
836    vstore8(DATA.s01234567, OFFSET, PTR);
837
838#define vstore_partial_9(DATA, OFFSET, PTR)        \
839    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
840    vstore1(DATA.s8, OFFSET, PTR + 8);
841
842#define vstore_partial_10(DATA, OFFSET, PTR)       \
843    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
844    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
845
846#define vstore_partial_11(DATA, OFFSET, PTR)       \
847    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
848    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
849
850#define vstore_partial_12(DATA, OFFSET, PTR)       \
851    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
852    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
853
854#define vstore_partial_13(DATA, OFFSET, PTR)       \
855    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
856    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
857
858#define vstore_partial_14(DATA, OFFSET, PTR)       \
859    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
860    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
861
862#define vstore_partial_15(DATA, OFFSET, PTR)       \
863    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
864    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
865
866#define vstore_partial_16(DATA, OFFSET, PTR) \
867    vstore16(DATA, OFFSET, PTR);
868
869
870
871
872
873#define convert_float_sat convert_float
874#define convert_float1_sat convert_float
875#define convert_float2_sat convert_float2
876#define convert_float3_sat convert_float3
877#define convert_float4_sat convert_float4
878#define convert_float8_sat convert_float8
879#define convert_float16_sat convert_float16
880#define convert_half_sat convert_float
881#define convert_half1_sat convert_half
882#define convert_half2_sat convert_half2
883#define convert_half3_sat convert_half3
884#define convert_half4_sat convert_half4
885#define convert_half8_sat convert_half8
886#define convert_half16_sat convert_half16
887
888#define convert_float1 convert_float
889#define convert_half1 convert_half
890#define convert_char1 convert_char
891#define convert_uchar1 convert_uchar
892#define convert_short1 convert_short
893#define convert_ushort1 convert_ushort
894#define convert_int1 convert_int
895#define convert_uint1 convert_uint
896#define convert_long1 convert_long
897#define convert_ulong1 convert_ulong
898#define convert_double1 convert_double
899
900#define convert_char1_sat convert_char_sat
901#define convert_uchar1_sat convert_uchar_sat
902#define convert_uchar2_sat convert_uchar2_sat
903#define convert_uchar3_sat convert_uchar3_sat
904#define convert_uchar4_sat convert_uchar4_sat
905#define convert_uchar8_sat convert_uchar8_sat
906#define convert_uchar16_sat convert_uchar16_sat
907#define convert_short1_sat convert_short_sat
908#define convert_ushort1_sat convert_ushort_sat
909#define convert_int1_sat convert_int_sat
910#define convert_uint1_sat convert_uint_sat
911#define convert_long1_sat convert_long_sat
912#define convert_ulong1_sat convert_ulong_sat
913#define convert_double1_sat convert_double_sat
914
915#define VEC_DATA_TYPE_STR(type, size) type##size
916#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
917
918#define CONVERT_STR(x, type) (convert_##type((x)))
919#define CONVERT(x, type) CONVERT_STR(x, type)
920
921#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
922#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
923
924#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
925#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
926
927#define select_vec_dt_uchar(size) uchar##size
928#define select_vec_dt_char(size) char##size
929#define select_vec_dt_ushort(size) ushort##size
930#define select_vec_dt_short(size) short##size
931#define select_vec_dt_half(size) short##size
932#define select_vec_dt_uint(size) uint##size
933#define select_vec_dt_int(size) int##size
934#define select_vec_dt_float(size) int##size
935#define select_vec_dt_ulong(size) ulong##size
936#define select_vec_dt_long(size) long##size
937
938#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
939#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
940#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
941
942#define signed_int_vec_dt_uchar(size) char##size
943#define signed_int_vec_dt_char(size) char##size
944#define signed_int_vec_dt_ushort(size) short##size
945#define signed_int_vec_dt_short(size) short##size
946#define signed_int_vec_dt_half(size) short##size
947#define signed_int_vec_dt_uint(size) int##size
948#define signed_int_vec_dt_int(size) int##size
949#define signed_int_vec_dt_float(size) int##size
950#define signed_int_vec_dt_ulong(size) long##size
951#define signed_int_vec_dt_long(size) long##size
952
953#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
954#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
955#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
956
957#define sum_reduce_1(x) (x)
958#define sum_reduce_2(x) ((x).s0) + ((x).s1)
959#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
960#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
961#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
962#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
963
964#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
965#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
966
967#define prod_reduce_1(x) (x)
968#define prod_reduce_2(x) ((x).s0) * ((x).s1)
969#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
970#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
971#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
972#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
973
974#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
975#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
976
977#define max_reduce_1(x) (x)
978#define max_reduce_2(x) max(((x).s0), ((x).s1))
979#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
980#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
981#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
982#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
983
984#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
985#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
986
987#define VECTOR_DECLARATION(name)     \
988    __global uchar *name##_ptr,      \
989    uint        name##_stride_x, \
990    uint        name##_step_x,   \
991    uint        name##_offset_first_element_in_bytes
992
993#define IMAGE_DECLARATION(name)      \
994    __global uchar *name##_ptr,      \
995    uint        name##_stride_x, \
996    uint        name##_step_x,   \
997    uint        name##_stride_y, \
998    uint        name##_step_y,   \
999    uint        name##_offset_first_element_in_bytes
1000
1001#define TENSOR3D_DECLARATION(name)   \
1002    __global uchar *name##_ptr,      \
1003    uint        name##_stride_x, \
1004    uint        name##_step_x,   \
1005    uint        name##_stride_y, \
1006    uint        name##_step_y,   \
1007    uint        name##_stride_z, \
1008    uint        name##_step_z,   \
1009    uint        name##_offset_first_element_in_bytes
1010
1011#define TENSOR4D_DECLARATION(name)   \
1012    __global uchar *name##_ptr,      \
1013    uint        name##_stride_x, \
1014    uint        name##_step_x,   \
1015    uint        name##_stride_y, \
1016    uint        name##_step_y,   \
1017    uint        name##_stride_z, \
1018    uint        name##_step_z,   \
1019    uint        name##_stride_w, \
1020    uint        name##_step_w,   \
1021    uint        name##_offset_first_element_in_bytes
1022
1023#define TENSOR5D_DECLARATION(name)   \
1024    __global uchar *name##_ptr,      \
1025    uint        name##_stride_x, \
1026    uint        name##_step_x,   \
1027    uint        name##_stride_y, \
1028    uint        name##_step_y,   \
1029    uint        name##_stride_z, \
1030    uint        name##_step_z,   \
1031    uint        name##_stride_w, \
1032    uint        name##_step_w,   \
1033    uint        name##_stride_v, \
1034    uint        name##_step_v,   \
1035    uint        name##_offset_first_element_in_bytes
1036
1037#define CONVERT_TO_VECTOR_STRUCT(name) \
1038    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1039
1040#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1041    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1042
1043#define CONVERT_TO_IMAGE_STRUCT(name) \
1044    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1045
1046#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1047    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1048
1049#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1050    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1051
1052#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1053    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1054
1055#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1056    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1057
1058#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1059    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1060                                 name##_stride_z, name##_step_z)
1061
1062#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1063    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1064
1065#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1066    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1067                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1068
1069#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1070    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1071
1072#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1073    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1074                           name##_stride_z, name##_step_z)
1075
1076
1077typedef struct Vector
1078{
1079    __global uchar *ptr;
1080    int             offset_first_element_in_bytes;
1081    int             stride_x;
1082} Vector;
1083
1084
1085typedef struct Image
1086{
1087    __global uchar *ptr;
1088    int             offset_first_element_in_bytes;
1089    int             stride_x;
1090    int             stride_y;
1091} Image;
1092
1093
1094typedef struct Tensor3D
1095{
1096    __global uchar *ptr;
1097    int             offset_first_element_in_bytes;
1098    int             stride_x;
1099    int             stride_y;
1100    int             stride_z;
1101} Tensor3D;
1102
1103
1104typedef struct Tensor4D
1105{
1106    __global uchar *ptr;
1107    int             offset_first_element_in_bytes;
1108    int             stride_x;
1109    int             stride_y;
1110    int             stride_z;
1111    int             stride_w;
1112} Tensor4D;
1113
1114
1115inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1116{
1117    Vector vector =
1118    {
1119        .ptr                           = ptr,
1120        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1121        .stride_x                      = stride_x,
1122    };
1123    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1124    return vector;
1125}
1126
1127
1128inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1129{
1130    Image img =
1131    {
1132        .ptr                           = ptr,
1133        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1134        .stride_x                      = stride_x,
1135        .stride_y                      = stride_y
1136    };
1137    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1138    return img;
1139}
1140
1141
1142inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1143{
1144    Image img =
1145    {
1146        .ptr                           = ptr,
1147        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1148        .stride_x                      = stride_x,
1149        .stride_y                      = stride_y
1150    };
1151    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1152    return img;
1153}
1154
1155
1156inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1157{
1158    Tensor3D tensor =
1159    {
1160        .ptr                           = ptr,
1161        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1162        .stride_x                      = stride_x,
1163        .stride_y                      = stride_y,
1164        .stride_z                      = stride_z
1165    };
1166    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1167    return tensor;
1168}
1169
1170
1171inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1172{
1173    Tensor3D tensor =
1174    {
1175        .ptr                           = ptr,
1176        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1177        .stride_x                      = stride_x,
1178        .stride_y                      = stride_y,
1179        .stride_z                      = stride_z
1180    };
1181    return tensor;
1182}
1183
1184inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1185                                             uint step_w,
1186                                             uint mod_size)
1187{
1188    Tensor4D tensor =
1189    {
1190        .ptr                           = ptr,
1191        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1192        .stride_x                      = stride_x,
1193        .stride_y                      = stride_y,
1194        .stride_z                      = stride_z,
1195        .stride_w                      = stride_w
1196    };
1197
1198    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1199    return tensor;
1200}
1201
1202
1203inline __global const uchar *vector_offset(const Vector *vec, int x)
1204{
1205    return vec->ptr + x * vec->stride_x;
1206}
1207
1208
1209inline __global uchar *offset(const Image *img, int x, int y)
1210{
1211    return img->ptr + x * img->stride_x + y * img->stride_y;
1212}
1213
1214
1215inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1216{
1217    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1218}
1219
1220
1221inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1222{
1223    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1224}
1225
1226
1227inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1228{
1229    uint num_elements = width * height;
1230
1231    const uint z = index / num_elements;
1232
1233    index %= num_elements;
1234
1235    const uint y = index / width;
1236
1237    index %= width;
1238
1239    const uint x = index;
1240
1241    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1242}
1243
1244#endif  )"