xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/gemmlowp.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1R"(
2
3
4
5
6#ifndef ARM_COMPUTE_HELPER_H
7#define ARM_COMPUTE_HELPER_H
8
9
10
11
// STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z):
// store n rows of an N0-wide vector block to memory. Row i is the variable
// BASENAME##i and is written with one vstore to PTR + i * STRIDE_Y + Z##i
// (Z##i is a per-row extra offset, e.g. for reinterpreted 3D tensors).
// Each macro recursively expands the (n-1)-row variant, so STORE_ROW_16
// emits 16 consecutive vstores. Rows 10..15 use hex suffixes A..F.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90
91
92
// CONVERT_STORE_ROW_n: same layout as STORE_ROW_n, but each N0-wide row
// BASENAME##i is first saturating-converted (CONVERT_SAT) to the vector
// type VEC_DATA_TYPE(DATA_TYPE, N0) before being vstored. Used when the
// accumulator type is wider than the output type (e.g. int -> uchar).
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136
// Store 10 rows, saturating-converting each N0-wide row to DATA_TYPE.
// Fix: the type parameter was misspelled "DATA" while the body references
// DATA_TYPE, so the argument was never substituted and the macro only
// expanded correctly when a global -DDATA_TYPE build option happened to
// name the same type. Renaming the parameter to DATA_TYPE makes it
// consistent with CONVERT_STORE_ROW_1..9 and 11..16, and is
// expansion-identical for existing callers (which pass DATA_TYPE through
// CONVERT_STORE_BLOCK).
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141
// CONVERT_STORE_ROW_11..16: continuation of the convert-store chain;
// rows 10..15 use hex suffixes A..F on BASENAME and Z.
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));




// STORE_BLOCK(M0, ...): store an M0 x N0 block. The _STR level forces M0
// to be macro-expanded before it is token-pasted onto STORE_ROW_.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



// CONVERT_STORE_BLOCK: same as STORE_BLOCK, with saturating conversion.
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182
183
184
// STORE_ROW_PARTIAL_n(N0, STORE_N0, ...): like STORE_ROW_n, but each row
// writes only the first STORE_N0 of its N0 elements via VSTORE_PARTIAL —
// used at right-edge tiles where a full vector store would run past the
// tensor boundary.
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263
264
265
// STORE_BLOCK_PARTIAL: store STORE_M0 rows of STORE_N0 elements each out of
// an M0 x N0 register block. _STR level forces STORE_M0 expansion before
// token-pasting onto STORE_ROW_PARTIAL_.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Boundary-aware store when the tile may be partial in both X and Y:
// PARTIAL_COND_X/Y are runtime conditions selecting the full or partial
// store size in each dimension (4 cases).
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }

// Boundary-aware store when only the X (column) dimension may be partial.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

// Boundary-aware store when only the Y (row) dimension may be partial.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

// STORE_BLOCK_BOUNDARY_AWARE resolves at compile time to the cheapest
// variant, depending on which leftover sizes are zero (build options).
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

// No leftovers in either dimension: plain full-block store.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

// COMPUTE_M0_START_ROW: first output row for work-item y. When a partial
// M0 leftover exists, the last tile is shifted up so it stays in bounds
// (rows overlap instead of overflowing).
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

// STORE_VECTOR_SELECT: store one row of vec_size elements, dropping to a
// `leftover`-element partial store when `cond` is true.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349
350
// Enable optional OpenCL extensions only when both the build flag and the
// device capability macro are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

// Mali GPU architecture identifiers (compared against a GPU_ARCH build option).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

// Token concatenation helper.
#define CONCAT(a, b) a##b

// Forces an extra macro-expansion pass on its argument.
#define EXPAND(x) x

// Clamp x into [min_val, max_val].
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

// REVn: reverse the components of an n-element vector via swizzle.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

// REVERSE(x, s): reverse an s-element vector; _STR level expands s first.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
392
393
394
// ROTs_n(x): rotate the components of an s-element vector right by n
// positions (n == 0 or n == s is the identity), implemented as swizzles.
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

// ROTATE(x, s, n): rotate s-element vector x by n; _STR expands s and n
// before token-pasting.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445
446
447
// V_OFFSn(dt): literal vector of type dt##n holding the sequence 0..n-1
// (useful for building index/offset vectors).
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

// VEC_OFFS(dt, s): s-element offset vector 0..s-1 of base type dt.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

// VLOAD(size): expands to the built-in vload<size> function name.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// VLOAD_PARTIAL(size, load_size): dispatch to vload_partial_<size>_<load_size>
// via the tables below; loads load_size elements into a size-wide vector.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

// No-op placeholder for invalid (vector size, load size) combinations.
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

// Dispatch tables: vload_partial_<vec_size>_<n> maps to the n-element
// loader when 0 < n <= vec_size, otherwise to NO_LOAD (n == 0 is a no-op).
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
580
581
// vload_partial_n(DATA, OFFSET, PTR): load exactly n elements from PTR into
// the low components of vector DATA. Sizes with a native vloadN (1,2,3,4,8,16)
// map directly; the rest are composed from a power-of-two load plus a
// smaller tail load at PTR + 4 or PTR + 8.
// NOTE(review): the composed variants mix vector widths, and vloadN scales
// OFFSET by N elements — the pieces are only mutually consistent when
// OFFSET == 0, which is what callers appear to pass; confirm before reusing
// with a non-zero OFFSET.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// NOTE(review): 13/14/15 pass the 8-wide swizzle .s89ABCDEF where only the
// first 5/6/7 lanes are written by the inner macro (nested swizzles select
// from it), so components D/E/F (and beyond the tail) stay untouched —
// looks intentional but differs from the narrower swizzles used above.
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
639
640
641
// PIXEL_UNITn: number of RGBA image texels needed to hold an n-wide vector
// (each texel carries 4 values, so pixel units = vec_size / 4).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4


// Two-level expansion so vec_size may itself be a macro.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)


// read_image2d_<type>x<n>: read n horizontally adjacent texels starting at
// (x_coord, y_coord) and concatenate them into a 4n-wide vector.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// write_image2d_<type>x<n>: scatter a 4n-wide vector into n adjacent texels.
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif


// Dispatchers: paste data_type and n0 into the concrete reader/writer above.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)


#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678
// VSTORE(size) expands to vstore<size>; two levels so size may be a macro.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// <type>1 aliases let VEC_DATA_TYPE(type, 1) name the plain scalar type.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadn/vstoren so size-1 code paths stay uniform.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA


// VSTORE_PARTIAL(size, store_size) -> vstore_partial_<size>_<store_size>
// (resolved through the dispatch tables below).
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// NO_STORE: placeholder for invalid (size, store_size) pairs; expands to an
// empty block so the call site compiles but emits nothing.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
704
705
// vstore_partial_<SIZE>_<STORE_SIZE> dispatch tables: map (vector width,
// element count to store) onto a concrete partial-store macro. Entries where
// STORE_SIZE is 0 or exceeds SIZE resolve to NO_STORE (empty expansion).
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
813
814
// vstore_partial_X(DATA, OFFSET, PTR): store exactly X scalar elements from
// the lowest X lanes of DATA to PTR, mirroring the vload_partial_X family.
// Upper-half tails pass a swizzle (.s89abcdef — hex lanes, here lowercase)
// so nested lane names resolve relative to that swizzled view.
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

// 5..7 = one vstore4 plus a 1/2/3-element tail at PTR + 4.
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

// 9..15 = one vstore8 plus a partial tail at PTR + 8.
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
872
873
874
875
876
// Float/half types have no _sat conversion in OpenCL C, so map the _sat
// spellings used by generic code onto the plain converters.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): convert_half_sat maps to convert_float (not convert_half),
// unlike every neighbouring alias — looks like a typo, but it matches the
// upstream helper header; confirm against ComputeLibrary before changing.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Size-1 converters: strip the trailing 1 so VEC size 1 uses scalar convert.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Size-1 saturating converters (the uchar2..16 entries are identity remaps).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

// VEC_DATA_TYPE(float, 4) -> float4; two levels so args may be macros.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// Generic conversion wrappers built on the aliases above.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
930
// select_vec_dt_<type>: integer vector type of the same width/size usable as
// the predicate argument of select() for data of <type> (half -> short,
// float -> int, per OpenCL's relational-result-type rules).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// signed_int_vec_dt_<type>: signed integer vector of matching element width.
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960
// Horizontal reductions over a vector's lanes, built by recursive halving
// (the _3 variants handle the non-power-of-two case explicitly).
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Product of all lanes.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// Maximum over all lanes.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
990
// Kernel-argument declaration helpers: expand to the flattened parameter list
// (pointer, per-dimension stride/step pairs, first-element byte offset) that
// the host side passes for an N-dimensional tensor named `name`.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
1040
// CONVERT_TO_*_STRUCT: build a Vector/Image/Tensor3D/Tensor4D struct from the
// flattened kernel arguments declared by the *_DECLARATION macros, advancing
// ptr to this work-item's element. The *_NO_STEP variants pass 0 for the
// steps so ptr stays at the tensor origin.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other NO_STEP variants this one still passes
// name##_step_z (only x and y are zeroed) — matches upstream; confirm intent.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
// (same tokens as above, so legal per the C preprocessor) — harmless.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

// Builds the struct without any work-item-dependent pointer update.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1079
1080
/** 1D tensor view: base byte pointer plus per-dimension byte strides. */
typedef struct Vector
{
    __global uchar *ptr;                           // current element (bytes)
    int             offset_first_element_in_bytes; // byte offset of first element
    int             stride_x;                      // bytes between x neighbours
} Vector;


/** 2D tensor view. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;


/** 3D tensor view. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;


/** 4D tensor view. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;
1117
1118
/** Build a Vector and advance its ptr to this work-item's first element:
 *  ptr += offset_first_element_in_bytes + global_id(0) * step_x.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    // step_x == 0 (NO_STEP variants) leaves ptr at the tensor origin.
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}
1130
1131
/** Build an Image and advance its ptr to this work-item's element using
 *  global ids 0 (x) and 1 (y) scaled by the respective steps.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}
1144
1145
/** View a 3D tensor as an Image: the z displacement (global id 2 * step_z)
 *  is folded into ptr, so the returned struct only keeps x/y strides.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}
1158
1159
/** Build a Tensor3D and advance its ptr by the work-item's (x, y, z) steps
 *  taken from global ids 0, 1 and 2.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}
1173
1174
/** Build a Tensor3D without advancing ptr — no work-item offset is applied
 *  (step/offset parameters are accepted for signature parity but unused here
 *  beyond storing offset_first_element_in_bytes).
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}
1187
/** Build a Tensor4D and advance ptr for this work-item. Global id 2 encodes
 *  both z and w: z = gid2 % mod_size, w = gid2 / mod_size.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
1205
1206
1207inline __global const uchar *vector_offset(const Vector *vec, int x)
1208{
1209    return vec->ptr + x * vec->stride_x;
1210}
1211
1212
1213inline __global uchar *offset(const Image *img, int x, int y)
1214{
1215    return img->ptr + x * img->stride_x + y * img->stride_y;
1216}
1217
1218
1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220{
1221    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222}
1223
1224
1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226{
1227    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228}
1229
1230
1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232{
1233    uint num_elements = width * height;
1234
1235    const uint z = index / num_elements;
1236
1237    index %= num_elements;
1238
1239    const uint y = index / width;
1240
1241    index %= width;
1242
1243    const uint x = index;
1244
1245    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246}
1247
1248#endif
1249
// MLA(a, b, c) = b * c + a. On Bifrost it maps to the fused fma (note the
// argument rotation: fma(c, b, a) == c * b + a); elsewhere plain mul-add.
#if GPU_ARCH == GPU_ARCH_BIFROST
#define MLA(a, b, c) (fma(c, b, a))
#else
#define MLA(a, b, c) ((b) * (c) + (a))
#endif
1255
1256
// Element-wise activation functions. All share the signature
// (DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL); ops that don't use A/B ignore them.
// ACTIVATION(op, ...) pastes `op` onto `_op` to pick the implementation.

// x * relu6(x + 3) / 6 — 0.166666667 approximates 1/6.
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))


#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))


#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))


#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))


// Bounded ReLU: clamp to [0, A].
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))


// Lower/upper bounded ReLU: clamp to [B, A].
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))


// Leaky ReLU with negative slope A.
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))


// Soft ReLU: log(1 + e^x).
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))


// ELU: x for x >= 0, A * (e^x - 1) otherwise.
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))


#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))


#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)


#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))


// Linear: A * x + B (via MLA, fused on Bifrost).
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))


// GELU: x/2 * (1 + erf(x / sqrt(2))); 1.41421356237 approximates sqrt(2).
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))


#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304
// NOTE(review): second, duplicated copy of the helper header (this .clembed
// concatenates included files). ARM_COMPUTE_HELPER_H was already defined at
// the top of the file, so the preprocessor skips this whole guarded region.
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H




// STORE_ROW_n: store rows 0..n-1 of BASENAME (BASENAME##0, BASENAME##1, ...)
// as N0-wide vectors at PTR + row * STRIDE_Y + Z##row; defined recursively.
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Rows 10..15 use hex row suffixes (A..F) in the BASENAME/Z token pastes.
#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379
1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382    VSTORE(N0)                                                  \
1383    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384
1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387    VSTORE(N0)                                                  \
1388    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389
1390
1391
// CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z):
// Same row-by-row store chain as STORE_ROW_n, but each row is first
// saturating-converted (CONVERT_SAT) to an N0-wide vector of DATA_TYPE
// before being vstored to PTR + row * STRIDE_Y + Z##row.
1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1393    VSTORE(N0)                                                         \
1394    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1395
1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1397    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1398    VSTORE(N0)                                                         \
1399    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1400
1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1402    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1403    VSTORE(N0)                                                         \
1404    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1405
1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1407    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1408    VSTORE(N0)                                                         \
1409    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1410
1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1412    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1413    VSTORE(N0)                                                         \
1414    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1415
1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1417    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1418    VSTORE(N0)                                                         \
1419    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1420
1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1422    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1423    VSTORE(N0)                                                         \
1424    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1425
1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1427    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1428    VSTORE(N0)                                                         \
1429    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430
1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433    VSTORE(N0)                                                         \
1434    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435
// CONVERT_STORE_ROW_10: saturating-convert rows 0..9 to DATA_TYPE and store them.
// Fix: the second parameter was declared `DATA` instead of `DATA_TYPE`, so the
// DATA_TYPE tokens in the body were never substituted from the macro argument
// (they only resolved if a global DATA_TYPE happened to be defined at build
// time). Renamed to match CONVERT_STORE_ROW_1..9 and _11.._16.
1436#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1437    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1438    VSTORE(N0)                                                          \
1439    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440
// CONVERT_STORE_ROW_11..16: continuation of the convert-and-store chain;
// rows 10..15 use hexadecimal suffixes A..F on BASENAME and Z.
1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443    VSTORE(N0)                                                          \
1444    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445
1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448    VSTORE(N0)                                                          \
1449    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450
1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453    VSTORE(N0)                                                          \
1454    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455
1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458    VSTORE(N0)                                                          \
1459    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460
1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463    VSTORE(N0)                                                          \
1464    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465
1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468    VSTORE(N0)                                                          \
1469    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470
1471
1472
1473
// STORE_BLOCK / CONVERT_STORE_BLOCK: store an M0 x N0 block by dispatching to
// STORE_ROW_<M0> / CONVERT_STORE_ROW_<M0>. The extra *_STR level forces M0 to
// be macro-expanded before ## token pasting selects the row macro.
1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476
1477
1478
1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481
1482
1483
// STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z):
// Same recursive row chain as STORE_ROW_n, but each row is written with
// VSTORE_PARTIAL(N0, STORE_N0), i.e. only the first STORE_N0 lanes of the
// N0-wide row vector are stored (for right-edge leftover columns).
1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1485    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1486    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1487
1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1489    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1490    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1491    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1492
1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1494    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1495    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1496    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1497
1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1499    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1500    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1501    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1502
1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1504    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1505    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1506    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1507
1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1509    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1510    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1511    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1512
1513#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1514    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1515    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1516    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1517
1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1519    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1520    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1521    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1522
1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1524    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1525    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1526    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1527
1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1529    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1530    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1531    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1532
// Rows 10..15 use hexadecimal row suffixes A..F.
1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1534    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1535    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1536    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1537
1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1539    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1540    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1541    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1542
1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1544    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1545    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1546    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1547
1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1549    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1550    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1551    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1552
1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1554    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1555    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1556    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1557
1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1559    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1560    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1561    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562
1563
1564
// STORE_BLOCK_PARTIAL: store a STORE_M0 x STORE_N0 sub-block of an M0 x N0
// register block (rows dispatched via STORE_ROW_PARTIAL_<STORE_M0>, columns
// limited by STORE_N0). The *_STR level forces STORE_M0 expansion before ##.
1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567
// Runtime selection of the stored sub-block size: PARTIAL_COND_Y /
// PARTIAL_COND_X flag whether this work-item sits on the bottom / right
// boundary; the matching branch shrinks M0 and/or N0 to the PARTIAL_STORE_*
// leftover sizes.
1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1569    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1570    {                                                                                                                                                     \
1571        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1572    }                                                                                                                                                     \
1573    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1574    {                                                                                                                                                     \
1575        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1576    }                                                                                                                                                     \
1577    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1578    {                                                                                                                                                     \
1579        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1580    }                                                                                                                                                     \
1581    else                                                                                                                                                  \
1582    {                                                                                                                                                     \
1583        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1584    }
1585
// Partial store along X only (right boundary).
1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1587    if(!(PARTIAL_COND_X))                                                                                         \
1588    {                                                                                                             \
1589        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1590    }                                                                                                             \
1591    else                                                                                                          \
1592    {                                                                                                             \
1593        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1594    }
1595
// Partial store along Y only (bottom boundary).
1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1597    if(!(PARTIAL_COND_Y))                                                                                         \
1598    {                                                                                                             \
1599        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1600    }                                                                                                             \
1601    else                                                                                                          \
1602    {                                                                                                             \
1603        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1604    }
1605
1606
// STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store
// variant based on the build-time leftover sizes PARTIAL_STORE_M0/N0.
// The runtime boundary checks are only emitted for the dimensions that
// actually have a non-zero leftover.
1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1608
1609
// No leftovers in either dimension: plain full-block store, no branches.
1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1611
1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1613    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1614
// Leftover rows only: branch on the Y condition.
1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1616
1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1618    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1619
// Leftover columns only: branch on the X condition.
1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1621
1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1623    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1624
// Leftovers in both dimensions: full four-way branch.
1625#else
1626
1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1628    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1629
1630#endif
1631
1632#endif
1633
1634
// COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0): start row for work-item y
// processing M0 rows at a time. When PARTIAL_STORE_M0 is defined, the start
// is pulled back by (M0 - PARTIAL_STORE_M0) % M0, clamped at 0 — presumably
// so the first/boundary block overlaps instead of reading or writing past the
// tensor edge when M is not a multiple of M0 (NOTE(review): confirm against
// the kernels that call this).
1635#if defined(PARTIAL_STORE_M0)
1636
1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1638    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1639#else
// No leftover handling: simple y * M0.
1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1641    ((uint)(y * M0))
1642#endif
1643
1644
1645
// Store one row vector of vec_size lanes, writing only `leftover` lanes when
// `cond` is true (single-row wrapper over STORE_BLOCK_PARTIAL_IN_X; stride
// and Z offset are 0).
1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1647    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648
1649
// Enable optional OpenCL extensions only when both the build requests them
// (ARM_COMPUTE_* option) and the device compiler advertises them (cl_* macro).
1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1652#endif
1653
1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1656#endif
1657
1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1660#endif
1661
1662#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1663#pragma OPENCL EXTENSION cl_arm_printf : enable
1664#endif
1665
// Mali GPU architecture identifiers (compared against a GPU_ARCH build option
// elsewhere).
1666#define GPU_ARCH_MIDGARD 0x100
1667#define GPU_ARCH_BIFROST 0x200
1668#define GPU_ARCH_VALHALL 0x300
1669
1670
// Token-paste two preprocessor tokens.
1671#define CONCAT(a, b) a##b
1672
1673
// Identity macro; forces one extra expansion of its argument.
1674#define EXPAND(x) x
1675
1676
// Clamp x into [min_val, max_val].
1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678
1679
// REVn: reverse the lanes of an n-component vector via swizzle.
1680#define REV1(x) ((x))
1681#define REV2(x) ((x).s10)
1682#define REV3(x) ((x).s210)
1683#define REV4(x) ((x).s3210)
1684#define REV8(x) ((x).s76543210)
1685#define REV16(x) ((x).sFEDCBA9876543210)
1686
1687
1688
// REVERSE(x, s): dispatch to REV<s>; the *_STR level expands s before pasting.
1689#define REVERSE_STR(x, s) REV##s((x))
1690#define REVERSE(x, s) REVERSE_STR(x, s)
1691
1692
1693
1694#define ROT1_0(x) ((x))
1695#define ROT1_1(x) ((x))
1696
1697#define ROT2_0(x) ((x))
1698#define ROT2_1(x) ((x).s10)
1699#define ROT2_2(x) ((x))
1700
1701#define ROT3_0(x) ((x))
1702#define ROT3_1(x) ((x).s201)
1703#define ROT3_2(x) ((x).s120)
1704#define ROT3_3(x) ((x))
1705
1706#define ROT4_0(x) ((x))
1707#define ROT4_1(x) ((x).s3012)
1708#define ROT4_2(x) ((x).s2301)
1709#define ROT4_3(x) ((x).s1230)
1710#define ROT4_4(x) ((x))
1711
1712#define ROT8_0(x) ((x))
1713#define ROT8_1(x) ((x).s70123456)
1714#define ROT8_2(x) ((x).s67012345)
1715#define ROT8_3(x) ((x).s56701234)
1716#define ROT8_4(x) ((x).s45670123)
1717#define ROT8_5(x) ((x).s34567012)
1718#define ROT8_6(x) ((x).s23456701)
1719#define ROT8_7(x) ((x).s12345670)
1720#define ROT8_8(x) ((x))
1721
1722#define ROT16_0(x) ((x))
1723#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728#define ROT16_6(x) ((x).sABCDEF0123456789)
1729#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738#define ROT16_16(x) ((x))
1739
1740
1741
1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744
1745
1746
// V_OFFSn(dt): vector literal of type dt##n holding the lane offsets 0..n-1.
1747#define V_OFFS1(dt) (dt##1)(0)
1748#define V_OFFS2(dt) (dt##2)(0, 1)
1749#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753
1754
1755
// VEC_OFFS(dt, s): dispatch to V_OFFS<s>(dt) with s expanded first.
1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758
1759
// VLOAD(size): build the vload<size> builtin name with size expanded first.
1760#define VLOAD_STR(size) vload##size
1761#define VLOAD(size) VLOAD_STR(size)
1762
1763
// VLOAD_PARTIAL(size, load_size): select the vload_partial_<size>_<load_size>
// dispatch macro defined below.
1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766
// NO_LOAD: empty statement block used for (vector size, load size) pairs that
// are out of range — load size 0 or larger than the vector width.
1767#define NO_LOAD(data, offs, ptr) \
1768    {                            \
1769    }
1770
1771
// vload_partial_<size>_<load_size> dispatch tables: map a vector width and a
// lane count to the vload_partial_<load_size> implementation below, or to
// NO_LOAD when the combination is invalid.
1772#define vload_partial_1_0 NO_LOAD
1773#define vload_partial_1_1 vload1
1774#define vload_partial_1_2 NO_LOAD
1775#define vload_partial_1_3 NO_LOAD
1776#define vload_partial_1_4 NO_LOAD
1777#define vload_partial_1_5 NO_LOAD
1778#define vload_partial_1_6 NO_LOAD
1779#define vload_partial_1_7 NO_LOAD
1780#define vload_partial_1_8 NO_LOAD
1781#define vload_partial_1_9 NO_LOAD
1782#define vload_partial_1_10 NO_LOAD
1783#define vload_partial_1_11 NO_LOAD
1784#define vload_partial_1_12 NO_LOAD
1785#define vload_partial_1_13 NO_LOAD
1786#define vload_partial_1_14 NO_LOAD
1787#define vload_partial_1_15 NO_LOAD
1788#define vload_partial_1_16 NO_LOAD
1789
1790#define vload_partial_2_0 NO_LOAD
1791#define vload_partial_2_1 vload_partial_1
1792#define vload_partial_2_2 vload_partial_2
1793#define vload_partial_2_3 NO_LOAD
1794#define vload_partial_2_4 NO_LOAD
1795#define vload_partial_2_5 NO_LOAD
1796#define vload_partial_2_6 NO_LOAD
1797#define vload_partial_2_7 NO_LOAD
1798#define vload_partial_2_8 NO_LOAD
1799#define vload_partial_2_9 NO_LOAD
1800#define vload_partial_2_10 NO_LOAD
1801#define vload_partial_2_11 NO_LOAD
1802#define vload_partial_2_12 NO_LOAD
1803#define vload_partial_2_13 NO_LOAD
1804#define vload_partial_2_14 NO_LOAD
1805#define vload_partial_2_15 NO_LOAD
1806#define vload_partial_2_16 NO_LOAD
1807
1808#define vload_partial_3_0 NO_LOAD
1809#define vload_partial_3_1 vload_partial_1
1810#define vload_partial_3_2 vload_partial_2
1811#define vload_partial_3_3 vload_partial_3
1812#define vload_partial_3_4 NO_LOAD
1813#define vload_partial_3_5 NO_LOAD
1814#define vload_partial_3_6 NO_LOAD
1815#define vload_partial_3_7 NO_LOAD
1816#define vload_partial_3_8 NO_LOAD
1817#define vload_partial_3_9 NO_LOAD
1818#define vload_partial_3_10 NO_LOAD
1819#define vload_partial_3_11 NO_LOAD
1820#define vload_partial_3_12 NO_LOAD
1821#define vload_partial_3_13 NO_LOAD
1822#define vload_partial_3_14 NO_LOAD
1823#define vload_partial_3_15 NO_LOAD
1824#define vload_partial_3_16 NO_LOAD
1825
1826#define vload_partial_4_0 NO_LOAD
1827#define vload_partial_4_1 vload_partial_1
1828#define vload_partial_4_2 vload_partial_2
1829#define vload_partial_4_3 vload_partial_3
1830#define vload_partial_4_4 vload_partial_4
1831#define vload_partial_4_5 NO_LOAD
1832#define vload_partial_4_6 NO_LOAD
1833#define vload_partial_4_7 NO_LOAD
1834#define vload_partial_4_8 NO_LOAD
1835#define vload_partial_4_9 NO_LOAD
1836#define vload_partial_4_10 NO_LOAD
1837#define vload_partial_4_11 NO_LOAD
1838#define vload_partial_4_12 NO_LOAD
1839#define vload_partial_4_13 NO_LOAD
1840#define vload_partial_4_14 NO_LOAD
1841#define vload_partial_4_15 NO_LOAD
1842#define vload_partial_4_16 NO_LOAD
1843
1844#define vload_partial_8_0 NO_LOAD
1845#define vload_partial_8_1 vload_partial_1
1846#define vload_partial_8_2 vload_partial_2
1847#define vload_partial_8_3 vload_partial_3
1848#define vload_partial_8_4 vload_partial_4
1849#define vload_partial_8_5 vload_partial_5
1850#define vload_partial_8_6 vload_partial_6
1851#define vload_partial_8_7 vload_partial_7
1852#define vload_partial_8_8 vload_partial_8
1853#define vload_partial_8_9 NO_LOAD
1854#define vload_partial_8_10 NO_LOAD
1855#define vload_partial_8_11 NO_LOAD
1856#define vload_partial_8_12 NO_LOAD
1857#define vload_partial_8_13 NO_LOAD
1858#define vload_partial_8_14 NO_LOAD
1859#define vload_partial_8_15 NO_LOAD
1860#define vload_partial_8_16 NO_LOAD
1861
1862#define vload_partial_16_0 NO_LOAD
1863#define vload_partial_16_1 vload_partial_1
1864#define vload_partial_16_2 vload_partial_2
1865#define vload_partial_16_3 vload_partial_3
1866#define vload_partial_16_4 vload_partial_4
1867#define vload_partial_16_5 vload_partial_5
1868#define vload_partial_16_6 vload_partial_6
1869#define vload_partial_16_7 vload_partial_7
1870#define vload_partial_16_8 vload_partial_8
1871#define vload_partial_16_9 vload_partial_9
1872#define vload_partial_16_10 vload_partial_10
1873#define vload_partial_16_11 vload_partial_11
1874#define vload_partial_16_12 vload_partial_12
1875#define vload_partial_16_13 vload_partial_13
1876#define vload_partial_16_14 vload_partial_14
1877#define vload_partial_16_15 vload_partial_15
1878#define vload_partial_16_16 vload_partial_16
1879
1880
// vload_partial_n(DATA, OFFSET, PTR): load n scalars from PTR into the first
// n lanes of DATA. Widths 1/2/3/4/8/16 map directly onto the vloadN builtins;
// the remaining widths are composed from a 4- or 8-lane load plus a smaller
// load of the tail at PTR + 4 / PTR + 8.
// NOTE(review): vload1 is not a standard OpenCL builtin — presumably defined
// elsewhere in the library; confirm.
1881#define vload_partial_1(DATA, OFFSET, PTR) \
1882    DATA.s0 = vload1(OFFSET, PTR);
1883
1884#define vload_partial_2(DATA, OFFSET, PTR) \
1885    DATA.s01 = vload2(OFFSET, PTR);
1886
1887#define vload_partial_3(DATA, OFFSET, PTR) \
1888    DATA.s012 = vload3(OFFSET, PTR);
1889
1890#define vload_partial_4(DATA, OFFSET, PTR) \
1891    DATA.s0123 = vload4(OFFSET, PTR);
1892
1893#define vload_partial_5(DATA, OFFSET, PTR)    \
1894    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895    DATA.s4 = vload1(OFFSET, PTR + 4);
1896
1897#define vload_partial_6(DATA, OFFSET, PTR)    \
1898    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900
1901#define vload_partial_7(DATA, OFFSET, PTR)    \
1902    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904
1905#define vload_partial_8(DATA, OFFSET, PTR) \
1906    DATA.s01234567 = vload8(OFFSET, PTR);
1907
1908#define vload_partial_9(DATA, OFFSET, PTR)        \
1909    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910    DATA.s8 = vload1(OFFSET, PTR + 8);
1911
1912#define vload_partial_10(DATA, OFFSET, PTR)       \
1913    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915
1916#define vload_partial_11(DATA, OFFSET, PTR)       \
1917    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919
1920#define vload_partial_12(DATA, OFFSET, PTR)       \
1921    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923
// Widths 13..15 pass the full high half (.s89ABCDEF) to the smaller macro,
// which then sub-swizzles it (e.g. .s89ABCDEF.s0123 selects lanes 8..B), so
// only 5/6/7 tail lanes are actually written.
1924#define vload_partial_13(DATA, OFFSET, PTR)       \
1925    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927
1928#define vload_partial_14(DATA, OFFSET, PTR)       \
1929    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931
1932#define vload_partial_15(DATA, OFFSET, PTR)       \
1933    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935
1936#define vload_partial_16(DATA, OFFSET, PTR) \
1937    DATA = vload16(OFFSET, PTR);
1938
1939
1940
// PIXEL_UNIT<vec_size>: number of 4-component image pixels covered by a
// vector of vec_size elements (vec_size / 4).
1941#define PIXEL_UNIT4 1
1942#define PIXEL_UNIT8 2
1943#define PIXEL_UNIT16 4
1944
1945
// Dispatch with vec_size expanded before token pasting.
1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948
1949
/* read_image2d_<type>x<n>: read n consecutive 4-component pixels starting at
 * (x_coord, y_coord) and pack them into a single 4n-wide vector. The trailing
 * semicolon is part of the macro, so call sites use these without one. */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

/* Half-precision variants are only available when the device/bld enables fp16. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>: inverse of the readers above — split a 4n-wide
 * vector into n consecutive 4-component pixel writes. */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif


/* Generic dispatchers: expand data_type/n0 first, then paste into the
 * concrete read/write macro name above. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)


#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977
/* VSTORE(size) -> vstore<size>, with size macro-expanded first. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* <type>1 aliases let size-generic macros (VEC_DATA_TYPE(type, 1)) collapse
 * to the scalar type, since OpenCL has no built-in 1-component vectors. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Scalar counterparts of vloadN/vstoreN so VLOAD(1)/VSTORE(1) also work. */
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
1995
1996
/* VSTORE_PARTIAL(size, store_size) stores the lowest store_size elements of a
 * size-wide vector, via the vstore_partial_<size>_<store_size> dispatch
 * tables below. Invalid combinations (store_size == 0 or > size) map to
 * NO_STORE, which expands to an empty block. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Deliberate no-op with the same call shape as vstore_partial_N. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/* Dispatch table for vectors of size 1. */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* Dispatch table for vectors of size 2. */
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* Dispatch table for vectors of size 3. */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* Dispatch table for vectors of size 4. */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* Dispatch table for vectors of size 8. */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* Dispatch table for vectors of size 16 (all store sizes valid). */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
2112
2113
/* vstore_partial_N implementations: store the lowest N elements of DATA.
 * 1/2/3/4/8/16 map directly onto native vstoreN; the remaining sizes are
 * composed of a power-of-two store plus a partial store of the tail
 * (mirroring the vload_partial_N composition above). */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

/* 13..15 pass the full upper-half swizzle; the nested macro's sub-swizzles
 * then select the correct tail elements (e.g. .s89abcdef.s0123 == .s89ab). */
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
2171
2172
2173
2174
2175
/* Saturating-convert aliases: float/half conversions have no *_sat variants
 * in OpenCL (saturation is meaningless for float destinations), so they map
 * to the plain converts. This lets CONVERT_SAT(x, type) be used uniformly. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* NOTE(review): maps to convert_float rather than convert_half — looks
 * inconsistent with convert_half1_sat below, but matches the upstream
 * generated source; confirm against helpers.h before changing. */
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Scalar (size-1) aliases so CONVERT(x, type##1) collapses to the scalar
 * built-in converts. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Size-1 saturating aliases. The convert_ucharN_sat entries expand to
 * themselves; that is well-defined (the preprocessor does not re-expand a
 * macro within its own expansion) and leaves the built-in names in place. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
2217
/* VEC_DATA_TYPE(type, size) -> type##size (e.g. float, 4 -> float4), with
 * both arguments macro-expanded before pasting. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* Generic (optionally saturating / rounding) conversion wrappers over the
 * convert_* built-ins and aliases defined above. */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/* SELECT_VEC_DATA_TYPE(type, size): integer vector type of the same element
 * width as `type` (half -> short, float -> int), suitable as the predicate
 * operand of vector select(). */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/* SIGNED_INT_VEC_DATA_TYPE(type, size): signed integer vector type of the
 * same element width as `type` (unsigned and float types map to the signed
 * integer of matching width). */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259
/* Horizontal reductions over vector components.
 * sum_reduce_N / prod_reduce_N / max_reduce_N fold the N components of a
 * vector into a scalar; SUM_REDUCE / PROD_REDUCE / MAX_REDUCE dispatch on a
 * (macro-expanded) size argument.
 *
 * Fix: the sum/prod expansions were previously bare `a + b` / `a * b`
 * expressions with no enclosing parentheses, so embedding them in a larger
 * expression (e.g. `2 * SUM_REDUCE(x, 2)` or `a / PROD_REDUCE(x, 2)`)
 * parsed incorrectly. Each expansion is now fully parenthesized (CERT
 * PRE01-C). max_reduce_* was already safe, being a single max() call. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_2((x).s01) + sum_reduce_2((x).s23))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF))

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_2((x).s01) * prod_reduce_2((x).s23))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF))

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/* max_reduce_* expand to max() call expressions, which are inherently
 * parenthesized, so no change was needed here. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289
/* Kernel-parameter expansion macros: each one expands to the full list of
 * formal parameters (base pointer, per-dimension stride/step pairs, and the
 * byte offset of the first element) for a tensor of the given rank, with
 * every identifier prefixed by `name`. Used directly inside __kernel
 * signatures. */
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
2339
/* CONVERT_TO_*_STRUCT(name): build the matching Vector/Image/Tensor3D/4D
 * view from parameters declared with the *_DECLARATION macros above. The
 * NO_STEP variants pass 0 for the step arguments so the returned pointer is
 * NOT advanced per work-item. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* NOTE(review): unlike the other NO_STEP variants, this one still passes
 * name##_step_z (only x/y steps are zeroed) — matches upstream; confirm
 * whether the z advance is intentional before relying on it. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/* NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
 * above — harmless (identical macro redefinition is allowed) but redundant. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
2378
2379
/* 1D tensor view: base pointer plus x stride (all offsets/strides in bytes). */
typedef struct Vector
{
    __global uchar *ptr;                           // current element pointer
    int             offset_first_element_in_bytes; // byte offset of element 0 from ptr's base
    int             stride_x;                      // bytes between adjacent x elements
} Vector;


/* 2D tensor view. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y; // bytes between adjacent rows
} Image;


/* 3D tensor view. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z; // bytes between adjacent 2D slices
} Tensor3D;


/* 4D tensor view. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w; // bytes between adjacent 3D volumes
} Tensor4D;
2416
2417
2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419{
2420    Vector vector =
2421    {
2422        .ptr                           = ptr,
2423        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424        .stride_x                      = stride_x,
2425    };
2426    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427    return vector;
2428}
2429
2430
2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432{
2433    Image img =
2434    {
2435        .ptr                           = ptr,
2436        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437        .stride_x                      = stride_x,
2438        .stride_y                      = stride_y
2439    };
2440    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441    return img;
2442}
2443
2444
2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446{
2447    Image img =
2448    {
2449        .ptr                           = ptr,
2450        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451        .stride_x                      = stride_x,
2452        .stride_y                      = stride_y
2453    };
2454    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455    return img;
2456}
2457
2458
2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460{
2461    Tensor3D tensor =
2462    {
2463        .ptr                           = ptr,
2464        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465        .stride_x                      = stride_x,
2466        .stride_y                      = stride_y,
2467        .stride_z                      = stride_z
2468    };
2469    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470    return tensor;
2471}
2472
2473
2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475{
2476    Tensor3D tensor =
2477    {
2478        .ptr                           = ptr,
2479        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480        .stride_x                      = stride_x,
2481        .stride_y                      = stride_y,
2482        .stride_z                      = stride_z
2483    };
2484    return tensor;
2485}
2486
/* Build a Tensor4D view for the current work-item. The z and w dimensions
 * are folded into global id 2: z = gid(2) % mod_size, w = gid(2) / mod_size,
 * so mod_size is the number of z slices per w batch. */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    // Advance to this work-item's first element, unfolding gid(2) into (z, w).
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}
2504
2505
2506inline __global const uchar *vector_offset(const Vector *vec, int x)
2507{
2508    return vec->ptr + x * vec->stride_x;
2509}
2510
2511
2512inline __global uchar *offset(const Image *img, int x, int y)
2513{
2514    return img->ptr + x * img->stride_x + y * img->stride_y;
2515}
2516
2517
2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519{
2520    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521}
2522
2523
2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525{
2526    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527}
2528
2529
2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531{
2532    uint num_elements = width * height;
2533
2534    const uint z = index / num_elements;
2535
2536    index %= num_elements;
2537
2538    const uint y = index / width;
2539
2540    index %= width;
2541
2542    const uint x = index;
2543
2544    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545}
2546
2547#endif
2548
2549
/* SCALAR_ACCESS(offset, n0, x): select the n0-component sub-vector of x that
 * starts at component `offset`, via the scalar_access_<offset>_<n0> swizzle
 * table below. Only the (offset, n0) pairs actually used by the kernels are
 * defined. */
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)


/* offset 0 */
#define scalar_access_0_1(x) ((x).s0)
#define scalar_access_0_2(x) ((x).s01)
#define scalar_access_0_3(x) ((x).s012)
#define scalar_access_0_4(x) ((x).s0123)
#define scalar_access_0_8(x) ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)


/* offset 1 */
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)


/* offset 2 */
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)


/* offset 3 */
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)


/* offset 4 */
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)


/* offset 8 */
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)


/* offset 12 */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)


/* offset 16 — NOTE(review): selects .sF (component 15), i.e. the last
 * component, despite the name suggesting component 16; matches upstream. */
#define scalar_access_16_1(x) ((x).sF)
2604
2605
2606#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2607    ({})
2608
2609#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2610    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
2611
2612#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2613    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2614    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
2615
2616#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2617    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2618    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
2619
2620#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2621    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2622    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
2623
2624#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2625    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2626    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
2627
2628#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2629    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2630    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
2631
// LOAD_TENSOR_ROW_n (n = 7..16): recursively-defined row loaders. Each level
// expands LOAD_TENSOR_ROW_(n-1) and then loads one more row of N0 elements of
// DATA_TYPE from PTR + i * STRIDE_Y + Z##i into the row variable BASENAME##i
// via SCALAR_ACCESS (defined elsewhere in this file; presumably it selects
// columns [COL_OFFSET, COL_OFFSET + N0) of the row — confirm at its definition).
// Rows 10..15 use the single-character suffixes A..F so the pasted name stays
// one token. Z##i is the per-row extra offset (cross-plane addressing).
2632#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2633    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2634    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
2635
2636#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2637    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2638    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2639
2640#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2641    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2642    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2643
2644#define LOAD_TENSOR_ROW_10(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2644#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2645    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
2646    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2647
2648#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2649    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2650    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
2651
2652#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2653    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2654    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
2655
2656#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2657    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2658    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2659
2660#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2661    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2662    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2663
2664#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2665    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2666    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2667
2668#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2669    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2670    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2671
2672
2673
// LOAD_TENSOR(M0, ...): dispatch to LOAD_TENSOR_ROW_<M0>. The two-level
// _STR indirection forces M0 to be macro-expanded before ## pastes the name.
2674#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2675#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676
2677
2678
// LOAD_TENSOR_M0Xn: load an M0-row by n-column block 'a'. For n that is a
// valid OpenCL vector width (1, 2, 3, 4, 8, 16) this is a single LOAD_TENSOR;
// other widths (defined further below) decompose into a wide load plus a
// remainder load at the matching column offset.
// n == 0: nothing to load; ({}) is an empty GNU statement expression so the
// expansion is still a valid expression statement.
2679#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2680    ({})
2681
2682#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2683    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2684
2685#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2686    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2687
2688#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2689    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2690
2691#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2692    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2693
// n = 5..7: a 4-wide load, then the 1/2/3 remaining columns at offset 4.
2694#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2695    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2696    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2697
2698#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2699    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2700    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2701
2702#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2703    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2704    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2705
2706#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2707    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2708
// Loads an M0 x 9 block as an 8-wide load plus a 1-wide remainder load at
// column offset 8 (9 is not a valid OpenCL vector width).
// BUG FIX: the first LOAD_TENSOR call was missing the comma between
// 'input_ptr' and the COL_OFFSET argument '0' ("input_ptr 0"), which made
// every expansion of this macro a compile error. All sibling macros
// (e.g. LOAD_TENSOR_M0X10) pass "input_ptr, 0".
2709#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2710    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2711    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2712
// n = 10..13: an 8-wide load, then the remainder (2/3/4 columns, and for 13
// an extra 1-column load at offset 12).
2713#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2714    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2715    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2716
2717#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2718    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2719    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2720
2721#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2722    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2723    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2724
2725#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2726    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2727    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2728    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2729
// Loads an M0 x 14 block as 8 + 4 + 2 columns (offsets 0, 8 and 12).
// BUG FIX: the first LOAD_TENSOR call was missing the comma between
// 'input_ptr' and the COL_OFFSET argument '0' ("input_ptr 0"), which made
// every expansion of this macro a compile error. Sibling macros
// (e.g. LOAD_TENSOR_M0X13/M0X15) pass "input_ptr, 0".
2730#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                 \
2731    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2732    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2733    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2734
// n = 15: 8 + 4 + 3 columns. n = 16 is a native vector width, so one load.
2735#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2736    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2737    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2738    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2739
2740#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2741    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2742
2743
2744
// LOAD_TENSOR_M0XN0: dispatch to LOAD_TENSOR_M0X<N0>; the _STR level expands
// N0 before the ## paste.
2745#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2746#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747
2748
// LOAD_ROW_n (n = 1..16): recursively-defined loaders that DECLARE the row
// variables as well as load them. Each level expands LOAD_ROW_(n-1), declares
// VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##i, and initialises it with a VLOAD
// from PTR + OFFSET + i * STRIDE_Y + Z##i. Rows 10..15 use suffixes A..F.
2749#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2750    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2751    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2752
2753#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2754    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2755    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2756    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2757
2758#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2759    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2760    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2761    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2762
2763#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2764    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2765    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2766    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2767
2768#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2769    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2770    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2771    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2772
2773#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2774    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2775    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2776    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2777
2778#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2779    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2780    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2781    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2782
2783#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2784    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2785    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2786    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2787
2788#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2789    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2790    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2791    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2792
2793#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2794    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2795    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2796    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2797
2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2799    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2800    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2801    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2802
2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2804    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2805    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2806    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807
2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812
2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817
2818#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822
2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827
2828
2829
2830
// LOAD_BLOCK(M0, ...): declare and load an M0 x N0 block via LOAD_ROW_<M0>;
// the _STR level expands M0 before the ## paste.
2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833
2834
2835
// LOAD_ROW_PARTIAL_n (n = 1..16): like LOAD_ROW_n but load only LOAD_N0 of
// the N0 lanes of each row via VLOAD_PARTIAL (defined elsewhere in this
// file). Note these macros ASSIGN into BASENAME##i — the row variables must
// already be declared (see LOAD_BLOCK_BOUNDARY_AWARE, which zero-initialises
// them first with REPEAT_VAR_INIT_TO_CONST). Rows 10..15 use suffixes A..F.
2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839
2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844
2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849
2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854
2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859
2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864
2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869
2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874
2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879
2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884
2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889
2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894
2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899
2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904
2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909
2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914
2915
2916
// LOAD_BLOCK_PARTIAL: load a LOAD_M0 x LOAD_N0 sub-block of an M0 x N0 block
// via LOAD_ROW_PARTIAL_<LOAD_M0>; the _STR level expands LOAD_M0 before ##.
2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919
// Runtime dispatch between the four (full / partial-in-Y / partial-in-X /
// partial-in-both) load shapes. PARTIAL_COND_Y / PARTIAL_COND_X are runtime
// predicates marking that the current work-item sits on the M / N boundary;
// PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are the leftover row / column counts.
2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922    {                                                                                                                                                            \
2923        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924    }                                                                                                                                                            \
2925    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926    {                                                                                                                                                            \
2927        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928    }                                                                                                                                                            \
2929    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930    {                                                                                                                                                            \
2931        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932    }                                                                                                                                                            \
2933    else                                                                                                                                                         \
2934    {                                                                                                                                                            \
2935        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936    }
2937
// Boundary only in X: load PARTIAL_STORE_N0 columns when on the N edge.
2938#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939    if(!(PARTIAL_COND_X))                                                                                                \
2940    {                                                                                                                    \
2941        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942    }                                                                                                                    \
2943    else                                                                                                                 \
2944    {                                                                                                                    \
2945        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946    }
2947
// Boundary only in Y: load PARTIAL_STORE_M0 rows when on the M edge.
2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949    if(!(PARTIAL_COND_Y))                                                                                                \
2950    {                                                                                                                    \
2951        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952    }                                                                                                                    \
2953    else                                                                                                                 \
2954    {                                                                                                                    \
2955        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956    }
2957
2958
// LOAD_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest safe load
// based on the build-time PARTIAL_STORE_M0 / PARTIAL_STORE_N0 constants. When
// a partial path is possible, the block variables are first zero-initialised
// (REPEAT_VAR_INIT_TO_CONST) so lanes/rows that are not loaded hold 0.
2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960
// No boundary in either dimension: plain full-block load, no branches.
2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963
2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965
// Possible boundary in Y only.
2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969
2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971
// Possible boundary in X only.
2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975
2976#else
2977
// Possible boundary in both dimensions.
2977#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981
2982#endif
2983
2984
// LOAD_TEXTURE2D_ROW_n (n = 1..16): recursively-defined loaders reading each
// row from an OpenCL 2D image via READ_IMAGE2D (defined elsewhere in this
// file). Row i is read at (X_COORD + i * X_STEP_ROW, Y_COORD + i * Y_STEP_ROW)
// and ASSIGNED to BASENAME##i, which must already be declared by the caller.
// Rows 10..15 use suffixes A..F.
2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987
2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991
2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995
2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999
3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003
3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007
3008#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011
3012#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015
3016#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019
3020#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023
3024#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027
3028#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031
3032#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035
3036#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039
3040#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043
3044#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047
3048
3049
// LOAD_TEXTURE2D: dispatch to LOAD_TEXTURE2D_ROW_<M0>; the _STR level expands
// M0 before the ## paste.
3050#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052
3053
3054
// LOAD_ROW_INDIRECT_n (n = 1..6 visible here): recursively-defined loaders
// with a per-row indirect offset. Row i declares BASENAME##i and, when its
// mask Y_MASK##i is non-zero, loads N0 elements from
// PTR + OFFSET + Y##i * STRIDE_Y; otherwise the row is set to 0 (masked-out
// rows, e.g. out-of-bounds gather indices).
3055#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057    BASENAME##0;                                                                            \
3058    if(Y_MASK##0 != 0)                                                                      \
3059        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060    else                                                                                    \
3061        BASENAME##0 = 0;
3062
3063#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066    BASENAME##1;                                                                            \
3067    if(Y_MASK##1 != 0)                                                                      \
3068        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069    else                                                                                    \
3070        BASENAME##1 = 0;
3071
3072#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075    BASENAME##2;                                                                            \
3076    if(Y_MASK##2 != 0)                                                                      \
3077        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078    else                                                                                    \
3079        BASENAME##2 = 0;
3080
3081#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084    BASENAME##3;                                                                            \
3085    if(Y_MASK##3 != 0)                                                                      \
3086        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087    else                                                                                    \
3088        BASENAME##3 = 0;
3089
3090#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093    BASENAME##4;                                                                            \
3094    if(Y_MASK##4 != 0)                                                                      \
3095        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096    else                                                                                    \
3097        BASENAME##4 = 0;
3098
3099#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102    BASENAME##5;                                                                            \
3103    if(Y_MASK##5 != 0)                                                                      \
3104        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105    else                                                                                    \
3106        BASENAME##5 = 0;
3107
3108#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111    BASENAME##6;                                                                            \
3112    if(Y_MASK##6 != 0)                                                                      \
3113        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114    else                                                                                    \
3115        BASENAME##6 = 0;
3116
3117#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120    BASENAME##7;                                                                            \
3121    if(Y_MASK##7 != 0)                                                                      \
3122        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123    else                                                                                    \
3124        BASENAME##7 = 0;
3125
3126#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129    BASENAME##8;                                                                            \
3130    if(Y_MASK##8 != 0)                                                                      \
3131        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132    else                                                                                    \
3133        BASENAME##8 = 0;
3134
3135#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138    BASENAME##9;                                                                            \
3139    if(Y_MASK##9 != 0)                                                                      \
3140        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141    else                                                                                    \
3142        BASENAME##9 = 0;
3143
3144#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147    BASENAME##A;                                                                            \
3148    if(Y_MASK##A != 0)                                                                      \
3149        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150    else                                                                                    \
3151        BASENAME##A = 0;
3152
3153#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156    BASENAME##B;                                                                            \
3157    if(Y_MASK##B != 0)                                                                      \
3158        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159    else                                                                                    \
3160        BASENAME##B = 0;
3161
3162#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165    BASENAME##C;                                                                            \
3166    if(Y_MASK##C != 0)                                                                      \
3167        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168    else                                                                                    \
3169        BASENAME##C = 0;
3170
3171#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174    BASENAME##D;                                                                            \
3175    if(Y_MASK##D != 0)                                                                      \
3176        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177    else                                                                                    \
3178        BASENAME##D = 0;
3179
3180#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183    BASENAME##E;                                                                            \
3184    if(Y_MASK##E != 0)                                                                      \
3185        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186    else                                                                                    \
3187        BASENAME##E = 0;
3188
3189#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192    BASENAME##F;                                                                            \
3193    if(Y_MASK##F != 0)                                                                      \
3194        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195    else                                                                                    \
3196        BASENAME##F = 0;
3197
3198
3199#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201
3202
// ---------------------------------------------------------------------------
// LOAD_ELEMENT_n: load ONE scalar per row from (PTR + OFFSET + row * STRIDE_Y)
// into BASENAME##i, which is declared as an N0-wide vector; the scalar is
// replicated across all N0 lanes via OpenCL's implicit scalar-to-vector
// conversion. Defined recursively; row suffixes above 9 use hex digits A..F.
// ---------------------------------------------------------------------------
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));




// LOAD_SCALAR_AS_VECTOR: dispatch on M0 (row count); _STR forces expansion
// of M0 before ## token pasting.
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287
3288
3289
// ---------------------------------------------------------------------------
// CALCULATE_Z_OFFSET_n: compute, for each of n rows, the extra byte offset
// caused by cross-plane padding when a 2D tensor is reinterpreted as 3D
// (GEMM3D). For row i:
//   Z##i = min((i + Y) / HEIGHT_GEMM3D, DEPTH_GEMM3D - 1) * CROSS_PLANE_PAD * STRIDE_Y
// i.e. the plane index the row falls in (clamped to the last plane) times the
// padding inserted between consecutive planes. Defined recursively up to 8 rows.
// ---------------------------------------------------------------------------
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);




// CALCULATE_Z_OFFSET: dispatch on M0 (row count, 1..8); _STR forces expansion
// of M0 before ## token pasting.
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342
3343
3344
// ---------------------------------------------------------------------------
// SCALE_ROW_n: multiply each of the n row vectors BASENAME##0..BASENAME##F
// in place by SCALE cast to DATA_TYPE. Defined recursively; row suffixes
// above 9 use hex digits A..F.
// ---------------------------------------------------------------------------
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##1 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##2 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##3 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##4 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##5 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##6 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##7 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##8 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
    BASENAME##9 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##A *= (DATA_TYPE)SCALE;

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##B *= (DATA_TYPE)SCALE;

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##C *= (DATA_TYPE)SCALE;

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##D *= (DATA_TYPE)SCALE;

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##E *= (DATA_TYPE)SCALE;

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
    BASENAME##F *= (DATA_TYPE)SCALE;



// SCALE_BLOCK: dispatch on N (row count, 1..16); _STR forces expansion of N
// before ## token pasting.
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412
3413
3414
// ---------------------------------------------------------------------------
// COLUMN_VECTORk (k = 1,2,3,4,8,16): gather vector lane IDX_COL from the k
// row vectors X##0..X##{k-1} into a new k-wide column vector
// BASENAME##IDX_COL (one transposed column of a k-row block).
// ---------------------------------------------------------------------------
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2)                         \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3)                         \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4)                         \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8)                         \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16)                         \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3432
3433
3434
// ---------------------------------------------------------------------------
// COLUMN_VECTOR_SCALARk (k = 1,2,3,4,8,16): same as COLUMN_VECTORk but the k
// row values X##0..X##{k-1} are scalars (no .s##IDX_COL lane selection);
// they are packed into a k-wide column vector BASENAME##IDX_COL.
// ---------------------------------------------------------------------------
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16)                                \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
3452
3453
3454
// ---------------------------------------------------------------------------
// TRANSPOSE_K0Xn (n = 1,2,3,4,8,16): transpose a K0 x n block by emitting one
// COLUMN_VECTOR per column (columns above 9 use hex suffixes A..F), each
// gathering lane i from the K0 row vectors BS##0..BS##{K0-1}.
// n == 1 uses COLUMN_VECTOR_SCALAR because the rows are scalars, not vectors.
// ---------------------------------------------------------------------------
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482
3483
3484
3485
// COLUMN_VECTOR: dispatch to COLUMN_VECTOR<K0>; CONCAT expands K0 before
// pasting it onto the macro name.
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

// COLUMN_VECTOR_SCALAR: dispatch to COLUMN_VECTOR_SCALAR<K0> (scalar rows).
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

// TRANSPOSE_K0XN0: dispatch to TRANSPOSE_K0X<N0>, transposing a K0 x N0 block.
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)                       \
    (K0, BASENAME, BS, TYPE);
3499
3500
// ---------------------------------------------------------------------------
// ADD_ROW_n: element-wise add the per-row bias vectors BIAS##0..BIAS##F to
// the row vectors BASENAME##0..BASENAME##F in place. Defined recursively;
// row suffixes above 9 use hex digits A..F.
// ---------------------------------------------------------------------------
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;




// ADD_BLOCK: dispatch on N (row count, 1..16); _STR forces expansion of N
// before ## token pasting.
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569
3570
3571
// ---------------------------------------------------------------------------
// ADD_ROW_BROADCAST_n: add the SAME bias value BIAS to each of the n row
// vectors BASENAME##0..BASENAME##F in place (broadcast add, unlike ADD_ROW_n
// which uses one bias per row). Defined recursively; hex suffixes A..F.
// ---------------------------------------------------------------------------
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS;


// ADD_BLOCK_BROADCAST: dispatch on N (row count, 1..16); _STR forces
// expansion of N before ## token pasting.
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638
3639
3640
// ---------------------------------------------------------------------------
// ACTIVATION_ROW_n: apply the ACTIVATION macro (with activation type,
// element type, vector width and the A_VAL/B_VAL activation parameters) to
// each of the n row vectors BASENAME##0..BASENAME##9 in place. Defined
// recursively; the family continues past row 10 below this block.
// ---------------------------------------------------------------------------
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
3679
3680#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3681    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3682    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
3683
3684#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3685    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3686    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
3687
3688#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3689    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3690    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
3691
3692#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3693    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3694    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
3695
3696#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3697    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3698    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
3699
3700#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3701    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3702    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
3703
3704
3705
3706#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3707#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708
3709
3710
/* CONVERT_ROW_n: declare n new N-wide vector variables
 * BASENAME_DST##0 .. BASENAME_DST##(n-1) of element type DATA_TYPE, each
 * initialised by CONVERT()-ing the matching BASENAME_SRC row (hex suffixes
 * A..F for rows 10..15). Each variant recursively expands the (n-1) variant
 * and declares/converts one more row. */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));



/* CONVERT_BLOCK(M, N, ...): convert an MxN block; dispatches to
 * CONVERT_ROW_M via the usual _STR expansion indirection. */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794
3795
3796#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
3797#define ARM_COMPUTE_HELPERS_ASYMM_H
3798
3799
3800#ifndef ARM_COMPUTE_HELPER_H
3801#define ARM_COMPUTE_HELPER_H
3802
3803
3804
3805
/* STORE_ROW_n: vstore the n row vectors BASENAME##0 .. BASENAME##(n-1)
 * (N0 elements each, hex suffixes A..F for rows 10..15) to global memory at
 * PTR + row * STRIDE_Y + Z##row, where Z##row is a per-row extra offset.
 * NOTE(review): this repeats the STORE_ROW_* definitions from the earlier
 * ARM_COMPUTE_HELPER_H section of this embedded file; the duplication comes
 * from header inlining during embed generation. */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3884
3885
3886
/* CONVERT_STORE_ROW_n: like STORE_ROW_n but each row is first
 * saturating-converted (CONVERT_SAT) to an N0-wide vector of DATA_TYPE
 * before being vstored at PTR + row * STRIDE_Y + Z##row. */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3930
/* CONVERT_STORE_ROW_10: saturating-convert rows 0..9 of BASENAME to
 * VEC_DATA_TYPE(DATA_TYPE, N0) and vstore them at consecutive STRIDE_Y
 * offsets (plus per-row Z##row offsets).
 * Fix: the second parameter was declared as DATA while the body references
 * DATA_TYPE, so DATA_TYPE in the expansion silently bound to a global
 * -DDATA_TYPE build definition instead of the macro argument. The parameter
 * is renamed to DATA_TYPE; call sites are positional, so they are unchanged. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3935
/* CONVERT_STORE_ROW_11..16: continue the saturating-convert-and-store chain
 * for rows 10..15, which use the hex digit suffixes A..F. */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3965
3966
3967
3968
/* STORE_BLOCK(M0, N0, ...): store an M0xN0 block by dispatching to
 * STORE_ROW_M0; the _STR indirection expands M0 before token pasting. */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



/* CONVERT_STORE_BLOCK: same dispatch pattern, but via the saturating
 * convert-and-store row macros. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3976
3977
3978
/* STORE_ROW_PARTIAL_n: like STORE_ROW_n but uses VSTORE_PARTIAL(N0, STORE_N0)
 * so that only the first STORE_N0 of each row's N0 elements are written —
 * used for leftover columns at the right edge of a tensor. */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...): store only the first
 * STORE_M0 rows and first STORE_N0 columns of an N0-wide block; dispatches
 * to STORE_ROW_PARTIAL_STORE_M0 via the usual _STR expansion indirection. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4062
/* STORE_BLOCK_PARTIAL_IN_X_AND_Y: runtime-select between a full M0xN0 store
 * and partial stores, depending on whether this work-item handles the
 * leftover rows (PARTIAL_COND_Y) and/or leftover columns (PARTIAL_COND_X).
 * The four branches cover: full block, partial rows only, partial columns
 * only, and partial in both dimensions. */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }

/* STORE_BLOCK_PARTIAL_IN_X: partial-store handling for leftover columns
 * only (PARTIAL_COND_X); row count is always M0. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }

/* STORE_BLOCK_PARTIAL_IN_Y: partial-store handling for leftover rows only
 * (PARTIAL_COND_Y); column count is always N0. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
4100
4101
/* STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store
 * strategy based on the build-time leftover sizes PARTIAL_STORE_M0 /
 * PARTIAL_STORE_N0 (both must be -D defined). When a leftover dimension is
 * known to be 0 at compile time, the corresponding runtime branch is
 * eliminated entirely. */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)


/* No leftovers in either dimension: plain full-block store, no branches. */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* Leftover rows only: branch on the Y condition. */
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

/* Leftover columns only: branch on the X condition. */
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

/* Leftovers in both dimensions: full four-way runtime branch. */
#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
4128
4129
#if defined(PARTIAL_STORE_M0)

/* COMPUTE_M0_START_ROW: first output row for work-item y. When the row count
 * is not a multiple of M0 (PARTIAL_STORE_M0 > 0), the start row is shifted
 * back by (M0 - PARTIAL_STORE_M0) % M0 (clamped at 0) so the last block
 * overlaps its predecessor instead of running past the tensor. */
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
/* No leftover rows: blocks tile exactly, start row is simply y * M0. */
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif
4138
4139
4140
4141#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
4142    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
4143
4144
// Enable optional OpenCL extensions only when both the build flag and the
// device-reported extension macro are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

// Mali GPU architecture identifiers (compared against a GPU_ARCH build define).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

// Token-paste two preprocessor tokens.
#define CONCAT(a, b) a##b

// Force one extra macro-expansion pass on the argument.
#define EXPAND(x) x

// Clamp x into [min_val, max_val] using the built-in min/max.
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

// Reverse the components of a vector of the given width via swizzles.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

// REVERSE(x, s): reverse an s-component vector; the _STR level lets `s`
// itself be a macro that is expanded before pasting.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
4186
4187
4188
// ROT<s>_<n>(x): rotate an s-component vector right by n lanes (swizzle
// permutation). n == 0 and n == s are identity.
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

// ROTATE(x, s, n): dispatch to ROT<s>_<n>; the _STR level expands s and n first.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

// V_OFFS<s>(dt): vector literal (0, 1, ..., s-1) of element type dt.
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

// VEC_OFFS(dt, s): sequential-offsets vector; the _STR level expands dt/s first.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
4253
4254
// VLOAD(size) expands to the built-in vload<size>; the _STR level expands
// `size` when it is itself a macro.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// VLOAD_PARTIAL(size, load_size): load only `load_size` elements into a
// `size`-wide vector variable. Dispatches to vload_partial_<size>_<load_size>.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

// No-op used for invalid (load_size == 0 or load_size > size) combinations.
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

// Dispatch tables: vload_partial_<vector-width>_<elements-to-load>.
// Entries with load_size > width are invalid and map to NO_LOAD.
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

// Implementations: load N elements into the low lanes of DATA. Non-power-of-2
// sizes are composed from a power-of-2 load plus a smaller partial load at
// PTR + 4 or PTR + 8.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// For sizes 13-15 an 8-lane swizzle (.s89ABCDEF) is passed down, but the
// inner vload_partial_5/6/7 only writes its first 5/6/7 lanes, so the top
// lanes of DATA are simply left untouched.
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
4433
4434
4435
// Number of RGBA texels needed to hold a vector of 4/8/16 elements
// (one image2d texel carries 4 elements).
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

// Map a vector size (4/8/16) to its pixel-unit count; _STR expands the arg.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

// Read 1/2/4 consecutive float4 texels starting at (x_coord, y_coord) and
// concatenate them into a float4/float8/float16.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
// Half-precision variants (only when fp16 is enabled on the device).
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// Write 1/2/4 consecutive texels from the low-to-high 4-lane slices of `values`.
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

// Generic dispatchers over data_type (float/half) and pixel-unit count n0.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

// VSTORE(size) expands to the built-in vstore<size>.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// "<type>1" aliases so size-generic macros can paste a 1 suffix and still
// name the scalar type (OpenCL has no real 1-element vector types).
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadn/vstoren so VLOAD(1)/VSTORE(1) work uniformly.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
4490
4491
// VSTORE_PARTIAL(size, store_size): store only `store_size` elements of a
// `size`-wide vector. Dispatches to vstore_partial_<size>_<store_size>.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op used for invalid (store_size == 0 or store_size > size) combinations.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

// Dispatch tables: vstore_partial_<vector-width>_<elements-to-store>.
// Entries with store_size > width are invalid and map to NO_STORE.
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

// Implementations: store the low N lanes of DATA. Non-power-of-2 sizes are
// composed from a power-of-2 store plus a smaller partial store at PTR + 4
// or PTR + 8 (mirrors the vload_partial_N family above).
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// For sizes 13-15 an 8-lane swizzle (.s89abcdef) is passed down, but the
// inner vstore_partial_5/6/7 only stores its first 5/6/7 lanes, so no
// out-of-range lanes are written.
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
4666
4667
4668
4669
4670
// Saturated-conversion aliases: float/half have no *_sat built-ins (saturation
// only applies to integer destinations), so map them to plain conversions to
// keep CONVERT_SAT size/type-generic.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): maps to convert_float, not convert_half — breaks the pattern
// of the surrounding lines; looks like a long-standing copy-paste. Confirm
// against upstream helpers.h before relying on scalar CONVERT_SAT(x, half).
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// "convert_<type>1" aliases so a pasted 1 suffix resolves to the scalar
// built-in conversion.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Scalar *_sat aliases. The self-referential ucharN lines are deliberate
// no-ops (the preprocessor does not re-expand a macro inside its own
// replacement), kept so every name in the family is defined.
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

// VEC_DATA_TYPE(type, size): paste "<type><size>", e.g. (float, 4) -> float4.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// CONVERT / CONVERT_SAT / CONVERT_SAT_ROUND: generic wrappers over the
// convert_<type>[_sat][_round] built-ins; _STR levels expand macro args first.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
4724
// Result type of the select() built-in for each element type: an integer
// vector of the same element size (half -> short, float -> int, ...).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// Signed integer type with the same element size as the given type.
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

// Horizontal sum of an s-component vector (tree reduction via swizzles).
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Horizontal product of an s-component vector.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// Horizontal max of an s-component vector.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
4784
// Kernel-parameter expansion macros: each *_DECLARATION(name) expands to the
// standard byte pointer + per-dimension stride/step + first-element-offset
// parameter list the host side passes for a tensor argument.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes

// CONVERT_TO_*_STRUCT: build the matching struct view from the expanded
// parameters; the *_NO_STEP variants pass 0 steps so the pointer is not
// advanced per work-item in that dimension.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// NOTE(review): identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT
// (same token sequence as above, so legal C, benign) — likely a concatenation
// artifact of the embed generator.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
4873
4874
// Lightweight per-work-item tensor views: a global byte pointer plus the
// byte strides needed to index the remaining dimensions.

// 1D view.
typedef struct Vector
{
    __global uchar *ptr;                           // current position (byte pointer)
    int             offset_first_element_in_bytes; // offset of the first element
    int             stride_x;                      // byte stride along x
} Vector;

// 2D view.
typedef struct Image
{
    __global uchar *ptr;                           // current position (byte pointer)
    int             offset_first_element_in_bytes; // offset of the first element
    int             stride_x;                      // byte stride along x
    int             stride_y;                      // byte stride along y
} Image;

// 3D view.
typedef struct Tensor3D
{
    __global uchar *ptr;                           // current position (byte pointer)
    int             offset_first_element_in_bytes; // offset of the first element
    int             stride_x;                      // byte stride along x
    int             stride_y;                      // byte stride along y
    int             stride_z;                      // byte stride along z
} Tensor3D;

// 4D view.
typedef struct Tensor4D
{
    __global uchar *ptr;                           // current position (byte pointer)
    int             offset_first_element_in_bytes; // offset of the first element
    int             stride_x;                      // byte stride along x
    int             stride_y;                      // byte stride along y
    int             stride_z;                      // byte stride along z
    int             stride_w;                      // byte stride along w
} Tensor4D;
4911
4912
4913inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
4914{
4915    Vector vector =
4916    {
4917        .ptr                           = ptr,
4918        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4919        .stride_x                      = stride_x,
4920    };
4921    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
4922    return vector;
4923}
4924
4925
4926inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
4927{
4928    Image img =
4929    {
4930        .ptr                           = ptr,
4931        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4932        .stride_x                      = stride_x,
4933        .stride_y                      = stride_y
4934    };
4935    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
4936    return img;
4937}
4938
4939
4940inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4941{
4942    Image img =
4943    {
4944        .ptr                           = ptr,
4945        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4946        .stride_x                      = stride_x,
4947        .stride_y                      = stride_y
4948    };
4949    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4950    return img;
4951}
4952
4953
4954inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4955{
4956    Tensor3D tensor =
4957    {
4958        .ptr                           = ptr,
4959        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4960        .stride_x                      = stride_x,
4961        .stride_y                      = stride_y,
4962        .stride_z                      = stride_z
4963    };
4964    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4965    return tensor;
4966}
4967
4968
4969inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4970{
4971    Tensor3D tensor =
4972    {
4973        .ptr                           = ptr,
4974        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4975        .stride_x                      = stride_x,
4976        .stride_y                      = stride_y,
4977        .stride_z                      = stride_z
4978    };
4979    return tensor;
4980}
4981
4982inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
4983                                             uint step_w,
4984                                             uint mod_size)
4985{
4986    Tensor4D tensor =
4987    {
4988        .ptr                           = ptr,
4989        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4990        .stride_x                      = stride_x,
4991        .stride_y                      = stride_y,
4992        .stride_z                      = stride_z,
4993        .stride_w                      = stride_w
4994    };
4995
4996    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
4997    return tensor;
4998}
4999
5000
5001inline __global const uchar *vector_offset(const Vector *vec, int x)
5002{
5003    return vec->ptr + x * vec->stride_x;
5004}
5005
5006
5007inline __global uchar *offset(const Image *img, int x, int y)
5008{
5009    return img->ptr + x * img->stride_x + y * img->stride_y;
5010}
5011
5012
5013inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
5014{
5015    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
5016}
5017
5018
5019inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
5020{
5021    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
5022}
5023
5024
5025inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
5026{
5027    uint num_elements = width * height;
5028
5029    const uint z = index / num_elements;
5030
5031    index %= num_elements;
5032
5033    const uint y = index / width;
5034
5035    index %= width;
5036
5037    const uint x = index;
5038
5039    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
5040}
5041
5042#endif
5043
5044
/** Float-to-integer conversion with round-to-nearest-even (OpenCL convert_<type>_rte).
 *  Two-level expansion so macro arguments are expanded before token pasting. */
#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
5047
5048
5049inline uchar quantize_qasymm8(float input, float offset, float scale)
5050{
5051    float out_f32 = input / scale + offset;
5052    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
5053    return res_u8;
5054}
5055
5056
5057inline float dequantize_qasymm8(uchar input, float offset, float scale)
5058{
5059    return ((float)input - offset) * scale;
5060}
5061
5062
5063inline float dequantize_qasymm8_signed(char input, float offset, float scale)
5064{
5065    return ((float)input - offset) * scale;
5066}
5067
5068
/** Define quantize_<type><size>(): vector quantization.
 *  out = convert_sat(round_rte(input / scale + offset)) per lane. */
#define QUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    {                                                                                                                   \
        VEC_DATA_TYPE(float, size)                                                                                      \
        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
        VEC_DATA_TYPE(type, size)                                                                                       \
        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
        return res;                                                                                                     \
    }
5078
5079
/** Define dequantize_<type><size>(): per-lane (input - offset) * scale as float vector. */
#define DEQUANTIZE_IMPL(type, size)                                                                                       \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    {                                                                                                                     \
        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
    }
5085
5086
/** Define asymm_rounding_divide_by_POW2_<size>(): x / 2^exponent rounded to nearest.
 *  The threshold is bumped by one for negative x so rounding is symmetric about zero
 *  (cf. gemmlowp RoundingDivideByPOT). */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    {                                                                                                                                   \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
        const VEC_DATA_TYPE(int, size)                                                                                                  \
        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        mask = (one << exponent) - one;                                                                                                 \
        VEC_DATA_TYPE(int, size)                                                                                                        \
        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                                          \
        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold));                          \
    }
5100
5101
/** Define asymm_mult<size>(): saturating rounding doubling high multiply of two
 *  Q0.31 fixed-point vectors: (2*a*b + rounding nudge) >> 31, computed in 64 bits.
 *  The single overflow case a == b == INT_MIN saturates to INT_MAX
 *  (cf. gemmlowp SaturatingRoundingDoublingHighMul). */
#define ASYMM_MULT_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                        \
        VEC_DATA_TYPE(int, size)                                                                             \
        overflow = a == b && a == INT_MIN;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                            \
        a_64 = convert_long##size(a);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        b_64 = convert_long##size(b);                                                                        \
        VEC_DATA_TYPE(long, size)                                                                            \
        ab_64 = a_64 * b_64;                                                                                 \
                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask1 = 1 << 30;                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask2 = 1 - (1 << 30);                                                                               \
        VEC_DATA_TYPE(long, size)                                                                            \
        is_positive_or_zero = ab_64 >= 0;                                                                    \
        VEC_DATA_TYPE(long, size)                                                                            \
        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero));               \
        VEC_DATA_TYPE(long, size)                                                                            \
        mask = 1ll << 31;                                                                                    \
        VEC_DATA_TYPE(int, size)                                                                             \
        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow));                   \
    }
5128
5129
/** Define asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>():
 *  fixed-point (Q0.31, k_fractional_bits = 31) polynomial approximation of exp(a)
 *  for a in (-1/4, 0]. Expands around -1/8 (x = a + 1/8 in fixed point) using a
 *  degree-4 Taylor-style polynomial; constant_term is presumably exp(-1/8) in
 *  Q0.31 — TODO confirm against the reference implementation. */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                                               \
        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
        const int k_fractional_bits = 31;                                                                                           \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x2 = ASYMM_MULT(x, x, size);                                                                                                \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
        VEC_DATA_TYPE(int, size)                                                                                                    \
        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
    }
5152
5153
/** Define asymm_select_using_mask<size>(): bitwise select — for each bit, take
 *  then_val where if_mask is all-ones and else_val where it is all-zeros. */
#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    {                                                                                                                                                                     \
        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
    }
5159
5160
/** Define asymm_mask_if_zero<size>(): per-lane all-ones mask where a == 0, else all-zeros. */
#define ASYMM_MASK_IF_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                    \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                    \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                   \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0));   \
    }
5168
5169
/** Define asymm_mask_if_non_zero<size>(): per-lane all-ones mask where a != 0, else all-zeros. */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                        \
        const VEC_DATA_TYPE(int, size) all_zeros = 0;                                        \
        const VEC_DATA_TYPE(int, size) all_ones  = ~0;                                       \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0));       \
    }
5177
/** Define exp_barrel_shifter<size>(): one stage of the barrel-shifter exp evaluation.
 *  If the remainder bit selected by `exponent` is set, multiply `result` by the
 *  fixed-point `fp_multiplier`; otherwise pass `result` through unchanged.
 *  Stages with exponent >= k_integer_bits are skipped. */
#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    {                                                                                                                                                                                                         \
        if(k_integer_bits > exponent)                                                                                                                                                                         \
        {                                                                                                                                                                                                     \
            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
        }                                                                                                                                                                                                     \
        \
        return result;                                                                                                                                                                                        \
    }
5191
5192
/** Define asymm_exp_on_negative_values<size>(): fixed-point exp(a) for a <= 0
 *  with `k_integer_bits` integer bits. Reduces a to a quarter-interval, applies
 *  the polynomial core, then multiplies in precomputed exp(-2^i) factors via the
 *  barrel shifter. Very negative inputs clamp to 0; a == 0 returns Q0.31 one
 *  (INT_MAX). The multiplier constants are presumably exp(-2^i) in Q0.31 —
 *  TODO confirm against the reference tables. */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
    {                                                                                                                         \
        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
        VEC_DATA_TYPE(int, size)                                                                                              \
        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        mask = k_one_quarter - 1;                                                                                             \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
        VEC_DATA_TYPE(int, size)                                                                                              \
        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
        VEC_DATA_TYPE(int, size)                                                                                              \
        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
        VEC_DATA_TYPE(int, size)                                                                                              \
        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
        \
        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
        \
        if(k_integer_bits > 5)                                                                                                \
        {                                                                                                                     \
            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
        }                                                                                                                     \
        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
    }
5227
5228
/** Define asymm_saturating_rounding_mult_by_pow2<size>(): x * 2^exponent.
 *  Negative exponents delegate to the rounding divide; positive exponents
 *  left-shift and saturate lanes that would overflow to INT_MAX / INT_MIN. */
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    {                                                                                                                      \
        if(exponent < 0)                                                                                                   \
        {                                                                                                                  \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
        }                                                                                                                  \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
        VEC_DATA_TYPE(int, size)                                                                                           \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
        VEC_DATA_TYPE(int, size)                                                                                           \
        result = x << exponent;                                                                                            \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
        return result;                                                                                                     \
    }
5250
5251
/** Define asymm_rounding_half_sum<size>(): (a + b) / 2 rounded away from zero,
 *  computed in 64 bits so the intermediate sum cannot overflow. */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    {                                                                                                                     \
        VEC_DATA_TYPE(long, size)                                                                                         \
        a64 = convert_long##size(a);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        b64 = convert_long##size(b);                                                                                      \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sum = a64 + b64;                                                                                                  \
        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
        VEC_DATA_TYPE(long, size)                                                                                         \
        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
        return convert_int##size((sum + sign) / 2);                                                                       \
    }
5267
5268
/** Define asymm_one_over_one_plus_x_for_x_in_0_1<size>(): fixed-point 1 / (1 + a)
 *  for a in [0, 1) in Q0.31. Uses Newton-Raphson on the half-denominator with
 *  initial guess 48/17 - 32/17 * d (the Q2 constants below), refined for three
 *  iterations, then rescaled back by 2. */
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    {                                                                                                        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
        VEC_DATA_TYPE(int, size)                                                                             \
        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
        VEC_DATA_TYPE(int, size)                                                                             \
        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
        for(int i = 0; i < 3; i++)                                                                           \
        {                                                                                                    \
            VEC_DATA_TYPE(int, size)                                                                         \
            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
            VEC_DATA_TYPE(int, size)                                                                         \
            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
            VEC_DATA_TYPE(int, size)                                                                         \
            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
        }                                                                                                    \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
    }
5292
5293
/** Define asymm_rescale<size>(): change the number of integer bits of a
 *  fixed-point value by multiplying by 2^(src_integer_bits - dst_integer_bits). */
#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    {                                                                                                                               \
        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
    }
5300
// -----------------------------------------------------------------------------
// Dispatch macros resolving to the (type, size)-specialised functions generated
// by the *_IMPL macros. Where a *_STR companion exists, the *_STR level does the
// token pasting and the plain level adds one expansion step so that `type` and
// `size` may themselves be macros.
// -----------------------------------------------------------------------------
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

// Fixed-point (gemmlowp-style) arithmetic dispatchers.
// NOTE(review): several of these (e.g. ASYMM_SELECT_USING_MASK,
// ASYMM_SATURATING_ROUNDING_MULT_BY_POW2, ASYMM_ROUNDING_HALF_SUM) have no
// *_STR indirection, so their `size` argument must already be a literal.
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
// Multiplier > 1: pre-scale x by 2^(-left_shift) (left_shift is negative here)
// before the saturating doubling high multiplication.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
// Multiplier < 1: multiply first, then rounding-divide by 2^right_shift.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
5327
/** MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)
 *
 * Generates multiply_by_quantized_multiplier<size>(input, qmul, shift):
 * requantisation helper. A positive `shift` is applied as a left shift of the
 * input before the fixed-point multiplication; a negative `shift` as a
 * rounding right shift after it (exactly one of the two shifts is non-zero).
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    {                                                                                                                           \
        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
    }
// Two-level expansion, consistent with the other dispatch macros above, so that
// `size` is macro-expanded before being pasted into the function name.
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_STR(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) MULTIPLY_BY_QUANTIZED_MULTIPLIER_STR(input, qmul, shift, size)
5336
// Instantiations of QUANTIZE_IMPL for each supported (type, vector-width) pair.
// NOTE(review): coverage is uneven across widths (e.g. char at 1..3/8/16 but
// ushort/short only at 4 and 16) — presumably only the combinations the kernels
// use are generated; confirm against the kernel sources.
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 2)
QUANTIZE_IMPL(char, 2)
QUANTIZE_IMPL(uint, 2)
QUANTIZE_IMPL(int, 2)
QUANTIZE_IMPL(uchar, 3)
QUANTIZE_IMPL(char, 3)
QUANTIZE_IMPL(uint, 3)
QUANTIZE_IMPL(int, 3)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(int, 4)
QUANTIZE_IMPL(uchar, 8)
QUANTIZE_IMPL(char, 8)
QUANTIZE_IMPL(uint, 8)
QUANTIZE_IMPL(int, 8)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)
5363
// Instantiations of DEQUANTIZE_IMPL, mirroring the QUANTIZE_IMPL set above.
DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 2)
DEQUANTIZE_IMPL(char, 2)
DEQUANTIZE_IMPL(uint, 2)
DEQUANTIZE_IMPL(int, 2)
DEQUANTIZE_IMPL(uchar, 3)
DEQUANTIZE_IMPL(char, 3)
DEQUANTIZE_IMPL(uint, 3)
DEQUANTIZE_IMPL(int, 3)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(int, 4)
DEQUANTIZE_IMPL(uchar, 8)
DEQUANTIZE_IMPL(char, 8)
DEQUANTIZE_IMPL(uint, 8)
DEQUANTIZE_IMPL(int, 8)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)
5390
// Instantiate every fixed-point helper for all supported vector widths
// (1, 2, 3, 4, 8, 16) so the dispatch macros above always resolve.
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)

ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)

ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)

ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)

ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)

MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
5481
5482#endif
5483
5484#ifndef ARM_COMPUTE_REPEAT_H
5485#define ARM_COMPUTE_REPEAT_H
5486
5487
5488#ifndef ARM_COMPUTE_HELPER_H
5489#define ARM_COMPUTE_HELPER_H
5490
5491
5492
5493
// -----------------------------------------------------------------------------
// STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
// Stores rows 0..n-1 of a register block. Row i is the vector variable
// BASENAME##i (N0 elements wide), written to PTR + i * STRIDE_Y + Z##i, where
// Z##i is a per-row offset variable (used for cross-plane padding). Each macro
// recursively expands the previous one and appends its own VSTORE. Rows 10..15
// use the hexadecimal suffixes A..F.
// -----------------------------------------------------------------------------
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5572
5573
5574
// -----------------------------------------------------------------------------
// CONVERT_STORE_ROW_n: identical to STORE_ROW_n, but each row is first
// saturating-converted to DATA_TYPE (CONVERT_SAT) before being stored.
// -----------------------------------------------------------------------------
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5618
/* CONVERT_STORE_ROW_10: stores rows 0..8 via CONVERT_STORE_ROW_9, then
 * saturating-converts row 9 to DATA_TYPE and stores it.
 * Fix: the second parameter was declared `DATA` while the body references
 * DATA_TYPE, so the macro resolved correctly only when a global DATA_TYPE
 * happened to be #defined at the expansion site. Renamed the parameter to
 * DATA_TYPE for macro hygiene, consistent with CONVERT_STORE_ROW_{1..9,11..16}.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5623
// CONVERT_STORE_ROW_11..16: continue the recursion with hexadecimal row
// suffixes A..F, converting each row to DATA_TYPE with saturation before the
// store.
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5653
5654
5655
5656
// STORE_BLOCK(M0, ...): store an M0-row block by dispatching to STORE_ROW_<M0>.
// The _STR level is needed so M0 is macro-expanded before token pasting.
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5659
5660
5661
// CONVERT_STORE_BLOCK: as STORE_BLOCK, but each row is saturating-converted to
// DATA_TYPE before the store (dispatches to CONVERT_STORE_ROW_<M0>).
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5664
5665
5666
// -----------------------------------------------------------------------------
// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but each row writes only STORE_N0 of
// the N0 vector elements (VSTORE_PARTIAL) — used for blocks that are partial
// in the X dimension.
// -----------------------------------------------------------------------------
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5745
5746
5747
// STORE_BLOCK_PARTIAL: store STORE_M0 rows of STORE_N0 elements each out of an
// (M0 x N0) register block — the workhorse for boundary tiles.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5750
// Boundary-aware store when the tile may be partial in both dimensions: select
// (rows, cols) = (M0|PARTIAL_STORE_M0, N0|PARTIAL_STORE_N0) from the two
// runtime conditions PARTIAL_COND_Y / PARTIAL_COND_X.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
5768
// Boundary-aware store when the tile may be partial only in X: store either N0
// or PARTIAL_STORE_N0 columns per row depending on PARTIAL_COND_X.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
5778
// Boundary-aware store when the tile may be partial only in Y: store either M0
// or PARTIAL_STORE_M0 rows depending on PARTIAL_COND_Y.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
5788
5789
// Compile-time selection of the boundary-aware store strategy, driven by the
// PARTIAL_STORE_M0 / PARTIAL_STORE_N0 build options (presumably passed as -D
// compile flags — confirm against the kernel build code). Only the strictly
// needed runtime branches are kept for each configuration.
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)


// Both dimensions evenly divided: plain full-block store, no branching.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Partial only in Y.
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

// Partial only in X.
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

// Potentially partial in both X and Y.
#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
5816
5817
// COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0): starting row of the y-th
// M0-tall block. When the first block may be partial, shift the window up by
// (M0 - PARTIAL_STORE_M0) % M0 rows (clamped at 0) so every later block is
// full-height; otherwise blocks are uniformly y * M0.
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif
5826
5827
5828
// Store a single row of up to `vec_size` elements, writing only `leftover`
// elements when `cond` is true. The literal 0 passed for Z token-pastes to
// `00` in STORE_ROW_PARTIAL_1, which is still the constant zero offset.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
5831
5832
5833#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
5834#pragma OPENCL EXTENSION cl_khr_fp16 : enable
5835#endif
5836
5837#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
5838#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
5839#endif
5840
5841#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
5842#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
5843#endif
5844
5845#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
5846#pragma OPENCL EXTENSION cl_arm_printf : enable
5847#endif
5848
// Mali GPU architecture identifiers (presumably compared against a GPU_ARCH
// value injected at kernel compile time — confirm against the build options).
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
5852
5853
// Token concatenation (arguments are NOT pre-expanded at this level).
#define CONCAT(a, b) a##b


// Identity macro used to force an extra round of macro expansion.
#define EXPAND(x) x


// Clamp x into [min_val, max_val] using the built-in min/max.
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
5861
5862
// REV<s>(x): reverse the components of an s-wide vector via a swizzle.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)



// REVERSE(x, s): dispatch to REV<s>; the _STR level pastes the expanded width.
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
5874
5875
5876
// ROTs_n(x): circularly rotate an s-element vector right by n positions.
// Rotation by 0 or by the full vector size is the identity.
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))



// ROTATE(x, s, n): dispatch to ROTs_n; two-level expansion so s/n may be macros.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
5927
5928
5929
// V_OFFSn(dt): build an n-element vector literal (0, 1, ..., n-1) of base type dt.
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)



// VEC_OFFS(dt, s): dispatch to V_OFFSs; two-level expansion so s may be a macro.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
5941
5942
// VLOAD(size): expand to the OpenCL built-in vload<size> (vload1 is defined below).
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)


// VLOAD_PARTIAL(size, load_size): load only 'load_size' elements into a
// 'size'-wide vector; dispatches into the vload_partial_<size>_<load_size> table.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
5949
// No-op used for invalid (load_size > size) or zero-sized table entries.
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }


// Dispatch tables: vload_partial_<size>_<load_size> maps to the implementation
// that loads exactly 'load_size' elements; entries with load_size > size or
// load_size == 0 are NO_LOAD.
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16


// Implementations: load N elements into the low lanes of DATA, leaving the
// remaining lanes untouched. Sizes 5-7 decompose as 4 + remainder, 9-15 as
// 8 + remainder. Note 13-15 pass the full .s89ABCDEF swizzle to the inner
// macro, which only writes its first 5/6/7 lanes.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
6121
6122
6123
// Number of RGBA (4-channel) texels needed to hold a vector of 4/8/16 elements.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4


// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): vector size -> texel count.
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)


// Read 1/2/4 consecutive float4 texels starting at (x_coord, y_coord).
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

// Half-precision variants, only when fp16 support is compiled in.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

// Write 1/2/4 consecutive float4 texels starting at (x_coord, y_coord).
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif


// READ_IMAGE2D(data_type, n0, ...): dispatch to read_image2d_<type>x<n0>.
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)


// WRITE_IMAGE2D(data_type, n0, ...): dispatch to write_image2d_<type>x<n0>.
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
6160
// VSTORE(size): expand to the OpenCL built-in vstore<size> (vstore1 is defined below).
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// <type>1 aliases let size-generic macros (VEC_DATA_TYPE etc.) work for size 1,
// where OpenCL has no real 1-element vector type.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar counterparts of vloadn/vstoren: plain pointer dereference.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
6178
6179
// VSTORE_PARTIAL(size, store_size): store only 'store_size' elements of a
// 'size'-wide vector; dispatches into the vstore_partial_<size>_<store_size> table.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// No-op used for invalid (store_size > size) or zero-sized table entries.
#define NO_STORE(data, offs, ptr) \
    {                             \
    }


// Dispatch tables: vstore_partial_<size>_<store_size> maps to the implementation
// that stores exactly 'store_size' elements; invalid combinations are NO_STORE.
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16


// Implementations: store the low N lanes of DATA. Sizes 5-7 decompose as
// 4 + remainder, 9-15 as 8 + remainder. 13-15 pass the full .s89abcdef swizzle;
// the inner macro only stores its first 5/6/7 lanes.
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
6354
6355
6356
6357
6358
// Saturating-conversion aliases: float/half targets have no _sat built-in
// (saturation is meaningless for floating point), so map *_sat to the plain
// conversion to keep CONVERT_SAT usable for any destination type.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): maps to convert_float, not convert_half — looks inconsistent
// with the vector variants below; confirm against the generator before relying on it.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Size-1 conversion aliases (no convert_<type>1 built-ins exist).
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Size-1 saturating aliases; the uchar2..16 entries are self-referential
// (macro expanding to its own name is legal and leaves the built-in in place).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

// VEC_DATA_TYPE(type, size): build a vector type name, e.g. (float, 4) -> float4.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// CONVERT(x, type): plain conversion of x to 'type'.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

// CONVERT_SAT(x, type): saturating conversion of x to 'type'.
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

// CONVERT_SAT_ROUND(x, type, round): saturating conversion with explicit
// rounding mode suffix (e.g. rte, rtz).
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
6412
// select_vec_dt_<type>(size): integer vector type matching <type>'s element
// width, as required for the predicate operand of OpenCL select() (half -> short,
// float -> int).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// signed_int_vec_dt_<type>(size): signed integer vector type with the same
// element width as <type>.
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
6442
// Horizontal reductions over a vector's components, built by recursive halving.
// sum_reduce_n(x): sum of all n components.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// prod_reduce_n(x): product of all n components.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// max_reduce_n(x): maximum of all n components.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
6472
// Kernel-parameter declaration macros: expand to the flattened argument list
// (base pointer + per-dimension stride/step + first-element byte offset) that
// the host side passes for an N-dimensional tensor named 'name'.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes
6522
// Build the Vector/Image/Tensor structs for the current work-item from the
// flattened arguments declared by the *_DECLARATION macros above. The *_NO_STEP
// variants pass 0 for the steps so the returned pointer is NOT advanced by the
// work-item ids.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): unlike the other NO_STEP variants, this one keeps name##_step_z
// (only x and y steps are zeroed) — presumably so the z plane is still selected
// per work-item; confirm against callers.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// Identical redefinition of CONVERT_TENSOR3D_TO_IMAGE_STRUCT above; legal per
// the C preprocessor (token-for-token equal) and harmless.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

// Like CONVERT_TO_TENSOR3D_STRUCT but the returned pointer is left at the
// tensor base (no per-work-item advance).
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
6561
6562
// Descriptor for a 1D tensor: base pointer (already offset per work-item by the
// update_* helpers), byte offset of the first element, and byte stride along x.
typedef struct Vector
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
} Vector;


// Descriptor for a 2D tensor; strides are in bytes.
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;


// Descriptor for a 3D tensor; strides are in bytes.
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;


// Descriptor for a 4D tensor; strides are in bytes.
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;
6599
6600
6601inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
6602{
6603    Vector vector =
6604    {
6605        .ptr                           = ptr,
6606        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6607        .stride_x                      = stride_x,
6608    };
6609    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
6610    return vector;
6611}
6612
6613
6614inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
6615{
6616    Image img =
6617    {
6618        .ptr                           = ptr,
6619        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6620        .stride_x                      = stride_x,
6621        .stride_y                      = stride_y
6622    };
6623    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
6624    return img;
6625}
6626
6627
6628inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6629{
6630    Image img =
6631    {
6632        .ptr                           = ptr,
6633        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6634        .stride_x                      = stride_x,
6635        .stride_y                      = stride_y
6636    };
6637    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6638    return img;
6639}
6640
6641
6642inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6643{
6644    Tensor3D tensor =
6645    {
6646        .ptr                           = ptr,
6647        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6648        .stride_x                      = stride_x,
6649        .stride_y                      = stride_y,
6650        .stride_z                      = stride_z
6651    };
6652    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
6653    return tensor;
6654}
6655
6656
6657inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
6658{
6659    Tensor3D tensor =
6660    {
6661        .ptr                           = ptr,
6662        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6663        .stride_x                      = stride_x,
6664        .stride_y                      = stride_y,
6665        .stride_z                      = stride_z
6666    };
6667    return tensor;
6668}
6669
6670inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
6671                                             uint step_w,
6672                                             uint mod_size)
6673{
6674    Tensor4D tensor =
6675    {
6676        .ptr                           = ptr,
6677        .offset_first_element_in_bytes = offset_first_element_in_bytes,
6678        .stride_x                      = stride_x,
6679        .stride_y                      = stride_y,
6680        .stride_z                      = stride_z,
6681        .stride_w                      = stride_w
6682    };
6683
6684    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
6685    return tensor;
6686}
6687
6688
6689inline __global const uchar *vector_offset(const Vector *vec, int x)
6690{
6691    return vec->ptr + x * vec->stride_x;
6692}
6693
6694
6695inline __global uchar *offset(const Image *img, int x, int y)
6696{
6697    return img->ptr + x * img->stride_x + y * img->stride_y;
6698}
6699
6700
6701inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
6702{
6703    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
6704}
6705
6706
6707inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
6708{
6709    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
6710}
6711
6712
6713inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
6714{
6715    uint num_elements = width * height;
6716
6717    const uint z = index / num_elements;
6718
6719    index %= num_elements;
6720
6721    const uint y = index / width;
6722
6723    index %= width;
6724
6725    const uint x = index;
6726
6727    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
6728}
6729
6730#endif
6731
6732
6733
// REPEAT_3_<K>(P_X, P_A, P_B, P_C): expand P_X##_DEF(id, P_A, P_B, P_C) K times,
// with id running K-1 down to 0. Ids 10..15 are spelled as hex digits A..F so the
// generated identifier suffix stays a single character.
#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
    P_X##_DEF(1, P_A, P_B, P_C);       \
    REPEAT_3_1(P_X, P_A, P_B, P_C)
#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
    P_X##_DEF(2, P_A, P_B, P_C);       \
    REPEAT_3_2(P_X, P_A, P_B, P_C)
#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
    P_X##_DEF(3, P_A, P_B, P_C);       \
    REPEAT_3_3(P_X, P_A, P_B, P_C)
#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
    P_X##_DEF(4, P_A, P_B, P_C);       \
    REPEAT_3_4(P_X, P_A, P_B, P_C)
#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
    P_X##_DEF(5, P_A, P_B, P_C);       \
    REPEAT_3_5(P_X, P_A, P_B, P_C)
#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
    P_X##_DEF(6, P_A, P_B, P_C);       \
    REPEAT_3_6(P_X, P_A, P_B, P_C)
#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
    P_X##_DEF(7, P_A, P_B, P_C);       \
    REPEAT_3_7(P_X, P_A, P_B, P_C)
#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
    P_X##_DEF(8, P_A, P_B, P_C);       \
    REPEAT_3_8(P_X, P_A, P_B, P_C)
#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
    P_X##_DEF(9, P_A, P_B, P_C);        \
    REPEAT_3_9(P_X, P_A, P_B, P_C)
#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
    P_X##_DEF(A, P_A, P_B, P_C);        \
    REPEAT_3_10(P_X, P_A, P_B, P_C)
#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
    P_X##_DEF(B, P_A, P_B, P_C);        \
    REPEAT_3_11(P_X, P_A, P_B, P_C)
#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
    P_X##_DEF(C, P_A, P_B, P_C);        \
    REPEAT_3_12(P_X, P_A, P_B, P_C)
#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
    P_X##_DEF(D, P_A, P_B, P_C);        \
    REPEAT_3_13(P_X, P_A, P_B, P_C)
#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
    P_X##_DEF(E, P_A, P_B, P_C);        \
    REPEAT_3_14(P_X, P_A, P_B, P_C)
#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
    P_X##_DEF(F, P_A, P_B, P_C);        \
    REPEAT_3_15(P_X, P_A, P_B, P_C)

// Two-level expansion so P_NUM is macro-expanded before being pasted onto REPEAT_3_.
#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
6783
6784
// REPEAT_4_<K>(P_X, P_A, P_B, P_C, P_D): same scheme as REPEAT_3_<K> but with four
// forwarded parameters; id runs K-1 down to 0 (hex digits A..F for 10..15).
#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(1, P_A, P_B, P_C, P_D);       \
    REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(2, P_A, P_B, P_C, P_D);       \
    REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(3, P_A, P_B, P_C, P_D);       \
    REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(4, P_A, P_B, P_C, P_D);       \
    REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(5, P_A, P_B, P_C, P_D);       \
    REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(6, P_A, P_B, P_C, P_D);       \
    REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(7, P_A, P_B, P_C, P_D);       \
    REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(8, P_A, P_B, P_C, P_D);       \
    REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(9, P_A, P_B, P_C, P_D);        \
    REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(A, P_A, P_B, P_C, P_D);        \
    REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(B, P_A, P_B, P_C, P_D);        \
    REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(C, P_A, P_B, P_C, P_D);        \
    REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(D, P_A, P_B, P_C, P_D);        \
    REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(E, P_A, P_B, P_C, P_D);        \
    REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
    P_X##_DEF(F, P_A, P_B, P_C, P_D);        \
    REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)

// Two-level expansion so P_NUM is macro-expanded before being pasted onto REPEAT_4_.
#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D)
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
6834
6835
// Declare N variables VAR0..VAR{N-1} of TYPE, each initialized to VAL.
#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)


// Declare VAR_OUT0..VAR_OUT{N-1} = CONVERT(VAR_IN<i>, TYPE_OUT).
#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)


// Same as above but with saturating conversion.
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)


// VAR<i> += (TYPE)VAL for i in 0..N-1.
#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)


// VAR_A<i> += VAR_B<i> * VAL for i in 0..N-1 (multiply-accumulate with a constant vector).
#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)


// VAR<i> += VEC for i in 0..N-1; the unused TYPE slot is filled with "".
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)


// VAR_A<i> += VAR_B<i> for i in 0..N-1; the unused TYPE slot is filled with "".
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)


// VAR<i> = max(VAR<i>, (TYPE)VAL) for i in 0..N-1 (lower clamp).
#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)


// VAR<i> = min(VAR<i>, (TYPE)VAL) for i in 0..N-1 (upper clamp).
#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
6870
6871
// Apply the fixed-point requantization multiplier (multiplier > 1 path) to VAR0..VAR{N-1}.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)


// Apply the fixed-point requantization multiplier (multiplier < 1 path) to VAR0..VAR{N-1}.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
6878
6879
// Per-channel requantization of VAR<ID>: compute both fixed-point paths and select
// per lane on the sign of RES_SHIFT (shift < 0 => effective multiplier > 1).
// FIX: the original wrote `VAR##ID_shift_lt0`, which pastes VAR with the single
// token `ID_shift_lt0`, so the ID parameter was never substituted — every
// repetition declared the same temporary name (only harmless because each `({})`
// statement expression has its own scope). `VAR##ID##_shift_lt0` produces the
// intended per-ID identifiers with identical runtime behavior.
// NOTE(review): the vector width is taken from N0 (expected to be defined at the
// call site) rather than the SIZE parameter — confirm SIZE == N0 at all call sites.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                       \
    ({                                                                                                          \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
        VEC_DATA_TYPE(int, N0)                                                                                  \
        VAR##ID##_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
        VAR##ID             = select(VAR##ID##_shift_lt0, VAR##ID##_shift_gt0, RES_SHIFT >= 0);                 \
    })
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
6889
6890#endif
6891
6892#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6893#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6894
6895
6896
6897
// Map a tile width W (1..16) to the OpenCL vector width used to store one tile row:
// widths 5-8 are padded up to 8 lanes and 9-16 to 16 lanes (OpenCL has no 5/6/7-wide
// vector types).
#define TILE_VECTOR_SIZE1 1
#define TILE_VECTOR_SIZE2 2
#define TILE_VECTOR_SIZE3 3
#define TILE_VECTOR_SIZE4 4
#define TILE_VECTOR_SIZE5 8
#define TILE_VECTOR_SIZE6 8
#define TILE_VECTOR_SIZE7 8
#define TILE_VECTOR_SIZE8 8
#define TILE_VECTOR_SIZE9 16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16
6914
// Map a tile width W (1..16) to the concrete OpenCL vector type (e.g. int8) holding
// one tile row; padding mirrors TILE_VECTOR_SIZE (5-8 -> 8 lanes, 9-16 -> 16 lanes).
#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
6931
6932
// Declare BASENAME as an array of H tile rows; each row is a union giving both
// per-element scalar access (.s[]) and whole-row vector access (.v), sized/typed
// per TILE_VECTOR_SIZE/TILE_VECTOR_TYPE for width W.
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
    union {                                 \
        DATA_TYPE                      s[TILE_VECTOR_SIZE##W];                  \
        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                     \
    } BASENAME[H]
6939
// Kernel parameter list for a 4D tensor backed by both a 2D image object
// (`name##_img`) and a raw buffer pointer with per-dimension strides/steps.
#define TENSOR4D_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint            name##_stride_x,  \
    uint            name##_step_x,    \
    uint            name##_stride_y,  \
    uint            name##_step_y,    \
    uint            name##_stride_z,  \
    uint            name##_step_z,    \
    uint            name##_stride_w,  \
    uint            name##_step_w,    \
    uint            name##_offset_first_element_in_bytes
6952
// Kernel parameter list for a 4D tensor backed by a raw buffer only.
#define TENSOR4D_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

// Dispatch on `type` (IMAGE or BUFFER); the extra STR level forces macro expansion
// of `type` before token pasting.
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
6967
// "T" (tensor-typed) 4D parameter list: carries only the Y/Z/W strides plus the
// dimensions c, w, h, n instead of per-dimension step arguments.
#define TENSOR4D_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_T_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_stride_w, \
    uint        name##_c,   \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Dispatch on `type` (IMAGE or BUFFER) with an expansion-forcing STR level.
#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)


#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
6995
// Read-only variant of TENSOR4D_T: the IMAGE flavour adds a __read_only image2d_t
// in front of the plain buffer parameter list.
#define TENSOR4D_RO_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)


#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
7006
// Write-only variant of TENSOR4D_T: the IMAGE flavour adds a __write_only image2d_t
// in front of the plain buffer parameter list.
#define TENSOR4D_WO_T_IMAGE(name)          \
    __write_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)


#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
7017
// "T" (tensor-typed) 3D parameter list: carries the Y/Z strides plus the
// dimensions w, h, n instead of per-dimension step arguments.
#define TENSOR3D_T_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_T_BUFFER(name)    \
    __global uchar *name##_ptr,  \
    uint        name##_stride_y, \
    uint        name##_stride_z, \
    uint        name##_w,   \
    uint        name##_h,   \
    uint        name##_n,   \
    uint        name##_offset_first_element_in_bytes

// Dispatch on `type` (IMAGE or BUFFER) with an expansion-forcing STR level.
#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
7039
7040#if !defined(UNROLL_WITH_PRAGMA)
7041#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
7042
7043#define LOOP_UNROLLING_1(idx, step, macro) (macro)
7044#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
7045#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
7046#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
7047#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
7048#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
7049#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
7050#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
7051#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
7052#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
7053#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
7054#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
7055#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
7056#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
7057#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
7058#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
7059#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
7060#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
7061#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
7062#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
7063#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
7064#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
7065#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
7066#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
7067#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
7068#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
7069#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
7070#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
7071#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
7072#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
7073#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
7074#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
7075#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
7076#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
7077#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
7078#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
7079#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
7080#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
7081#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
7082#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
7083#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
7084#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
7085#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
7086#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
7087#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
7088#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
7089#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
7090#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
7091#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
7092#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
7093#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
7094#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
7095#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
7096#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
7097#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
7098#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
7099#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
7100#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
7101#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
7102#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
7103#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
7104#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
7105#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
7106#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
7107#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
7108#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
7109#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
7110#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
7111#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
7112#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
7113#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
7114#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
7115#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
7116#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
7117#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
7118#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
7119#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
7120#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
7121#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
7122#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
7123#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
7124#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
7125#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
7126#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
7127#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
7128#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
7129#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
7130#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
7131#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
7132#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
7133#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
7134#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
7135#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
7136#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
7137#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
7138#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
7139#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
7140#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
7141#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
7142#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
7143#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
7144#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
7145#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
7146#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
7147#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
7148#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
7149#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
7150#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
7151#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
7152#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
7153#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
7154#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
7155#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
7156#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
7157#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
7158#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
7159#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
7160#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
7161#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
7162#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
7163#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
7164#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
7165#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
7166#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
7167#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
7168#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
7169#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
7170#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
7171
// Fully-unrolled loop variant: opens a scope, declares `idx` initialized to
// `start`, and expands the body `num` times via the LOOP_UNROLLING_##num
// chain above. Selected by a preprocessor condition that begins before this
// section (see the matching #else below for the pragma-based alternative).
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        type idx = start;                                      \
        LOOP_UNROLLING_##num(idx, step, macro);                \
    }
7177#else
// Alternative loop variant: a real `for` loop annotated with
// _Pragma("unroll"), letting the OpenCL compiler perform the unrolling.
// Note the iteration bound is (num * step) with idx advancing by step, i.e.
// the body runs `num` times, matching the macro-expansion variant above.
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    {                                                          \
        _Pragma("unroll")                                      \
        for(type idx = start; idx < (num * step); idx += step) \
        {                                                      \
            (macro);                                           \
        }                                                      \
    }
7186#endif
7187#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
7188
7189
7190#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
7191
7192
// K0-wide 8-bit integer dot product: c += dot(a, b). The _STR indirection
// expands K0 before pasting it into the concrete DOT_PRODUCT<K0>_INTEGER8
// macro name.
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
// K0 == 1: plain scalar multiply-accumulate in the accumulator type.
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
    })
// 2-, 3- and 4-element 8-bit integer dot products (c += dot(a, b)), with
// four compile-time specializations picked by extension availability:
//   1) cl_khr_integer_dot_product                 : overloaded dot() builtin
//   2) cl_arm_integer_dot_product_accumulate_int8 : arm_dot_acc() builtin
//   3) cl_arm_integer_dot_product_int8            : arm_dot() builtin
//   4) fallback                                   : scalar multiply-accumulate
// The 2- and 3-element variants zero-pad their operands to 4 components so
// the 4-wide hardware dot-product instruction can still be used.
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
// Scalar fallback. Parameters renamed a/b/c (previously x/y/val in the
// 4-element variant) for consistency with every other DOT_PRODUCTn_INTEGER8
// definition; behavior is unchanged.
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
    ({                                                  \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
    ({                                                  \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
    ({                                                  \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
        c += (C_DATA_TYPE)(a).s3 * (C_DATA_TYPE)(b).s3; \
    })
#endif
// Dot products for widths 5..16, composed from the 1..4-wide primitives
// above using OpenCL vector swizzles: widths 5..8 split as 4 + remainder,
// widths 9..16 split as 8 + remainder (.lo/.hi are the low/high halves).
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c);     \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c);     \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c);     \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                 \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
    })
7290
7291
// Horizontal sum of a K0-wide 8-bit vector, implemented as a dot product
// against an all-ones vector of type TILE_VECTOR_TYPE##K0(B_DATA_TYPE).
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
7294
7295
// WIDTH-element vector load, dispatched on TENSOR_TYPE (BUFFER or IMAGE).
// BUFFER: VLOAD from TENSOR_ptr using byte offset X*sizeof + Y*STRIDE_Y.
// IMAGE: READ_IMAGE2D from TENSOR_img; X is divided by 4 to convert the
// element coordinate into pixel units (4 elements per pixel), STRIDE_Y unused.
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH)                                                \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
7302
7303
// WIDTH-element vector store, the mirror of V_LOAD: dispatched on
// TENSOR_TYPE, writing VALUES either via VSTORE to a buffer (same addressing
// as V_LOAD_BUFFER) or via WRITE_IMAGE2D (X converted to pixel units).
#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH)                                                \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
7310
7311
// Load a HEIGHT x WIDTH tile into dst: row _i is read from row
// Y + _i * YI_MULTIPLIER of the tensor via V_LOAD, so YI_MULTIPLIER lets
// consecutive tile rows come from strided tensor rows.
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
    ({                                                                                                                 \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
        {                                                                                                              \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        })                                                                                                             \
    })
7319
7320
// Load a HEIGHT x WIDTH tile where each row's Y coordinate comes from the
// per-row gather table indirect_y[_i].v instead of a linear offset.
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
    ({                                                                                                  \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
        {                                                                                               \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        })                                                                                              \
    })
7328
7329
// Indirect tile load with a runtime-selected width: when WIDTH1_CONDITION
// holds, only WIDTH1 (< WIDTH0) elements per row are read via VLOAD_PARTIAL,
// otherwise a full WIDTH0 row is read via V_LOAD. Rows are processed in
// reverse (HEIGHT - 1 - _i). NOTE(review): the partial path addresses the
// buffer directly (TENSOR##_ptr), so it supports BUFFER tensors only, while
// the full-width path honors TENSOR_TYPE dispatch.
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y)                                                      \
    ({                                                                                                                                                                                             \
        if(WIDTH1_CONDITION)                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                VLOAD_PARTIAL(WIDTH0, WIDTH1)                                                         \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y));               \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
        else                                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
    })
7348
// Load a TILE_HEIGHT x TILE_WIDTH x TILE_CHANNELS patch from an NHWC tensor.
// The (x, y) position of each element is flattened into a single row index
// _src_y = x + y * TENSOR_WIDTH (+ batch offset B * W * H), then loaded via
// V_LOAD. Out-of-bounds positions fail the _src_valid_y check and are
// skipped, leaving the corresponding dst entries unmodified (no zero-fill).
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst)   \
    ({                                                                                                                                                \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT,                                                                                                   \
        {                                                                                                                                             \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH,                                                                                                \
            {                                                                                                                                         \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH);                                                                                \
                _src_y    += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                        \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0)                                                                                                                 \
                {                                                                                                                                     \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                     \
                }                                                                                                                                     \
            })                                                                                                                                        \
        })                                                                                                                                            \
    })
7365
7366
// NHWC tile load with per-axis dilation: element (_xk, _yk) reads position
// (X + _xk * DILATION_X, Y + _yk * DILATION_Y). When BOUNDARY_CHECK is false
// every element is loaded unconditionally via direct stride arithmetic
// (stride_y/z/w); otherwise only in-bounds positions (per _src_valid_y) are
// loaded and the rest of dst is left unmodified. Addresses the buffer
// directly, so this path assumes a BUFFER tensor despite taking TENSOR_TYPE.
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst)         \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w    = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS)                                                \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    }                                                                                                                                                                                                 \
                } \
            })                                                                                                                                                                                                             \
        })                                                                                                                                                                                                             \
    })
7393
7394
// NHWC tile load with precomputed per-element spatial offsets: element _i
// reads position (X + xi[_i].v, Y + yi[_i].v), flattened to a row index
// plus the batch offset. Out-of-bounds elements are skipped (dst entry left
// unmodified).
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst)                \
    ({                                                                                                                                                                \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
        {                                                                                                                                                             \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH);                                                                                          \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT);                                                                                               \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0)                                                                                                                                     \
            {                                                                                                                                                         \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
            }                                                                                                                                                         \
        })                                                                                                                                                            \
    })
7408
7409
// 2D indirect tile load, dispatched on TENSOR_TYPE. Row indices come from
// the packed table yi[0].s[_i]. BUFFER variant: negative indices act as an
// out-of-bounds sentinel and are skipped (dst entry left unmodified).
// IMAGE variant: loads unconditionally — image reads clamp out-of-range
// coordinates in hardware, so no index guard is needed.
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })
7430
7431
// 3D (NDHWC) variant of T_LOAD_NHWC_INDIRECT: flattens (x, y, z) offsets
// into a single row index using TENSOR_WIDTH and TENSOR_WIDTH*TENSOR_HEIGHT
// as the y/z pitches, plus the batch offset over W*H*D. The bounds check
// covers all three axes; out-of-bounds elements are skipped.
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({                                                                                                                                                                \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA,                                                                                                                      \
        {                                                                                                                                                             \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT);                                      \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH);                                                                         \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)   \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH));                                                                     \
            if(_src_valid_y != 0)                                                                                                                                     \
            {                                                                                                                                                         \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y);                                                               \
            }                                                                                                                                                         \
        })                                                                                                                                                            \
    })
7446
7447
// Store a tile with per-row indirect Y addresses and a runtime-selected
// width: the WIDTH1_CONDITION path writes WIDTH1 (< WIDTH0) elements per row
// via VSTORE_PARTIAL, otherwise full WIDTH0 rows via VSTORE. Each row is
// CONVERTed to DATA_TYPE before storing, rows are processed in reverse, and
// both paths address the buffer directly (BUFFER tensors only, despite the
// TENSOR_TYPE parameter).
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y)                                                      \
    ({                                                                                                                                                                                             \
        if(WIDTH1_CONDITION)                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                VSTORE_PARTIAL(WIDTH0, WIDTH1)                                                                                                                                                     \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
        else                                                                                                                                                                                       \
        {                                                                                                                                                                                          \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                                                                                                  \
            {                                                                                                                                                                                      \
                VSTORE(WIDTH0)                                                                                                                                                                     \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            })                                                                                                                                                                                     \
        }                                                                                                                                                                                          \
    })
7467
7468
// Quantized-GEMM offset correction: for each output element dst[_m0].s[_n0]
// it adds (sum over _k0 of lhs row * WEI_OFFSET) — computed once per row as
// _tm — plus (sum over _k0 of rhs column * SRC_OFFSET), all in
// ACC_DATA_TYPE. These are the cross terms that arise when lhs/rhs values
// carry asymmetric zero-point offsets.
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst)        \
    ({                                                                                               \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                           \
        {                                                                                            \
            ACC_DATA_TYPE _tm = 0;                                                                   \
            LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                       \
            {                                                                                        \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET);                 \
            })                                                                                       \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                       \
            {                                                                                        \
                dst[_m0].s[_n0] += _tm;                                                              \
                LOOP_UNROLLING(int, _k0, 0, 1, K0,                                                   \
                {                                                                                    \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                })                                                                                   \
            })                                                                                       \
        })                                                                                          \
    })
7488
7489
// Dispatch macro: expands to T_QUANTIZE8_<QUANTIZATION_TYPE> (PER_TENSOR or
// PER_CHANNEL below). The _STR level forces macro-argument expansion before pasting.
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
7492
7493
// Requantize an M0 x N0 tile with a single per-tensor DST_MULTIPLIER/DST_SHIFT:
// gemmlowp-style saturating rounding-doubling high multiply (64-bit product,
// +/- 2^30 nudge, divide by 2^31, clamp to INT_MAX on the INT_MIN*INT_MIN
// overflow case), then a rounding right shift by DST_SHIFT (negative shifts
// are applied up-front as a left shift on the input), add DST_OFFSET and
// saturate-convert to DST_DATA_TYPE. dst_multipliers/dst_shifts are unused here.
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                          \
    })
7524
7525
// Requantize an M0 x N0 tile with per-output-channel parameters: the
// multiplier and shift for column _n0 come from dst_multipliers[0]/dst_shifts[0].
// Same rounding-doubling high multiply as T_QUANTIZE8_PER_TENSOR; the final
// rounding shift is computed branchlessly via select() instead of an if.
// any(_tmp) yields 1 when _tmp is negative (sign bit set), matching the
// "+1 threshold for negative values" branch of the per-tensor variant.
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                         \
    })
7558
7559
// Requantize an M0 x N0 tile for asymmetric quantization. The arithmetic is
// identical to T_QUANTIZE8_PER_TENSOR (rounding-doubling high multiply,
// rounding right shift, DST_OFFSET, saturating cast); the only interface
// difference is that the unused per-channel tile arguments are dropped.
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst)                          \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE);                                                                            \
            })                                                                                                                                          \
        })                                                                                                                                          \
    })
7590
7591
// For each row _m0 whose mask value mask[_m0].v equals 0, overwrite every
// element of that row in tile 'a' with VALUE_TO_SET; rows with non-zero mask
// are left untouched (select keeps the original element).
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask)                                                                                            \
    ({                                                                                                                                                     \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                                                                                 \
        {                                                                                                                                                  \
            LOOP_UNROLLING(int, _n0, 0, 1, N0,                                                                                                             \
            {                                                                                                                                              \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            })                                                                                                                                             \
        })                                                                                                                                                 \
    })
7602
7603
// Apply the (float-path) ACTIVATION macro row-by-row to an M0 x N0 tile,
// writing the result into dst. A_VAL/B_VAL are the activation parameters.
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
    ({                                                                                         \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
        {                                                                                      \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        })                                                                                     \
    })
7611
7612
// Quantized activation primitives. ZERO_VALUE is the quantized representation
// of 0; A_VAL/B_VAL are the (already quantized) upper/lower bounds.
// relu: max(ZERO_VALUE, x)
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))

// bounded relu: clamp to [ZERO_VALUE, A_VAL]
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))

// lower/upper bounded relu: clamp to [B_VAL, A_VAL]
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))

// hard swish: x * relu6(x + 3) / 6 (0.166666667f approximates 1/6)
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

// identity: pass-through
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

// Dispatch: ACTIVATION_QUANTIZED(op, ...) expands op to <op>_op_quantized(...)
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7625
// Element-wise binary operator helpers used by the T_ELTWISE* macros.
// Both operands are parenthesised so operator precedence is preserved
// when the arguments are themselves compound expressions.
#define V_ADD(LHS, RHS) ((LHS) + (RHS))
#define V_SUB(LHS, RHS) ((LHS) - (RHS))
#define V_DIV(LHS, RHS) ((LHS) / (RHS))
#define V_MUL(LHS, RHS) ((LHS) * (RHS))
7630
7631
// Apply a quantized activation (see *_op_quantized above) row-by-row to an
// M0 x N0 tile; ZERO_VALUE is the quantized zero-point of the output.
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        })                                                                                          \
    })
7639
7640
// Tile-wise addition: dst = lhs + rhs, row by row over M0 rows of N0 elements.
#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        })                                                        \
    })


// Tile-plus-scalar addition: dst = lhs + (DATA_TYPE)rhs_constant, broadcast
// across every element of every row.
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
        })                                                        \
    })
7657
// Convenience aliases binding an operator to the broadcast helpers below.
// *_RHS_X_* variants broadcast rhs row 0 across all lhs rows;
// *_LHS_X_* variants broadcast lhs row 0 across all rhs rows.
// T_ELTWISE_BROADCAST_ADD_X is kept as a synonym of the RHS variant.
#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7669
7670
// Tile-times-scalar: dst = lhs * (DATA_TYPE)rhs_constant, per row.
#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({                                                            \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
        {                                                         \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
        })                                                        \
    })
7678
7679
// Element-wise op with rhs broadcast: dst[_m0] = op(lhs[_m0], rhs[0]);
// both operands are converted to DST_DATA_TYPE vectors of width N0 first.
#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })


// Element-wise op with lhs broadcast: dst[_m0] = op(lhs[0], rhs[_m0]).
#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
7696
// Operator-bound aliases for the generic element-wise tile op below.
#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)


// Generic element-wise tile op: dst[_m0] = op(lhs[_m0], rhs[_m0]) with both
// operands converted to DST_DATA_TYPE vectors of width N0.
#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
7710
7711
// Tile-wise floor: dst[_m0] = floor(src[_m0]) after conversion to a
// DST_DATA_TYPE vector of width N0 (DST_DATA_TYPE must be a float type
// for floor() to apply).
#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
    ({                                                      \
        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
        {                                                   \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
        })                                                  \
    })
7719
7720
// Tile matrix-multiply dispatch: first on layout (only NT lhs / T rhs is
// defined here), then on the data-type triple, landing on either the FMA
// float path or the 8-bit integer dot-product path.
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
// Float path: dst[m][n] += sum_k lhs[m][k] * rhs[n][k] accumulated with fma.
// Note: unlike its siblings this macro uses a plain { } block, not ({ }).
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
    {                                                                                     \
        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
        {                                                                                 \
            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
            {                                                                             \
                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
                {                                                                         \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                })                                                                        \
            })                                                                            \
        })                                                                                \
    }

// Integer8 path: each dst[m][n] accumulates a K0-wide int8 dot product.
#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
    ({ \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            })                                                                                             \
        })                                                                                             \
    })
7753
7754#endif
7755
7756#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
7757
// ARM_DOT(x, y, val): 4-element int8 dot product accumulated into val,
// mapped onto the cl_arm_integer_dot_product hardware extensions when both
// the build flag and the device extension are present (arm_dot_acc fuses
// the accumulate; plain arm_dot needs an explicit +=).
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
#else
#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
#endif
#endif
7765
7766#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
7767
// Hardware dot-product path: ARM_DOTn computes an n-element dot product into
// c by padding narrower operands with zeros up to the 4-wide ARM_DOT, and by
// splitting wider operands into lo/hi halves.
#define ARM_DOT1(a, b, c)                                                                                                                               \
    ({                                                                                                                                                  \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
    })
#define ARM_DOT2(a, b, c)                                                                                                                               \
    ({                                                                                                                                                  \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
    })
#define ARM_DOT3(a, b, c)                                                                                           \
    ({                                                                                                              \
        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
    })
#define ARM_DOT4(a, b, c) \
    ({                    \
        ARM_DOT(a, b, c); \
    })
#define ARM_DOT8(a, b, c)            \
    ({                               \
        ARM_DOT4((a.lo), (b.lo), c); \
        ARM_DOT4((a.hi), (b.hi), c); \
    })
#define ARM_DOT16(a, b, c)           \
    ({                               \
        ARM_DOT8((a.lo), (b.lo), c); \
        ARM_DOT8((a.hi), (b.hi), c); \
    })
7794
7795#else
7796
7797
// Software fallback path (no dot-product extension): ARM_DOTn accumulates the
// n products one element at a time, widening each operand to ACC_DATA_TYPE
// before the multiply to avoid 8-bit overflow.
#define ARM_DOT1(a, b, c)          \
    ({                             \
        c += (ACC_DATA_TYPE)a * b; \
    })
#define ARM_DOT2(a, b, c)                \
    ({                                   \
        c += (ACC_DATA_TYPE)a.s0 * b.s0; \
        c += (ACC_DATA_TYPE)a.s1 * b.s1; \
    })
#define ARM_DOT3(a, b, c)                \
    ({                                   \
        ARM_DOT2(a, b, c);               \
        c += (ACC_DATA_TYPE)a.s2 * b.s2; \
    })
#define ARM_DOT4(a, b, c)                \
    ({                                   \
        ARM_DOT3(a, b, c);               \
        c += (ACC_DATA_TYPE)a.s3 * b.s3; \
    })
#define ARM_DOT8(a, b, c)            \
    ({                               \
        ARM_DOT4((a.lo), (b.lo), c); \
        ARM_DOT4((a.hi), (b.hi), c); \
    })
#define ARM_DOT16(a, b, c)           \
    ({                               \
        ARM_DOT8((a.lo), (b.lo), c); \
        ARM_DOT8((a.hi), (b.hi), c); \
    })
7827#endif
7828
7829
// ARM_DOT_K0Xn: dot one LHS row 'a' against n RHS rows b0..b(n-1), writing
// each k0-wide dot product into the matching component of accumulator c
// (c itself for n==1, c.s0..c.sF otherwise). Built incrementally: each wider
// variant reuses the previous one and adds the remaining rows.
#define ARM_DOT_K0X1(k0, a, b, c)         \
    ({                                    \
        ARM_DOT_K0(k0, (a), (b##0), (c)); \
    })
#define ARM_DOT_K0X2(k0, a, b, c)            \
    ({                                       \
        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
    })
#define ARM_DOT_K0X3(k0, a, b, c)            \
    ({                                       \
        ARM_DOT_K0X2(k0, a, b, c);           \
        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
    })
#define ARM_DOT_K0X4(k0, a, b, c)            \
    ({                                       \
        ARM_DOT_K0X3(k0, a, b, c);           \
        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
    })
#define ARM_DOT_K0X8(k0, a, b, c)            \
    ({                                       \
        ARM_DOT_K0X4(k0, a, b, c);           \
        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
    })
#define ARM_DOT_K0X16(k0, a, b, c)           \
    ({                                       \
        ARM_DOT_K0X8(k0, a, b, c);           \
        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
    })
7869
7870
// ARM_MM_K0XN0Xm: multiply m LHS rows a0..a(m-1) against the RHS block b,
// accumulating into the m output rows c0..c(m-1). Each wider variant chains
// the previous one and adds one more ARM_DOT_K0XN0 row.
#define ARM_MM_K0XN0X1(n0, k0, a, b, c)           \
    ({                                            \
        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_K0XN0X2(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X1(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_K0XN0X3(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X2(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_K0XN0X4(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X3(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_K0XN0X5(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X4(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_K0XN0X6(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X5(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_K0XN0X7(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X6(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_K0XN0X8(n0, k0, a, b, c)           \
    ({                                            \
        ARM_MM_K0XN0X7(n0, k0, a, b, c);          \
        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
    })
7910
// Size-driven dispatchers: CONCAT pastes the literal k0/n0/m0 value onto the
// macro family name, selecting ARM_DOT<k0>, ARM_DOT_K0X<n0> or
// ARM_MM_K0XN0X<m0>. k0/n0/m0 must therefore be plain integer literals.
#define ARM_DOT_K0(k0, a, b, c) \
    ({                          \
        CONCAT(ARM_DOT, k0)     \
        ((a), (b), (c));        \
    })

#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
    ({                                 \
        CONCAT(ARM_DOT_K0X, n0)        \
        (k0, (a), b, (c));             \
    })

#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
    ({                                       \
        CONCAT(ARM_MM_K0XN0X, m0)            \
        (n0, k0, a, b, c);                   \
    })
7928
7929
// ARM_MUL_N0Xn (native, non-dot-product path): c += sum_i b_i * a.s_i for
// i in [0, n), where each b_i row is first widened to VECTOR_ACC_TYPE.
// For n==1 the scalar a multiplies b0 directly.
#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c)   \
    ({                                           \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
    })
#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c)        \
    ({                                                \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
        c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
    })
#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c)        \
    ({                                                \
        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c);       \
        c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
    })
#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c)        \
    ({                                                \
        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c);       \
        c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
    })
#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c)        \
    ({                                                \
        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c);       \
        c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
        c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
        c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
        c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
    })
#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c)       \
    ({                                                \
        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c);       \
        c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
        c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
        c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
        c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
        c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
        c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
        c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
        c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
    })
7969
// Native (non-dot-product) matrix multiply: ARM_MM_NATIVE_N0XK0Xm processes
// m LHS rows a0..a(m-1) against RHS block b into outputs c0..c(m-1), each row
// via ARM_MUL_N0XK0 which dispatches on the k0 literal to ARM_MUL_N0X<k0>.
#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c)    \
    ({                                                         \
        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c);   \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
    })
// Dispatch on the k0 literal to the matching ARM_MUL_N0X<k0> variant.
#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({                                              \
        CONCAT(ARM_MUL_N0X, k0)                     \
        (VECTOR_ACC_TYPE, (a), b, (c));             \
    })
// Dispatch on the m0 literal to the matching ARM_MM_NATIVE_N0XK0X<m0> variant.
#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
    ({                                                           \
        CONCAT(ARM_MM_NATIVE_N0XK0X, m0)                         \
        (VECTOR_ACC_TYPE, k0, a, b, c);                          \
    })
8019
#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)

// GEMMLowp matrix multiplication on reshaped inputs: the LHS matrix is stored
// in non-transposed (NT) interleaved blocks and the RHS matrix in transposed
// (T) interleaved blocks. Accumulation is performed in ACC_DATA_TYPE and the
// result is saturated to int32 before being stored.
//
// Expected compile-time definitions (build options): DATA_TYPE, ACC_DATA_TYPE,
// M, N, M0, N0, K0, V0 (LHS interleave factor), H0 (RHS interleave factor),
// PARTIAL_STORE_M0, PARTIAL_STORE_N0; optional: LHS_INTERLEAVE,
// RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH and
// REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D / DEPTH_GEMM3D).
__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                                IMAGE_DECLARATION(rhs),
                                                IMAGE_DECLARATION(dst),
                                                uint k,
                                                uint lhs_stride_z,
                                                uint rhs_stride_z,
                                                uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                ,
                                                uint dst_cross_plane_pad
#endif
                                               )
{

// Size in elements of one reshaped LHS block.
#define LHS_BLOCK_SIZE ((K0) * (M0))

#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif

// Size in elements of one reshaped RHS block.
#define RHS_BLOCK_SIZE ((K0) * (N0))

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items spawned only to pad the dispatch to a multiple do no work.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Starting address of the reshaped LHS blocks for this work-item,
    // accounting for the V0 interleave factor.
    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));

    // Starting address of the reshaped RHS blocks for this work-item,
    // accounting for the H0 interleave factor.
    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);

#if defined(MATRIX_B_DEPTH)
    // Reuse matrix B across batches when its depth is smaller than the batch count.
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_addr += z * rhs_stride_z;
#endif

    // Per-row y-plane offsets for the loads; all zero here (no 3D input reinterpretation).
    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

    // c0 .. c(M0-1): accumulators, one N0-wide vector per LHS row.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);

    for(int i = 0; i < k; i += K0)
    {
        // Load an M0 x K0 block of the LHS.
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);

        // Load an N0 x K0 block of the (transposed) RHS.
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);

        // Accumulate: c += a * b^T.
        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

        // Advance both inputs to the next block pair along K.
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
    }

    // Output address of the M0 x N0 tile computed by this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Compute per-row cross-plane offsets so the 2D result can be stored as 3D.
    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Cross-plane offsets are folded into zout, so step over whole 3D slices here.
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif

    // Boundary flags selecting partial stores at the bottom/right edges.
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Saturate the accumulators to int32 and store the tile.
    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif
8143
#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
// The fixed-point output stage is fused only when all three requantization
// parameters are provided as build options.
#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
#define FUSED_OUTPUT_STAGE_FIXED_POINT
#endif

// GEMMLowp matrix multiplication where only the RHS matrix is reshaped
// (transposed interleaved blocks); the LHS is read in its native layout.
// Two kernels share this body, selected by build option:
//  - gemmlowp_mm_reshaped_only_rhs_t: stores raw int32 accumulators.
//  - ..._fused_output_stage_fixedpoint: additionally applies the offset
//    contribution (K_OFFSET / A_OFFSET / B_OFFSET / bias) and the fixed-point
//    requantization down to DATA_TYPE before the store.
#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
__kernel void gemmlowp_mm_reshaped_only_rhs_t
#endif
(IMAGE_DECLARATION(lhs),
 IMAGE_DECLARATION(rhs),
 IMAGE_DECLARATION(dst),
 uint lhs_stride_z,
 uint rhs_stride_z,
 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
 ,
 uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
 ,
 uint dst_cross_plane_pad
#endif
#if defined(A_OFFSET)
 ,
 IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
 ,
 IMAGE_DECLARATION(sum_row)
#endif
#if defined(ADD_BIAS)
 ,
 VECTOR_DECLARATION(biases)
#endif
#if defined(PER_CHANNEL_QUANTIZATION)
 ,
 VECTOR_DECLARATION(result_multipliers),
 VECTOR_DECLARATION(result_shifts)
#endif
)
{

// Heights derived from the strides (rows per 2D plane).
#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X (K0 * H0)
#else
#define RHS_OFFSET_X (K0 * N0)
#define RHS_STEP_X (K0)
#endif
#define RHS_STEP_LOOP (N0 * K0 * H0)

    uint x  = GET_SPATIAL_IDX(0, 1, 1);
    uint y  = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    uint z  = GET_SPATIAL_IDX(2, 1, 1);
    // First output column handled by this work-item.
    int  xo = (x * N0);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items spawned only to pad the dispatch to a multiple do no work.
    if((xo >= N) || (y >= M))
    {
        return;
    }
#endif

    // Row of the (flattened) LHS read by this work-item.
    uint lhs_y = y + z * FULL_LHS_HEIGHT;

    // Starting offsets inside the reshaped RHS, accounting for the H0
    // interleave factor.
    uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
    uint rhs_offset_y = (x / H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Reuse matrix B across batches when its depth is smaller than the batch count.
    rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset_y += z * rhs_stride_z;
#endif

    // c: M0 x N0 accumulator tile, zero-initialized.
    TILE(ACC_DATA_TYPE, M0, N0, c);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    int i = 0;
    // Vectorized K loop: consume K0 elements per iteration.
    for(; i <= (K - K0); i += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        // Load an M0 x K0 tile of the LHS.
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        // Load an N0 x K0 tile of the reshaped (transposed) RHS.
        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
        })

        // c += a (NT) * b (T).
        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

        rhs_offset_x += RHS_STEP_LOOP;
    }

#if((K % K0) != 0)

    // Scalar tail loop for the remaining K % K0 elements.
    for(; i < K; ++i)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

        rhs_offset_x += 1;
    }
#endif

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)

    // offset_s32 collects all additive offset terms, starting from K_OFFSET.
    TILE(int, M0, N0, c_int);
    TILE(int, M0, N0, offset_s32);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
    })

    // Saturate the accumulators to int32.
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })

#if defined(A_OFFSET)

#if defined(SUM_COL_HAS_BATCHES)
    int sum_col_y = z;
#else
    int sum_col_y = 0;
#endif
    // A_OFFSET * column sums of B, broadcast-added across all M0 rows.
    TILE(int, 1, N0, a_offset_s32);

    T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);

    a_offset_s32[0].v *= A_OFFSET;

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
#endif

#if defined(B_OFFSET)

    // B_OFFSET * row sums of A, one scalar per output row.
    TILE(int, M0, N0, b_offset_s32);

    T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v += b_offset_s32[i].v *B_OFFSET;
    })

#endif

#if defined(ADD_BIAS)

    // Per-column bias, broadcast-added across all M0 rows.
    TILE(int, 1, N0, bias);

    T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
#endif

    // Apply the combined offset term to the int32 accumulators.
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v += offset_s32[i].v;
    })

    TILE(DATA_TYPE, M0, N0, c_lp);

// Requantize down to DATA_TYPE, either with per-channel multiplier/shift
// vectors or with the per-tensor RESULT_MULTIPLIER / RESULT_SHIFT constants.
#if defined(PER_CHANNEL_QUANTIZATION)
    TILE(int, 1, N0, res_mul);
    TILE(int, 1, N0, res_shift);

    T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
    T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);

    T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
#else
    T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
#endif

// Optional clamping of the requantized result.
#if defined(MIN_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
    })
#endif
#if defined(MAX_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
    })
#endif

#else
    // No fused output stage: just saturate the accumulators to int32.
    TILE(int, M0, N0, c_lp);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })
#endif

    // Destination row indices for the indirect store (clamped so out-of-range
    // rows of a partial tile rewrite the last valid row instead of overflowing).
    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
#if defined(REINTERPRET_OUTPUT_AS_3D)
        dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
        dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
        dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D;
#else
        dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z *FULL_DST_HEIGHT;
#endif
    })

    // Partial store along N only for the right-most tile (when N0 doesn't divide N).
    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#else
    T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#endif

#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif
8401
#if defined(GEMMLOWP_MM_NATIVE)

// GEMMLowp matrix multiplication on non-reshaped (native layout) inputs.
// Computes dst = lhs * rhs with integer accumulation; the accumulators are
// converted to int32 before the store. The K loop is split into a vectorized
// part (K0 columns per iteration) and a scalar tail (one column at a time).
// On Midgard GPUs the emulated ARM_MM_NATIVE path is used; otherwise the RHS
// block is transposed on the fly and ARM_MM_K0XN0XM0 is used.
__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
                                 IMAGE_DECLARATION(rhs),
                                 IMAGE_DECLARATION(dst),
                                 uint lhs_stride_z,
                                 uint rhs_stride_z,
                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                 ,
                                 uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                 ,
                                 uint dst_cross_plane_pad
#endif
                                )
{
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items spawned only to pad the dispatch to a multiple do no work.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of the first LHS row for this work-item; COMPUTE_M0_START_ROW
    // shifts partial tiles so every tile stays in-bounds.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of the first RHS column for this work-item.
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Reuse matrix B across batches when its depth is smaller than the batch count.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset += z * rhs_stride_z;
#endif

    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Compute per-row cross-plane offsets so the 3D input can be read as 2D.
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Cross-plane offsets are folded into zlhs, so step over whole 3D slices here.
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else

    lhs_offset += z * lhs_stride_z;

#endif

    // c0 .. c(M0-1): accumulators, one N0-wide vector per LHS row.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);

    int i = 0;

    // Vectorized K loop: consume K0 columns per iteration.
    for(; i <= (K - K0); i += K0)
    {
        // Load an M0 x K0 block of the LHS.
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load a K0 x N0 block of the RHS.
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
#else
        // Transpose the RHS block on the fly and use the transposed multiply path.
        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
#endif

        lhs_offset += K0;
        rhs_offset += K0 * rhs_stride_y;
    }

    // Scalar tail loop: remaining K % K0 columns.
    for(; i < K; ++i)
    {
        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
#else
        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
#endif

        lhs_offset += 1;
        rhs_offset += rhs_stride_y;
    }

    // Output address of the M0 x N0 tile computed by this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Compute per-row cross-plane offsets so the 2D result can be stored as 3D.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else

    dst_addr += z * dst_stride_z;

#endif
    // With COMPUTE_M0_START_ROW shifting partial tiles in-bounds, only the
    // first row block (y == 0) may need a partial store along M.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Convert the accumulators to int32 and store the tile.
    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif
8541
#if defined(GEMMLOWP_MATRIX_A_REDUCTION)

// Row-sum reduction of matrix A for the GEMMLowp offset contribution.
// One work-item reduces a single row (selected by global id 0; global id 1
// selects the batch): the main loop folds 16 elements per iteration into a
// 4-lane vector accumulator, a scalar tail handles COLS_A % 16 leftovers,
// and the lanes are then reduced horizontally. The sum is optionally scaled
// by SCALAR and written out as a single int.
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // 4-lane vector of partial sums plus a scalar accumulator for the tail.
    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
    vec_acc           = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
    ACC_DATA_TYPE acc = 0;

    // Start of the row handled by this work-item.
    __global const DATA_TYPE *row_ptr = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int col = 0;

    // Main loop: fold 16 columns per iteration into the vector accumulator.
    for(; col <= ((int)COLS_A - 16); col += 16)
    {
        const VEC_DATA_TYPE(DATA_TYPE, 16) chunk = vload16(0, row_ptr + col);

        vec_acc += CONVERT(chunk.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
        vec_acc += CONVERT(chunk.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
        vec_acc += CONVERT(chunk.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
        vec_acc += CONVERT(chunk.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
    }

    // Tail loop: leftover columns, one element at a time.
    for(; col < COLS_A; ++col)
    {
        acc += (ACC_DATA_TYPE)row_ptr[col];
    }

    // Horizontal reduction of the vector lanes into the scalar sum.
    acc += vec_acc.s0;
    acc += vec_acc.s1;
    acc += vec_acc.s2;
    acc += vec_acc.s3;

#if defined(SCALAR)
    acc *= (int)SCALAR;
#endif
    *((__global int *)dst.ptr) = (int)acc;
}
#endif
8582
#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)

// Row-sum reduction of matrix A using dot-product (DOT8) instructions.
// Each work-item reduces one row of A: the main loop consumes 32 elements per
// iteration (two vload16s), summing groups of 4 elements via
// DOT_PRODUCT4_INTEGER8 against a vector of ones; the scalar tail loop
// handles the remaining COLS_A % 32 elements. The sum is optionally scaled
// by SCALAR and stored as a single int.
__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
                                               IMAGE_DECLARATION(dst))
{

    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    ACC_DATA_TYPE sum_row = 0;

    // Row selected by global id 0, batch selected by global id 1.
    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // Main loop: 32 elements per iteration, accumulated with dot products
    // against a vector of ones (i.e. a 4-wide horizontal add per call).
    for(; i <= ((int)COLS_A - 32); i += 32)
    {
        VEC_DATA_TYPE(DATA_TYPE, 16)
        a0 = vload16(0, matrix_a + i);

        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);

        // Second 16-element chunk of the 32-element stride.
        a0 = vload16(1, matrix_a + i);

        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
    }

    // Tail loop: leftover columns, one element at a time.
    for(; i < COLS_A; ++i)
    {
        sum_row += (ACC_DATA_TYPE)matrix_a[i];
    }

#if defined(SCALAR)
    sum_row *= (int)SCALAR;
#endif
    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif
8629
#if defined(GEMMLOWP_MATRIX_B_REDUCTION)

// Column-sum reduction of matrix B for the GEMMLowp offset contribution.
// Each work-item reduces VEC_SIZE adjacent columns (global id 0) of one batch
// (global id 1): the main loop folds 4 rows per iteration into a vector
// accumulator, a scalar-row tail handles ROWS_B % 4 leftovers, and the
// (optionally SCALAR-scaled) sums are stored as int with leftover-lane
// handling at the left edge.
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Clamp the x offset so the last (leftover) vector is shifted in-bounds.
    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const uint y      = get_global_id(1);

    // Source columns and destination address for this work-item.
    __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
    __global uchar *dst_addr           = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;

    // Vector accumulator: one partial column sum per lane.
    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
    col_acc = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;

    int row = 0;

    // Main loop: fold 4 rows per iteration into the accumulator.
    for(; row <= ((int)ROWS_B - 4); row += 4)
    {
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        r0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        r1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        r2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        r3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);

        col_acc += CONVERT(r0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
        col_acc += CONVERT(r1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
        col_acc += CONVERT(r2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
        col_acc += CONVERT(r3, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));

        matrix_b += 4 * src_stride_y;
    }

    // Tail loop: remaining ROWS_B % 4 rows, one at a time.
    for(; row < (int)ROWS_B; ++row)
    {
        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
        r0 = VLOAD(VEC_SIZE)(0, matrix_b);

        col_acc += CONVERT(r0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));

        matrix_b += src_stride_y;
    }

#if defined(SCALAR)
    col_acc *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
#endif
    // res0: basename "res" + index 0 is required by STORE_VECTOR_SELECT below.
    VEC_DATA_TYPE(int, VEC_SIZE)
    res0 = CONVERT(col_acc, VEC_DATA_TYPE(int, VEC_SIZE));

    STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
8684
8685#endif
8686
8687#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
8688
8689#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
8690
8691
// Computes the GEMMLowp offset-contribution term for a VEC_SIZE-wide span of
// output columns at position (x, y, z):
//   K_OFFSET
//   + A_OFFSET * sum_col[x .. x+VEC_SIZE)          (if A_OFFSET is defined)
//   + B_OFFSET * sum_row[y], broadcast per lane    (if B_OFFSET is defined)
//   + biases[x .. x+VEC_SIZE)                      (if ADD_BIAS is defined)
// sum_col / sum_row are the column/row reductions produced by the matrix
// reduction kernels; the bias term is folded into b_offset_s32.
inline VEC_INT offset_contribution(
    int x,
    int y,
    int z
#if defined(A_OFFSET)
    ,
    IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
    ,
    IMAGE_DECLARATION(sum_row)
#endif
#if defined(ADD_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif
)
{
    VEC_INT a_offset_s32 = (VEC_INT)0;
    VEC_INT b_offset_s32 = (VEC_INT)0;

    // With a 3D input, z spans depth * batches; divide out the depth to get
    // the batch index used to address the reduction tensors.
    int batch_id = z;
#if defined(DEPTH_INPUT3D)
    batch_id /= (int)DEPTH_INPUT3D;
#endif

#if defined(A_OFFSET)

    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);

// Column sums may be stored per batch or shared across batches.
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
#else
    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
#endif

    a_offset_s32 *= (VEC_INT)A_OFFSET;
#endif

#if defined(B_OFFSET)

    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);

// A single row sum is broadcast to all VEC_SIZE lanes; with a 3D input the
// plane inside the batch (z % DEPTH_INPUT3D) selects the row-sum section.
#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
#else
    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
#endif
    b_offset_s32 *= (VEC_INT)B_OFFSET;
#endif

#if defined(ADD_BIAS)

    // Per-column bias, folded into the b_offset term.
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    b_offset_s32 += (VEC_INT)biases_values;
#endif

    return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
}
8755
#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)

// Adds the precomputed offset contribution (K_OFFSET, A_OFFSET * column sums,
// B_OFFSET * row sums and, optionally, the bias) in place to the int32 result
// produced by one of the gemmlowp matrix multiplication kernels.
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif
#if defined(ADD_BIAS)
                                           ,
                                           VECTOR_DECLARATION(biases)
#endif
                                          )
{
    // Clamp x so the leftover vector at the left edge is shifted in-bounds.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Per-position additive term computed from the reduction tensors.
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    // in_s32_0: basename "in_s32_" + index 0 is required by STORE_VECTOR_SELECT.
    VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Apply the offset term in place.
    in_s32_0 += offset_term_s32;

    // Store back, honouring the leftover lanes of the left-most vector.
    STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
8818
#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)

// Fused kernel: adds the offset contribution (and optional bias) to the int32
// GEMM result, then quantizes down to OUTPUT_DATA_TYPE with an integer
// multiplier and arithmetic right shift:
//   out = convert_sat(((in + offsets + RESULT_OFFSET) * mult) >> shift)
// optionally clamped to [MIN_BOUND, MAX_BOUND]. The multiplier/shift are
// either the per-tensor RESULT_MULTIPLIER / RESULT_SHIFT constants or the
// per-channel result_multipliers / result_shifts vectors.
__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                                         ,
                                                         IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
                                                         ,
                                                         IMAGE_DECLARATION(sum_row)
#endif
                                                         ,
#if defined(ADD_BIAS)
                                                         VECTOR_DECLARATION(biases),
#endif
                                                         TENSOR3D_DECLARATION(dst)
#if defined(PER_CHANNEL_QUANTIZATION)
                                                         ,
                                                         VECTOR_DECLARATION(result_multipliers),
                                                         VECTOR_DECLARATION(result_shifts)
#endif
                                                        )
{
    // Clamp x so the leftover vector at the left edge is shifted in-bounds.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // NOTE(review): x is used directly as a byte offset — assumes the
    // OUTPUT_DATA_TYPE element size is 1 byte; confirm against callers.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    // Per-position additive term computed from the reduction tensors.
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Add the offset contribution.
    in_s32 += offset_term_s32;

    // Add the requantization offset before scaling.
    in_s32 += (VEC_INT)RESULT_OFFSET;

// Scale: per-channel multiplier/shift vectors or per-tensor constants.
#if defined(PER_CHANNEL_QUANTIZATION)
    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);

    in_s32 *= result_multipliers_values;
    in_s32 >>= result_shifts_values;
#else
    in_s32 *= RESULT_MULTIPLIER;

    in_s32 >>= RESULT_SHIFT;
#endif

    // Saturating conversion to the output type.
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

// Optional clamping of the quantized result.
#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif

    // Store, honouring the leftover lanes of the left-most vector.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
8919
#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)

/** Fused kernel: adds the GEMMLowp offset contribution to the int32 matrix-multiply
 * result and quantizes it down to OUTPUT_DATA_TYPE using fixed-point arithmetic,
 * all in a single pass.
 *
 * Compile-time configuration (all resolved by -D build options):
 *  - A_OFFSET / B_OFFSET: enable the column-sum / row-sum contribution inputs
 *    consumed by the offset_contribution() helper.
 *  - ADD_BIAS: enable the per-channel bias vector input.
 *  - PER_CHANNEL_QUANTIZATION: use per-channel multiplier/shift vectors instead of
 *    the uniform RESULT_MULTIPLIER / RESULT_SHIFT constants.
 *  - RESULT_OFFSET: quantization offset added after the fixed-point rescale.
 *  - MIN_BOUND / MAX_BOUND: optional output clamping bounds.
 *  - VEC_SIZE / VEC_SIZE_LEFTOVER: vectorization width and leftover handling.
 *
 * @param mm_result           Source tensor holding the raw int32 GEMM result
 * @param sum_col             (A_OFFSET only) per-column sums used by offset_contribution()
 * @param sum_row             (B_OFFSET only) per-row sums used by offset_contribution()
 * @param biases              (ADD_BIAS only) bias vector consumed by offset_contribution()
 * @param dst                 Destination tensor (OUTPUT_DATA_TYPE)
 * @param result_multipliers  (PER_CHANNEL_QUANTIZATION only) per-channel fixed-point multipliers
 * @param result_shifts       (PER_CHANNEL_QUANTIZATION only) per-channel shifts (sign selects branch)
 */
__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                                                    ,
                                                                    IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
                                                                    ,
                                                                    IMAGE_DECLARATION(sum_row)
#endif
                                                                    ,
#if defined(ADD_BIAS)
                                                                    VECTOR_DECLARATION(biases),
#endif
                                                                    TENSOR3D_DECLARATION(dst)
#if defined(PER_CHANNEL_QUANTIZATION)
                                                                    ,
                                                                    VECTOR_DECLARATION(result_multipliers),
                                                                    VECTOR_DECLARATION(result_shifts)
#endif
                                                                   )
{
    // Clamp x so the leftover (partial) vector is handled by re-processing from the
    // last position where a full VEC_SIZE vector fits; guarded in the store below.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Per-lane int32 term combining the a/b offset contributions and (optionally) the bias.
    VEC_INT offset_term_s32 = offset_contribution(
                                  x, y, z
#if defined(A_OFFSET)
                                  ,
                                  sum_col_ptr,
                                  sum_col_stride_x,
                                  sum_col_step_x,
                                  sum_col_stride_y,
                                  sum_col_step_y,
                                  sum_col_offset_first_element_in_bytes
#endif
#if defined(B_OFFSET)
                                  ,
                                  sum_row_ptr,
                                  sum_row_stride_x,
                                  sum_row_step_x,
                                  sum_row_stride_y,
                                  sum_row_step_y,
                                  sum_row_offset_first_element_in_bytes
#endif
#if defined(ADD_BIAS)
                                  ,
                                  biases_ptr,
                                  biases_stride_x,
                                  biases_step_x,
                                  biases_offset_first_element_in_bytes
#endif
                              );

    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

    // dst element size is assumed to be 1 byte here (x is not scaled) — holds for the
    // 8-bit OUTPUT_DATA_TYPE this kernel is built for.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);

    // Fold the offset contribution into the raw GEMM result.
    in_s32 += offset_term_s32;

#if defined(PER_CHANNEL_QUANTIZATION)
    // Load per-channel requantization parameters for this x position.
    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);

    // Compute both rescale variants and select per lane on the shift sign, since
    // the shift sign can differ per channel within one vector.
    VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
    VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
    in_s32                   = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
#else

// Uniform quantization: the shift sign is known at compile time, pick one branch.
#if RESULT_SHIFT < 0
    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else
    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif

#endif

    // Add the output quantization offset, then saturate-convert to the output type.
    in_s32 += (VEC_INT)RESULT_OFFSET;

    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif

    // Leftover-aware store; only the first workgroup column handles the partial vector.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
9025
9026#undef VEC_INT
9027
9028#endif
9029
#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)

/** Quantizes down an int32 tensor to OUTPUT_DATA_TYPE using integer scale arithmetic:
 * out = saturate_cast(((in [+ bias]) + RESULT_OFFSET) * RESULT_MULT_INT >> RESULT_SHIFT),
 * optionally clamped to [MIN_BOUND, MAX_BOUND].
 *
 * Compile-time options: ADD_BIAS, RESULT_OFFSET, RESULT_MULT_INT, RESULT_SHIFT,
 * MIN_BOUND, MAX_BOUND, VEC_SIZE, VEC_SIZE_LEFTOVER, OUTPUT_DATA_TYPE.
 *
 * @param src    Source int32 tensor
 * @param biases (ADD_BIAS only) per-channel int32 bias vector
 * @param dst    Destination tensor (OUTPUT_DATA_TYPE, assumed 1-byte elements)
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif
                                                  TENSOR3D_DECLARATION(dst))
{
    // Shift x back so the leftover (partial) vector is re-processed as a full one;
    // the guarded store at the bottom prevents out-of-bounds writes.
    const int xi = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int yi = get_global_id(1);
    const int zi = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + xi * sizeof(int) + yi * src_stride_y + zi * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xi + yi * dst_stride_y + zi * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc_s32 = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Accumulate the per-channel bias for this column range.
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + xi * sizeof(int);
    acc_s32 += VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
#endif

    // Requantize: add offset, scale by the integer multiplier, then shift right.
    acc_s32 += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
    acc_s32 *= RESULT_MULT_INT;

// Shift count must be non-negative; negate it when RESULT_SHIFT < 0.
#if RESULT_SHIFT < 0
    acc_s32 >>= -RESULT_SHIFT;
#else
    acc_s32 >>= RESULT_SHIFT;
#endif

    // Saturating conversion to the output type (name res0 is required by STORE_VECTOR_SELECT).
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(acc_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif

    // Leftover-aware vector store.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
9085
#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)

/** Quantizes down an int32 tensor to OUTPUT_DATA_TYPE using fixed-point arithmetic:
 * the value (plus optional bias) is rescaled via the ASYMM fixed-point multiplier
 * helpers, offset by RESULT_OFFSET_AFTER_SHIFT, saturate-converted and optionally
 * clamped to [MIN_BOUND, MAX_BOUND].
 *
 * Compile-time options: ADD_BIAS, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT,
 * RESULT_OFFSET_AFTER_SHIFT, MIN_BOUND, MAX_BOUND, VEC_SIZE, VEC_SIZE_LEFTOVER,
 * OUTPUT_DATA_TYPE.
 *
 * @param src    Source int32 tensor
 * @param biases (ADD_BIAS only) per-channel int32 bias vector
 * @param dst    Destination tensor (OUTPUT_DATA_TYPE, assumed 1-byte elements)
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif
                                                             TENSOR3D_DECLARATION(dst))
{
    // Shift x back so the leftover (partial) vector is re-processed as a full one;
    // the guarded store at the bottom prevents out-of-bounds writes.
    const int xi = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int yi = get_global_id(1);
    const int zi = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + xi * sizeof(int) + yi * src_stride_y + zi * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xi + yi * dst_stride_y + zi * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc_s32 = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Accumulate the per-channel bias for this column range.
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + xi * sizeof(int);
    acc_s32 += VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
#endif

// Fixed-point rescale; the branch is selected at compile time on the shift sign.
#if RESULT_SHIFT < 0
    acc_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_s32, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else
    acc_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_s32, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif

    // Add the output quantization offset after the rescale.
    acc_s32 += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;

    // Saturating conversion to the output type (name res0 is required by STORE_VECTOR_SELECT).
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(acc_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif

    // Leftover-aware vector store.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
9139
#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)

/** Quantizes down an int32 tensor to QSYMM16 (short) using fixed-point arithmetic.
 * Symmetric variant: no output offset is added after the rescale.
 *
 * Compile-time options: ADD_BIAS, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT,
 * MIN_BOUND, MAX_BOUND, VEC_SIZE, VEC_SIZE_LEFTOVER.
 *
 * @param src    Source int32 tensor
 * @param biases (ADD_BIAS only) per-channel int32 bias vector
 * @param dst    Destination tensor (short, 2-byte elements)
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                                     VECTOR_DECLARATION(biases),
#endif
                                                                     TENSOR3D_DECLARATION(dst))
{
    // Shift x back so the leftover (partial) vector is re-processed as a full one;
    // the guarded store at the bottom prevents out-of-bounds writes.
    const int xi = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int yi = get_global_id(1);
    const int zi = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + xi * sizeof(int) + yi * src_stride_y + zi * src_stride_z;
    // Unlike the 8-bit variants, dst elements are 2 bytes wide here.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + xi * sizeof(short) + yi * dst_stride_y + zi * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc_s32 = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Accumulate the per-channel bias for this column range.
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + xi * sizeof(int);
    acc_s32 += VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
#endif

// Fixed-point rescale; the branch is selected at compile time on the shift sign.
#if RESULT_SHIFT < 0
    acc_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_s32, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else
    acc_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_s32, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif

    // Saturating conversion to short (name res0 is required by STORE_VECTOR_SELECT).
    VEC_DATA_TYPE(short, VEC_SIZE)
    res0 = CONVERT_SAT(acc_s32, VEC_DATA_TYPE(short, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
#endif

    // Leftover-aware vector store.
    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif
9190
#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)

/** Quantizes down an int32 tensor to OUTPUT_DATA_TYPE using a floating-point scale:
 * out = saturate_cast(round((in [+ bias]) * REAL_MULTIPLIER + OUTPUT_OFFSET)),
 * optionally clamped to [MIN_BOUND, MAX_BOUND].
 *
 * Compile-time options: ADD_BIAS, DST_HEIGHT (selects a 4D dst declaration),
 * REAL_MULTIPLIER, OUTPUT_OFFSET, MIN_BOUND, MAX_BOUND, VEC_SIZE,
 * VEC_SIZE_LEFTOVER, OUTPUT_DATA_TYPE.
 *
 * @param src    Source int32 tensor
 * @param biases (ADD_BIAS only) per-channel int32 bias vector
 * @param dst    Destination tensor (OUTPUT_DATA_TYPE, assumed 1-byte elements)
 */
__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                        VECTOR_DECLARATION(biases),
#endif
#if defined(DST_HEIGHT)
                                                        TENSOR4D_DECLARATION(dst))
#else
                                                        TENSOR3D_DECLARATION(dst))
#endif
{
    // Clamp x so the leftover (partial) vector is handled by re-processing from the
    // last position where a full VEC_SIZE vector fits; guarded in the store below.
    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    int y = get_global_id(1);
    int z = get_global_id(2);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);

#if defined(ADD_BIAS)
    // Accumulate the per-channel bias for this column range.
    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
    // Note: no cast needed — biases_values already has the accumulator's type
    // (kept consistent with the sibling quantize-down kernels).
    input_values += biases_values;
#endif

    // Scale in floating point, add the output offset, and round to the nearest integer
    // before the saturating conversion below.
    VEC_DATA_TYPE(float, VEC_SIZE)
    input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
    input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

    // Saturating conversion to the output type (name res0 is required by STORE_VECTOR_SELECT).
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif

    // Leftover-aware vector store.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
9242#endif  )"