1R"(
2
3
4
5
6#ifndef ARM_COMPUTE_HELPER_H
7#define ARM_COMPUTE_HELPER_H
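// Helper definitions shared by the embedded OpenCL GEMM kernels: block store/load
// helpers, boundary-aware stores, tensor argument declarations and descriptors,
// and element-wise activation ops.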
8
9
10
11
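// STORE_ROW_n: store rows 0..n-1 of the block BASENAME (variables BASENAME##0, BASENAME##1, ...)
// as N0-wide vectors, one VSTORE per row, each row offset by STRIDE_Y plus its per-row Z offset.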
12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
13    VSTORE(N0)                                                 \
14    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
15
16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
17    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
18    VSTORE(N0)                                                 \
19    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
20
21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
22    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
23    VSTORE(N0)                                                 \
24    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
25
26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
27    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
28    VSTORE(N0)                                                 \
29    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
30
31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
32    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
33    VSTORE(N0)                                                 \
34    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
35
36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
37    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
38    VSTORE(N0)                                                 \
39    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
40
41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
42    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
43    VSTORE(N0)                                                 \
44    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
45
46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
47    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
48    VSTORE(N0)                                                 \
49    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
50
51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
52    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
53    VSTORE(N0)                                                 \
54    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
55
56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
57    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
58    VSTORE(N0)                                                  \
59    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
60
61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
62    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
63    VSTORE(N0)                                                  \
64    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
65
66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
67    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
68    VSTORE(N0)                                                  \
69    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
70
71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
72    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
73    VSTORE(N0)                                                  \
74    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
75
76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
77    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
78    VSTORE(N0)                                                  \
79    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
80
81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
82    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
83    VSTORE(N0)                                                  \
84    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
85
86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
87    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
88    VSTORE(N0)                                                  \
89    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90
91
92
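// CONVERT_STORE_ROW_n: like STORE_ROW_n, but each row is saturate-converted to DATA_TYPE before the store.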
93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
94    VSTORE(N0)                                                         \
95    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
96
97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
98    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
99    VSTORE(N0)                                                         \
100    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
101
102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
103    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
104    VSTORE(N0)                                                         \
105    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
106
107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
108    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
109    VSTORE(N0)                                                         \
110    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
111
112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
113    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
114    VSTORE(N0)                                                         \
115    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
116
117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
118    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
119    VSTORE(N0)                                                         \
120    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
121
122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
123    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
124    VSTORE(N0)                                                         \
125    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
126
127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
128    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
129    VSTORE(N0)                                                         \
130    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
131
132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
133    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
134    VSTORE(N0)                                                         \
135    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136
137#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
138    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
139    VSTORE(N0)                                                     \
140    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141
142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
143    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
144    VSTORE(N0)                                                          \
145    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
146
147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
148    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
149    VSTORE(N0)                                                          \
150    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
151
152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
153    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
154    VSTORE(N0)                                                          \
155    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156
157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
158    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
159    VSTORE(N0)                                                          \
160    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
161
162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
163    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
164    VSTORE(N0)                                                          \
165    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
166
167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
168    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
169    VSTORE(N0)                                                          \
170    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171
172
173
174
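// STORE_BLOCK(M0, N0, ...): store an M0 x N0 block by dispatching to STORE_ROW_M0.
// Illustrative expansion (the names c, dst_addr and zout are only examples):
//   STORE_BLOCK(2, 4, float, c, dst_addr, dst_stride_y, zout)
//   -> vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
//      vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));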
175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177
178
179
180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182
183
184
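// STORE_ROW_PARTIAL_n: like STORE_ROW_n, but only the first STORE_N0 of the N0 elements
// in each row are written (via VSTORE_PARTIAL).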
185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
186    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
187    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
188
189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
190    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
191    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
192    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
193
194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
195    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
196    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
197    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
198
199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
200    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
201    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
202    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
203
204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
205    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
206    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
207    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
208
209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
210    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
211    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
212    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
213
214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
215    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
216    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
217    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
218
219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
220    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
221    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
222    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
223
224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
225    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
226    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
227    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
228
229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
230    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
231    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
232    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
233
234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
235    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
236    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
237    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
238
239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
240    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
241    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
242    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
243
244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
245    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
246    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
247    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
248
249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
250    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
251    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
252    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
253
254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
255    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
256    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
257    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
258
259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
260    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
261    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
262    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263
264
265
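// STORE_BLOCK_PARTIAL and the *_IN_X / *_IN_Y / *_IN_X_AND_Y variants store an M0 x N0 block
// with boundary handling: at run time PARTIAL_COND_X / PARTIAL_COND_Y select between the full
// store and a store of only PARTIAL_STORE_N0 columns and/or PARTIAL_STORE_M0 rows.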
266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268
269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
270    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
271    {                                                                                                                                                     \
272        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
273    }                                                                                                                                                     \
274    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
275    {                                                                                                                                                     \
276        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
277    }                                                                                                                                                     \
278    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
279    {                                                                                                                                                     \
280        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
281    }                                                                                                                                                     \
282    else                                                                                                                                                  \
283    {                                                                                                                                                     \
284        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
285    }
286
287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
288    if(!(PARTIAL_COND_X))                                                                                         \
289    {                                                                                                             \
290        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
291    }                                                                                                             \
292    else                                                                                                          \
293    {                                                                                                             \
294        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
295    }
296
297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
298    if(!(PARTIAL_COND_Y))                                                                                         \
299    {                                                                                                             \
300        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
301    }                                                                                                             \
302    else                                                                                                          \
303    {                                                                                                             \
304        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
305    }
306
307
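// Compile-time selection of STORE_BLOCK_BOUNDARY_AWARE: when PARTIAL_STORE_M0 / PARTIAL_STORE_N0
// are zero the plain STORE_BLOCK is used and the run-time checks compile away; otherwise the
// appropriate partial variant is picked.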
308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
309
310
311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
312
313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
314    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
315
316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
317
318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
319    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
320
321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
322
323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
324    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
325
326#else
327
328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
329    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
330
331#endif
332
333#endif
334
335
336#if defined(PARTIAL_STORE_M0)
337
338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
339    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
340#else
341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
342    ((uint)(y * M0))
343#endif
344
345
346
347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
348    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349
350
351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
352#pragma OPENCL EXTENSION cl_khr_fp16 : enable
353#endif
354
355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
357#endif
358
359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
361#endif
362
363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
364#pragma OPENCL EXTENSION cl_arm_printf : enable
365#endif
366
367#define GPU_ARCH_MIDGARD 0x100
368#define GPU_ARCH_BIFROST 0x200
369#define GPU_ARCH_VALHALL 0x300
370
371
372#define CONCAT(a, b) a##b
373
374
375#define EXPAND(x) x
376
377
378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
379
380
381#define REV1(x) ((x))
382#define REV2(x) ((x).s10)
383#define REV3(x) ((x).s210)
384#define REV4(x) ((x).s3210)
385#define REV8(x) ((x).s76543210)
386#define REV16(x) ((x).sFEDCBA9876543210)
387
388
389
390#define REVERSE_STR(x, s) REV##s((x))
391#define REVERSE(x, s) REVERSE_STR(x, s)
392
393
394
395#define ROT1_0(x) ((x))
396#define ROT1_1(x) ((x))
397
398#define ROT2_0(x) ((x))
399#define ROT2_1(x) ((x).s10)
400#define ROT2_2(x) ((x))
401
402#define ROT3_0(x) ((x))
403#define ROT3_1(x) ((x).s201)
404#define ROT3_2(x) ((x).s120)
405#define ROT3_3(x) ((x))
406
407#define ROT4_0(x) ((x))
408#define ROT4_1(x) ((x).s3012)
409#define ROT4_2(x) ((x).s2301)
410#define ROT4_3(x) ((x).s1230)
411#define ROT4_4(x) ((x))
412
413#define ROT8_0(x) ((x))
414#define ROT8_1(x) ((x).s70123456)
415#define ROT8_2(x) ((x).s67012345)
416#define ROT8_3(x) ((x).s56701234)
417#define ROT8_4(x) ((x).s45670123)
418#define ROT8_5(x) ((x).s34567012)
419#define ROT8_6(x) ((x).s23456701)
420#define ROT8_7(x) ((x).s12345670)
421#define ROT8_8(x) ((x))
422
423#define ROT16_0(x) ((x))
424#define ROT16_1(x) ((x).sF0123456789ABCDE)
425#define ROT16_2(x) ((x).sEF0123456789ABCD)
426#define ROT16_3(x) ((x).sDEF0123456789ABC)
427#define ROT16_4(x) ((x).sCDEF0123456789AB)
428#define ROT16_5(x) ((x).sBCDEF0123456789A)
429#define ROT16_6(x) ((x).sABCDEF0123456789)
430#define ROT16_7(x) ((x).s9ABCDEF012345678)
431#define ROT16_8(x) ((x).s89ABCDEF01234567)
432#define ROT16_9(x) ((x).s789ABCDEF0123456)
433#define ROT16_10(x) ((x).s6789ABCDEF012345)
434#define ROT16_11(x) ((x).s56789ABCDEF01234)
435#define ROT16_12(x) ((x).s456789ABCDEF0123)
436#define ROT16_13(x) ((x).s3456789ABCDEF012)
437#define ROT16_14(x) ((x).s23456789ABCDEF01)
438#define ROT16_15(x) ((x).s123456789ABCDEF0)
439#define ROT16_16(x) ((x))
440
441
442
443#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
444#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445
446
447
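// V_OFFS / VEC_OFFS: build a vector literal of consecutive offsets (0, 1, ..., s-1) of the given type.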
448#define V_OFFS1(dt) (dt##1)(0)
449#define V_OFFS2(dt) (dt##2)(0, 1)
450#define V_OFFS3(dt) (dt##3)(0, 1, 2)
451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
454
455
456
457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
459
460
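// VLOAD(size) maps to the vloadN builtins; VLOAD_PARTIAL(size, load_size) dispatches to
// vload_partial_<size>_<load_size>, where a load_size of 0 or larger than size resolves to NO_LOAD.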
461#define VLOAD_STR(size) vload##size
462#define VLOAD(size) VLOAD_STR(size)
463
464
465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
467
468#define NO_LOAD(data, offs, ptr) \
469    {                            \
470    }
471
472
473#define vload_partial_1_0 NO_LOAD
474#define vload_partial_1_1 vload1
475#define vload_partial_1_2 NO_LOAD
476#define vload_partial_1_3 NO_LOAD
477#define vload_partial_1_4 NO_LOAD
478#define vload_partial_1_5 NO_LOAD
479#define vload_partial_1_6 NO_LOAD
480#define vload_partial_1_7 NO_LOAD
481#define vload_partial_1_8 NO_LOAD
482#define vload_partial_1_9 NO_LOAD
483#define vload_partial_1_10 NO_LOAD
484#define vload_partial_1_11 NO_LOAD
485#define vload_partial_1_12 NO_LOAD
486#define vload_partial_1_13 NO_LOAD
487#define vload_partial_1_14 NO_LOAD
488#define vload_partial_1_15 NO_LOAD
489#define vload_partial_1_16 NO_LOAD
490
491#define vload_partial_2_0 NO_LOAD
492#define vload_partial_2_1 vload_partial_1
493#define vload_partial_2_2 vload_partial_2
494#define vload_partial_2_3 NO_LOAD
495#define vload_partial_2_4 NO_LOAD
496#define vload_partial_2_5 NO_LOAD
497#define vload_partial_2_6 NO_LOAD
498#define vload_partial_2_7 NO_LOAD
499#define vload_partial_2_8 NO_LOAD
500#define vload_partial_2_9 NO_LOAD
501#define vload_partial_2_10 NO_LOAD
502#define vload_partial_2_11 NO_LOAD
503#define vload_partial_2_12 NO_LOAD
504#define vload_partial_2_13 NO_LOAD
505#define vload_partial_2_14 NO_LOAD
506#define vload_partial_2_15 NO_LOAD
507#define vload_partial_2_16 NO_LOAD
508
509#define vload_partial_3_0 NO_LOAD
510#define vload_partial_3_1 vload_partial_1
511#define vload_partial_3_2 vload_partial_2
512#define vload_partial_3_3 vload_partial_3
513#define vload_partial_3_4 NO_LOAD
514#define vload_partial_3_5 NO_LOAD
515#define vload_partial_3_6 NO_LOAD
516#define vload_partial_3_7 NO_LOAD
517#define vload_partial_3_8 NO_LOAD
518#define vload_partial_3_9 NO_LOAD
519#define vload_partial_3_10 NO_LOAD
520#define vload_partial_3_11 NO_LOAD
521#define vload_partial_3_12 NO_LOAD
522#define vload_partial_3_13 NO_LOAD
523#define vload_partial_3_14 NO_LOAD
524#define vload_partial_3_15 NO_LOAD
525#define vload_partial_3_16 NO_LOAD
526
527#define vload_partial_4_0 NO_LOAD
528#define vload_partial_4_1 vload_partial_1
529#define vload_partial_4_2 vload_partial_2
530#define vload_partial_4_3 vload_partial_3
531#define vload_partial_4_4 vload_partial_4
532#define vload_partial_4_5 NO_LOAD
533#define vload_partial_4_6 NO_LOAD
534#define vload_partial_4_7 NO_LOAD
535#define vload_partial_4_8 NO_LOAD
536#define vload_partial_4_9 NO_LOAD
537#define vload_partial_4_10 NO_LOAD
538#define vload_partial_4_11 NO_LOAD
539#define vload_partial_4_12 NO_LOAD
540#define vload_partial_4_13 NO_LOAD
541#define vload_partial_4_14 NO_LOAD
542#define vload_partial_4_15 NO_LOAD
543#define vload_partial_4_16 NO_LOAD
544
545#define vload_partial_8_0 NO_LOAD
546#define vload_partial_8_1 vload_partial_1
547#define vload_partial_8_2 vload_partial_2
548#define vload_partial_8_3 vload_partial_3
549#define vload_partial_8_4 vload_partial_4
550#define vload_partial_8_5 vload_partial_5
551#define vload_partial_8_6 vload_partial_6
552#define vload_partial_8_7 vload_partial_7
553#define vload_partial_8_8 vload_partial_8
554#define vload_partial_8_9 NO_LOAD
555#define vload_partial_8_10 NO_LOAD
556#define vload_partial_8_11 NO_LOAD
557#define vload_partial_8_12 NO_LOAD
558#define vload_partial_8_13 NO_LOAD
559#define vload_partial_8_14 NO_LOAD
560#define vload_partial_8_15 NO_LOAD
561#define vload_partial_8_16 NO_LOAD
562
563#define vload_partial_16_0 NO_LOAD
564#define vload_partial_16_1 vload_partial_1
565#define vload_partial_16_2 vload_partial_2
566#define vload_partial_16_3 vload_partial_3
567#define vload_partial_16_4 vload_partial_4
568#define vload_partial_16_5 vload_partial_5
569#define vload_partial_16_6 vload_partial_6
570#define vload_partial_16_7 vload_partial_7
571#define vload_partial_16_8 vload_partial_8
572#define vload_partial_16_9 vload_partial_9
573#define vload_partial_16_10 vload_partial_10
574#define vload_partial_16_11 vload_partial_11
575#define vload_partial_16_12 vload_partial_12
576#define vload_partial_16_13 vload_partial_13
577#define vload_partial_16_14 vload_partial_14
578#define vload_partial_16_15 vload_partial_15
579#define vload_partial_16_16 vload_partial_16
580
581
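// vload_partial_n implementations: sizes 1-4, 8 and 16 map directly to vloadN into the matching
// sub-swizzle of DATA; the remaining sizes are composed from a 4- or 8-element load plus a smaller
// tail load at PTR + 4 or PTR + 8.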
582#define vload_partial_1(DATA, OFFSET, PTR) \
583    DATA.s0 = vload1(OFFSET, PTR);
584
585#define vload_partial_2(DATA, OFFSET, PTR) \
586    DATA.s01 = vload2(OFFSET, PTR);
587
588#define vload_partial_3(DATA, OFFSET, PTR) \
589    DATA.s012 = vload3(OFFSET, PTR);
590
591#define vload_partial_4(DATA, OFFSET, PTR) \
592    DATA.s0123 = vload4(OFFSET, PTR);
593
594#define vload_partial_5(DATA, OFFSET, PTR)    \
595    vload_partial_4(DATA.s0123, OFFSET, PTR); \
596    DATA.s4 = vload1(OFFSET, PTR + 4);
597
598#define vload_partial_6(DATA, OFFSET, PTR)    \
599    vload_partial_4(DATA.s0123, OFFSET, PTR); \
600    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
601
602#define vload_partial_7(DATA, OFFSET, PTR)    \
603    vload_partial_4(DATA.s0123, OFFSET, PTR); \
604    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
605
606#define vload_partial_8(DATA, OFFSET, PTR) \
607    DATA.s01234567 = vload8(OFFSET, PTR);
608
609#define vload_partial_9(DATA, OFFSET, PTR)        \
610    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
611    DATA.s8 = vload1(OFFSET, PTR + 8);
612
613#define vload_partial_10(DATA, OFFSET, PTR)       \
614    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
615    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
616
617#define vload_partial_11(DATA, OFFSET, PTR)       \
618    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
619    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
620
621#define vload_partial_12(DATA, OFFSET, PTR)       \
622    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
623    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
624
625#define vload_partial_13(DATA, OFFSET, PTR)       \
626    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
627    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
628
629#define vload_partial_14(DATA, OFFSET, PTR)       \
630    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
631    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
632
633#define vload_partial_15(DATA, OFFSET, PTR)       \
634    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
635    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
636
637#define vload_partial_16(DATA, OFFSET, PTR) \
638    DATA = vload16(OFFSET, PTR);
639
640
641
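// OpenCL image2d access: PIXEL_UNIT converts a vector size (4/8/16 elements) into 1/2/4 RGBA texels,
// and READ_IMAGE2D / WRITE_IMAGE2D expand to that many read_image*/write_image* calls.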
642#define PIXEL_UNIT4 1
643#define PIXEL_UNIT8 2
644#define PIXEL_UNIT16 4
645
646
647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
649
650
651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
654
655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
659#endif
660
661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
662#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
664
665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
669#endif
670
671
672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
674
675
676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678
679#define VSTORE_STR(size) vstore##size
680#define VSTORE(size) VSTORE_STR(size)
681
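// Scalar fall-backs: <type>1 aliases the scalar type, and vload1/vstore1 emulate the missing
// 1-element vload/vstore builtins with a plain pointer access.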
682#define float1 float
683#define half1 half
684#define char1 char
685#define uchar1 uchar
686#define short1 short
687#define ushort1 ushort
688#define int1 int
689#define uint1 uint
690#define long1 long
691#define ulong1 ulong
692#define double1 double
693
694#define vload1(OFFSET, PTR) *(OFFSET + PTR)
695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
696
697
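// VSTORE_PARTIAL(size, store_size) dispatches to vstore_partial_<size>_<store_size>; a store_size
// of 0 or larger than size resolves to NO_STORE. Illustrative expansion (acc and out_ptr are only
// example names):
//   VSTORE_PARTIAL(4, 3)(acc, 0, out_ptr) -> vstore_partial_3(acc, 0, out_ptr)
//                                         -> vstore3(acc.s012, 0, out_ptr);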
698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
700
701#define NO_STORE(data, offs, ptr) \
702    {                             \
703    }
704
705
706#define vstore_partial_1_0 NO_STORE
707#define vstore_partial_1_1 vstore1
708#define vstore_partial_1_2 NO_STORE
709#define vstore_partial_1_3 NO_STORE
710#define vstore_partial_1_4 NO_STORE
711#define vstore_partial_1_5 NO_STORE
712#define vstore_partial_1_6 NO_STORE
713#define vstore_partial_1_7 NO_STORE
714#define vstore_partial_1_8 NO_STORE
715#define vstore_partial_1_9 NO_STORE
716#define vstore_partial_1_10 NO_STORE
717#define vstore_partial_1_11 NO_STORE
718#define vstore_partial_1_12 NO_STORE
719#define vstore_partial_1_13 NO_STORE
720#define vstore_partial_1_14 NO_STORE
721#define vstore_partial_1_15 NO_STORE
722#define vstore_partial_1_16 NO_STORE
723
724#define vstore_partial_2_0 NO_STORE
725#define vstore_partial_2_1 vstore_partial_1
726#define vstore_partial_2_2 vstore_partial_2
727#define vstore_partial_2_3 NO_STORE
728#define vstore_partial_2_4 NO_STORE
729#define vstore_partial_2_5 NO_STORE
730#define vstore_partial_2_6 NO_STORE
731#define vstore_partial_2_7 NO_STORE
732#define vstore_partial_2_8 NO_STORE
733#define vstore_partial_2_9 NO_STORE
734#define vstore_partial_2_10 NO_STORE
735#define vstore_partial_2_11 NO_STORE
736#define vstore_partial_2_12 NO_STORE
737#define vstore_partial_2_13 NO_STORE
738#define vstore_partial_2_14 NO_STORE
739#define vstore_partial_2_15 NO_STORE
740#define vstore_partial_2_16 NO_STORE
741
742#define vstore_partial_3_0 NO_STORE
743#define vstore_partial_3_1 vstore_partial_1
744#define vstore_partial_3_2 vstore_partial_2
745#define vstore_partial_3_3 vstore_partial_3
746#define vstore_partial_3_4 NO_STORE
747#define vstore_partial_3_5 NO_STORE
748#define vstore_partial_3_6 NO_STORE
749#define vstore_partial_3_7 NO_STORE
750#define vstore_partial_3_8 NO_STORE
751#define vstore_partial_3_9 NO_STORE
752#define vstore_partial_3_10 NO_STORE
753#define vstore_partial_3_11 NO_STORE
754#define vstore_partial_3_12 NO_STORE
755#define vstore_partial_3_13 NO_STORE
756#define vstore_partial_3_14 NO_STORE
757#define vstore_partial_3_15 NO_STORE
758#define vstore_partial_3_16 NO_STORE
759
760#define vstore_partial_4_0 NO_STORE
761#define vstore_partial_4_1 vstore_partial_1
762#define vstore_partial_4_2 vstore_partial_2
763#define vstore_partial_4_3 vstore_partial_3
764#define vstore_partial_4_4 vstore_partial_4
765#define vstore_partial_4_5 NO_STORE
766#define vstore_partial_4_6 NO_STORE
767#define vstore_partial_4_7 NO_STORE
768#define vstore_partial_4_8 NO_STORE
769#define vstore_partial_4_9 NO_STORE
770#define vstore_partial_4_10 NO_STORE
771#define vstore_partial_4_11 NO_STORE
772#define vstore_partial_4_12 NO_STORE
773#define vstore_partial_4_13 NO_STORE
774#define vstore_partial_4_14 NO_STORE
775#define vstore_partial_4_15 NO_STORE
776#define vstore_partial_4_16 NO_STORE
777
778#define vstore_partial_8_0 NO_STORE
779#define vstore_partial_8_1 vstore_partial_1
780#define vstore_partial_8_2 vstore_partial_2
781#define vstore_partial_8_3 vstore_partial_3
782#define vstore_partial_8_4 vstore_partial_4
783#define vstore_partial_8_5 vstore_partial_5
784#define vstore_partial_8_6 vstore_partial_6
785#define vstore_partial_8_7 vstore_partial_7
786#define vstore_partial_8_8 vstore_partial_8
787#define vstore_partial_8_9 NO_STORE
788#define vstore_partial_8_10 NO_STORE
789#define vstore_partial_8_11 NO_STORE
790#define vstore_partial_8_12 NO_STORE
791#define vstore_partial_8_13 NO_STORE
792#define vstore_partial_8_14 NO_STORE
793#define vstore_partial_8_15 NO_STORE
794#define vstore_partial_8_16 NO_STORE
795
796#define vstore_partial_16_0 NO_STORE
797#define vstore_partial_16_1 vstore_partial_1
798#define vstore_partial_16_2 vstore_partial_2
799#define vstore_partial_16_3 vstore_partial_3
800#define vstore_partial_16_4 vstore_partial_4
801#define vstore_partial_16_5 vstore_partial_5
802#define vstore_partial_16_6 vstore_partial_6
803#define vstore_partial_16_7 vstore_partial_7
804#define vstore_partial_16_8 vstore_partial_8
805#define vstore_partial_16_9 vstore_partial_9
806#define vstore_partial_16_10 vstore_partial_10
807#define vstore_partial_16_11 vstore_partial_11
808#define vstore_partial_16_12 vstore_partial_12
809#define vstore_partial_16_13 vstore_partial_13
810#define vstore_partial_16_14 vstore_partial_14
811#define vstore_partial_16_15 vstore_partial_15
812#define vstore_partial_16_16 vstore_partial_16
813
814
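// vstore_partial_n implementations mirror the vload_partial_n ones above: direct vstoreN for
// sizes 1-4, 8 and 16, and a 4/8-element store plus a smaller tail store for the rest.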
815#define vstore_partial_1(DATA, OFFSET, PTR) \
816    vstore1(DATA.s0, OFFSET, PTR);
817
818#define vstore_partial_2(DATA, OFFSET, PTR) \
819    vstore2(DATA.s01, OFFSET, PTR);
820
821#define vstore_partial_3(DATA, OFFSET, PTR) \
822    vstore3(DATA.s012, OFFSET, PTR);
823
824#define vstore_partial_4(DATA, OFFSET, PTR) \
825    vstore4(DATA.s0123, OFFSET, PTR);
826
827#define vstore_partial_5(DATA, OFFSET, PTR)    \
828    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
829    vstore1(DATA.s4, OFFSET, PTR + 4);
830
831#define vstore_partial_6(DATA, OFFSET, PTR)    \
832    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
833    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
834
835#define vstore_partial_7(DATA, OFFSET, PTR)    \
836    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
837    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
838
839#define vstore_partial_8(DATA, OFFSET, PTR) \
840    vstore8(DATA.s01234567, OFFSET, PTR);
841
842#define vstore_partial_9(DATA, OFFSET, PTR)        \
843    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
844    vstore1(DATA.s8, OFFSET, PTR + 8);
845
846#define vstore_partial_10(DATA, OFFSET, PTR)       \
847    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
848    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
849
850#define vstore_partial_11(DATA, OFFSET, PTR)       \
851    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
852    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
853
854#define vstore_partial_12(DATA, OFFSET, PTR)       \
855    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
856    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
857
858#define vstore_partial_13(DATA, OFFSET, PTR)       \
859    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
860    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
861
862#define vstore_partial_14(DATA, OFFSET, PTR)       \
863    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
864    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
865
866#define vstore_partial_15(DATA, OFFSET, PTR)       \
867    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
868    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
869
870#define vstore_partial_16(DATA, OFFSET, PTR) \
871    vstore16(DATA, OFFSET, PTR);
872
873
874
875
876
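// Saturated conversion aliases: float and half destinations have no *_sat converts in OpenCL,
// so these alias non-saturating converts; the *1 names map the 1-element variants to the scalar builtins.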
877#define convert_float_sat convert_float
878#define convert_float1_sat convert_float
879#define convert_float2_sat convert_float2
880#define convert_float3_sat convert_float3
881#define convert_float4_sat convert_float4
882#define convert_float8_sat convert_float8
883#define convert_float16_sat convert_float16
884#define convert_half_sat convert_float
885#define convert_half1_sat convert_half
886#define convert_half2_sat convert_half2
887#define convert_half3_sat convert_half3
888#define convert_half4_sat convert_half4
889#define convert_half8_sat convert_half8
890#define convert_half16_sat convert_half16
891
892#define convert_float1 convert_float
893#define convert_half1 convert_half
894#define convert_char1 convert_char
895#define convert_uchar1 convert_uchar
896#define convert_short1 convert_short
897#define convert_ushort1 convert_ushort
898#define convert_int1 convert_int
899#define convert_uint1 convert_uint
900#define convert_long1 convert_long
901#define convert_ulong1 convert_ulong
902#define convert_double1 convert_double
903
904#define convert_char1_sat convert_char_sat
905#define convert_uchar1_sat convert_uchar_sat
906#define convert_uchar2_sat convert_uchar2_sat
907#define convert_uchar3_sat convert_uchar3_sat
908#define convert_uchar4_sat convert_uchar4_sat
909#define convert_uchar8_sat convert_uchar8_sat
910#define convert_uchar16_sat convert_uchar16_sat
911#define convert_short1_sat convert_short_sat
912#define convert_ushort1_sat convert_ushort_sat
913#define convert_int1_sat convert_int_sat
914#define convert_uint1_sat convert_uint_sat
915#define convert_long1_sat convert_long_sat
916#define convert_ulong1_sat convert_ulong_sat
917#define convert_double1_sat convert_double_sat
918
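// VEC_DATA_TYPE(type, size) builds the vector type name, e.g. VEC_DATA_TYPE(float, 4) -> float4.
// CONVERT, CONVERT_SAT and CONVERT_SAT_ROUND wrap the convert_<type>, convert_<type>_sat and
// convert_<type>_sat_<round> builtins.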
919#define VEC_DATA_TYPE_STR(type, size) type##size
920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
921
922#define CONVERT_STR(x, type) (convert_##type((x)))
923#define CONVERT(x, type) CONVERT_STR(x, type)
924
925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
927
928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
930
931#define select_vec_dt_uchar(size) uchar##size
932#define select_vec_dt_char(size) char##size
933#define select_vec_dt_ushort(size) ushort##size
934#define select_vec_dt_short(size) short##size
935#define select_vec_dt_half(size) short##size
936#define select_vec_dt_uint(size) uint##size
937#define select_vec_dt_int(size) int##size
938#define select_vec_dt_float(size) int##size
939#define select_vec_dt_ulong(size) ulong##size
940#define select_vec_dt_long(size) long##size
941
942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
945
946#define signed_int_vec_dt_uchar(size) char##size
947#define signed_int_vec_dt_char(size) char##size
948#define signed_int_vec_dt_ushort(size) short##size
949#define signed_int_vec_dt_short(size) short##size
950#define signed_int_vec_dt_half(size) short##size
951#define signed_int_vec_dt_uint(size) int##size
952#define signed_int_vec_dt_int(size) int##size
953#define signed_int_vec_dt_float(size) int##size
954#define signed_int_vec_dt_ulong(size) long##size
955#define signed_int_vec_dt_long(size) long##size
956
957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
960
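// Horizontal reductions over a vector: SUM_REDUCE, PROD_REDUCE and MAX_REDUCE recursively split
// the vector and combine the parts.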
961#define sum_reduce_1(x) (x)
962#define sum_reduce_2(x) ((x).s0) + ((x).s1)
963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
967
968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970
971#define prod_reduce_1(x) (x)
972#define prod_reduce_2(x) ((x).s0) * ((x).s1)
973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
977
978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980
981#define max_reduce_1(x) (x)
982#define max_reduce_2(x) max(((x).s0), ((x).s1))
983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
986#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
987
988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
990
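// Kernel-argument declaration helpers: each *_DECLARATION(name) expands to the buffer pointer,
// the per-dimension stride/step pairs and the offset of the first element, all prefixed with name.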
991#define VECTOR_DECLARATION(name)     \
992    __global uchar *name##_ptr,      \
993    uint        name##_stride_x, \
994    uint        name##_step_x,   \
995    uint        name##_offset_first_element_in_bytes
996
997#define IMAGE_DECLARATION(name)      \
998    __global uchar *name##_ptr,      \
999    uint        name##_stride_x, \
1000    uint        name##_step_x,   \
1001    uint        name##_stride_y, \
1002    uint        name##_step_y,   \
1003    uint        name##_offset_first_element_in_bytes
1004
1005#define TENSOR3D_DECLARATION(name)   \
1006    __global uchar *name##_ptr,      \
1007    uint        name##_stride_x, \
1008    uint        name##_step_x,   \
1009    uint        name##_stride_y, \
1010    uint        name##_step_y,   \
1011    uint        name##_stride_z, \
1012    uint        name##_step_z,   \
1013    uint        name##_offset_first_element_in_bytes
1014
1015#define TENSOR4D_DECLARATION(name)   \
1016    __global uchar *name##_ptr,      \
1017    uint        name##_stride_x, \
1018    uint        name##_step_x,   \
1019    uint        name##_stride_y, \
1020    uint        name##_step_y,   \
1021    uint        name##_stride_z, \
1022    uint        name##_step_z,   \
1023    uint        name##_stride_w, \
1024    uint        name##_step_w,   \
1025    uint        name##_offset_first_element_in_bytes
1026
1027#define TENSOR5D_DECLARATION(name)   \
1028    __global uchar *name##_ptr,      \
1029    uint        name##_stride_x, \
1030    uint        name##_step_x,   \
1031    uint        name##_stride_y, \
1032    uint        name##_step_y,   \
1033    uint        name##_stride_z, \
1034    uint        name##_step_z,   \
1035    uint        name##_stride_w, \
1036    uint        name##_step_w,   \
1037    uint        name##_stride_v, \
1038    uint        name##_step_v,   \
1039    uint        name##_offset_first_element_in_bytes
1040
1041#define CONVERT_TO_VECTOR_STRUCT(name) \
1042    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
1043
1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
1045    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
1046
1047#define CONVERT_TO_IMAGE_STRUCT(name) \
1048    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
1049
1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
1051    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
1052
1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
1054    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
1055
1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
1057    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
1058
1062#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
1063    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1064                                 name##_stride_z, name##_step_z)
1065
1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
1067    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
1068
1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
1070    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1071                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
1072
1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
1074    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
1075
1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
1077    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
1078                           name##_stride_z, name##_step_z)
1079
1080
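// Lightweight tensor descriptors (pointer plus byte strides) built from the arguments declared above.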
1081typedef struct Vector
1082{
1083    __global uchar *ptr;
1084    int             offset_first_element_in_bytes;
1085    int             stride_x;
1086} Vector;
1087
1088
1089typedef struct Image
1090{
1091    __global uchar *ptr;
1092    int             offset_first_element_in_bytes;
1093    int             stride_x;
1094    int             stride_y;
1095} Image;
1096
1097
1098typedef struct Tensor3D
1099{
1100    __global uchar *ptr;
1101    int             offset_first_element_in_bytes;
1102    int             stride_x;
1103    int             stride_y;
1104    int             stride_z;
1105} Tensor3D;
1106
1107
1108typedef struct Tensor4D
1109{
1110    __global uchar *ptr;
1111    int             offset_first_element_in_bytes;
1112    int             stride_x;
1113    int             stride_y;
1114    int             stride_z;
1115    int             stride_w;
1116} Tensor4D;
1117
1118
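// The update_*_workitem_ptr helpers build a descriptor and advance ptr to the first element handled
// by the current work-item, using the global IDs and the per-dimension steps; the *_no_update and
// *_offset helpers further down compute addresses without moving the base pointer.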
1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120{
1121    Vector vector =
1122    {
1123        .ptr                           = ptr,
1124        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125        .stride_x                      = stride_x,
1126    };
1127    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128    return vector;
1129}
1130
1131
1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133{
1134    Image img =
1135    {
1136        .ptr                           = ptr,
1137        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138        .stride_x                      = stride_x,
1139        .stride_y                      = stride_y
1140    };
1141    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142    return img;
1143}
1144
1145
1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147{
1148    Image img =
1149    {
1150        .ptr                           = ptr,
1151        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152        .stride_x                      = stride_x,
1153        .stride_y                      = stride_y
1154    };
1155    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156    return img;
1157}
1158
1159
1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161{
1162    Tensor3D tensor =
1163    {
1164        .ptr                           = ptr,
1165        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166        .stride_x                      = stride_x,
1167        .stride_y                      = stride_y,
1168        .stride_z                      = stride_z
1169    };
1170    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171    return tensor;
1172}
1173
1174
1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176{
1177    Tensor3D tensor =
1178    {
1179        .ptr                           = ptr,
1180        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181        .stride_x                      = stride_x,
1182        .stride_y                      = stride_y,
1183        .stride_z                      = stride_z
1184    };
1185    return tensor;
1186}
1187
1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189                                             uint step_w,
1190                                             uint mod_size)
1191{
1192    Tensor4D tensor =
1193    {
1194        .ptr                           = ptr,
1195        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196        .stride_x                      = stride_x,
1197        .stride_y                      = stride_y,
1198        .stride_z                      = stride_z,
1199        .stride_w                      = stride_w
1200    };
1201
1202    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203    return tensor;
1204}
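// Note: the 4D variant receives a single collapsed z/w dimension in get_global_id(2);
// mod_size splits it back into z = gid2 % mod_size and w = gid2 / mod_size.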
1205
1206
1207inline __global const uchar *vector_offset(const Vector *vec, int x)
1208{
1209    return vec->ptr + x * vec->stride_x;
1210}
1211
1212
1213inline __global uchar *offset(const Image *img, int x, int y)
1214{
1215    return img->ptr + x * img->stride_x + y * img->stride_y;
1216}
1217
1218
1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220{
1221    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222}
1223
1224
1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226{
1227    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228}
1229
1230
1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232{
1233    uint num_elements = width * height;
1234
1235    const uint z = index / num_elements;
1236
1237    index %= num_elements;
1238
1239    const uint y = index / width;
1240
1241    index %= width;
1242
1243    const uint x = index;
1244
1245    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246}
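// Illustration of tensor3D_index2ptr with width = 4, height = 3: a linear index of 19
// decomposes as z = 19 / 12 = 1, then 19 % 12 = 7 gives y = 7 / 4 = 1 and x = 7 % 4 = 3.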
1247
1248#endif
1249
1250#if GPU_ARCH == GPU_ARCH_BIFROST
1251#define MLA(a, b, c) (fma(c, b, a))
1252#else
1253#define MLA(a, b, c) ((b) * (c) + (a))
1254#endif
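// MLA(a, b, c) computes b * c + a; on Bifrost it is written as fma(c, b, a) so it maps
// to a fused multiply-add, otherwise it stays a plain multiply plus add.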
1255
1256
1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
1258
1259
1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
1261
1262
1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
1264
1265
1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
1267
1268
1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
1270
1271
1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
1273
1274
1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
1276
1277
1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
1279
1280
1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
1282
1283
1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
1285
1286
1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x)
1288
1289
1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x))
1291
1292
1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
1294
1295
1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
1297
1298
1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
1300
1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1302
1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
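// ACTIVATION goes through ACT_OP so its arguments are macro-expanded before the ##-paste.
// With illustrative arguments, ACTIVATION(lrelu, half, 8, x, 0.1f, 0.0f) resolves to
// lrelu_op(half, 8, x, 0.1f, 0.0f), i.e.
// (min(x, (half)0.0) * (half)0.1f) + max(x, (half)0.0).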
1304
1305#ifndef ARM_COMPUTE_HELPER_H
1306#define ARM_COMPUTE_HELPER_H
1307
1308
1309
1310
1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1312    VSTORE(N0)                                                 \
1313    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1314
1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1316    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1317    VSTORE(N0)                                                 \
1318    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1319
1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1321    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1322    VSTORE(N0)                                                 \
1323    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1324
1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1326    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1327    VSTORE(N0)                                                 \
1328    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1329
1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1331    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1332    VSTORE(N0)                                                 \
1333    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1334
1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1336    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1337    VSTORE(N0)                                                 \
1338    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1339
1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1341    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1342    VSTORE(N0)                                                 \
1343    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1344
1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1346    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1347    VSTORE(N0)                                                 \
1348    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1349
1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1351    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1352    VSTORE(N0)                                                 \
1353    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1354
1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1356    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1357    VSTORE(N0)                                                  \
1358    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1359
1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1361    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1362    VSTORE(N0)                                                  \
1363    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1364
1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1366    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1367    VSTORE(N0)                                                  \
1368    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1369
1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1371    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1372    VSTORE(N0)                                                  \
1373    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1374
1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1376    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1377    VSTORE(N0)                                                  \
1378    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1379
1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1381    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1382    VSTORE(N0)                                                  \
1383    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1384
1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1386    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1387    VSTORE(N0)                                                  \
1388    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389
1390
1391
1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1393    VSTORE(N0)                                                         \
1394    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1395
1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1397    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1398    VSTORE(N0)                                                         \
1399    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1400
1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1402    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1403    VSTORE(N0)                                                         \
1404    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1405
1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1407    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1408    VSTORE(N0)                                                         \
1409    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1410
1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1412    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1413    VSTORE(N0)                                                         \
1414    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1415
1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1417    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1418    VSTORE(N0)                                                         \
1419    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1420
1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1422    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1423    VSTORE(N0)                                                         \
1424    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1425
1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1427    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1428    VSTORE(N0)                                                         \
1429    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430
1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433    VSTORE(N0)                                                         \
1434    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435
1436#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1437    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1438    VSTORE(N0)                                                          \
1439    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440
1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1442    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1443    VSTORE(N0)                                                          \
1444    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1445
1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1447    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1448    VSTORE(N0)                                                          \
1449    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1450
1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1452    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1453    VSTORE(N0)                                                          \
1454    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1455
1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1457    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1458    VSTORE(N0)                                                          \
1459    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1460
1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1462    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1463    VSTORE(N0)                                                          \
1464    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1465
1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1467    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1468    VSTORE(N0)                                                          \
1469    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470
1471
1472
1473
1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476
1477
1478
1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
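// Example (illustrative names): STORE_BLOCK(2, 4, float, c, out_ptr, out_stride_y, zout)
// expands to
//   vstore4(c0, 0, (__global float *)(out_ptr + 0 * out_stride_y + zout0));
//   vstore4(c1, 0, (__global float *)(out_ptr + 1 * out_stride_y + zout1));
// CONVERT_STORE_BLOCK does the same but passes each row through CONVERT_SAT first.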
1481
1482
1483
1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1485    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1486    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
1487
1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1489    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1490    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1491    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
1492
1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1494    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1495    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1496    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
1497
1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1499    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1500    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1501    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
1502
1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1504    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1505    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1506    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
1507
1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1509    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1510    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1511    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
1512
1513#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1514    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1515    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1516    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
1517
1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1519    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1520    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1521    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1522
1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1524    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1525    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
1526    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1527
1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1529    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
1530    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1531    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1532
1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1534    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1535    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1536    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
1537
1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1539    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1540    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1541    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
1542
1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1544    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1545    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1546    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
1547
1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1549    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1550    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1551    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
1552
1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1554    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1555    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1556    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
1557
1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1559    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1560    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
1561    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562
1563
1564
1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
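// STORE_BLOCK_PARTIAL stores STORE_M0 rows of STORE_N0 elements out of an M0 x N0 block.
// For example, STORE_BLOCK_PARTIAL(1, 3, 16, ...) resolves to VSTORE_PARTIAL(16, 3),
// i.e. a single vstore3 of the first three components of row 0.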
1567
1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1569    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
1570    {                                                                                                                                                     \
1571        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
1572    }                                                                                                                                                     \
1573    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
1574    {                                                                                                                                                     \
1575        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1576    }                                                                                                                                                     \
1577    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
1578    {                                                                                                                                                     \
1579        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
1580    }                                                                                                                                                     \
1581    else                                                                                                                                                  \
1582    {                                                                                                                                                     \
1583        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
1584    }
1585
1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
1587    if(!(PARTIAL_COND_X))                                                                                         \
1588    {                                                                                                             \
1589        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1590    }                                                                                                             \
1591    else                                                                                                          \
1592    {                                                                                                             \
1593        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1594    }
1595
1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
1597    if(!(PARTIAL_COND_Y))                                                                                         \
1598    {                                                                                                             \
1599        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
1600    }                                                                                                             \
1601    else                                                                                                          \
1602    {                                                                                                             \
1603        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
1604    }
1605
1606
1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
1608
1609
1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
1611
1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1613    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1614
1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
1616
1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1618    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
1619
1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
1621
1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1623    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
1624
1625#else
1626
1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
1628    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
1629
1630#endif
1631
1632#endif
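// STORE_BLOCK_BOUNDARY_AWARE picks the cheapest store at compile time: a plain
// STORE_BLOCK when both leftovers are zero, a partial store along y and/or x otherwise,
// guarded at run time by PARTIAL_COND_Y / PARTIAL_COND_X.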
1633
1634
1635#if defined(PARTIAL_STORE_M0)
1636
1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1638    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
1639#else
1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
1641    ((uint)(y * M0))
1642#endif
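// COMPUTE_M0_START_ROW shifts the row blocks so the partially-filled block is the first
// one (y == 0) instead of running past the end. E.g. with M0 = 4 and PARTIAL_STORE_M0 = 1:
// y = 0 -> row 0, y = 1 -> row 1, y = 2 -> row 5.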
1643
1644
1645
1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
1647    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648
1649
1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable
1652#endif
1653
1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
1656#endif
1657
1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
1660#endif
1661
1662#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
1663#pragma OPENCL EXTENSION cl_arm_printf : enable
1664#endif
1665
1666#define GPU_ARCH_MIDGARD 0x100
1667#define GPU_ARCH_BIFROST 0x200
1668#define GPU_ARCH_VALHALL 0x300
1669
1670
1671#define CONCAT(a, b) a##b
1672
1673
1674#define EXPAND(x) x
1675
1676
1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678
1679
1680#define REV1(x) ((x))
1681#define REV2(x) ((x).s10)
1682#define REV3(x) ((x).s210)
1683#define REV4(x) ((x).s3210)
1684#define REV8(x) ((x).s76543210)
1685#define REV16(x) ((x).sFEDCBA9876543210)
1686
1687
1688
1689#define REVERSE_STR(x, s) REV##s((x))
1690#define REVERSE(x, s) REVERSE_STR(x, s)
1691
1692
1693
1694#define ROT1_0(x) ((x))
1695#define ROT1_1(x) ((x))
1696
1697#define ROT2_0(x) ((x))
1698#define ROT2_1(x) ((x).s10)
1699#define ROT2_2(x) ((x))
1700
1701#define ROT3_0(x) ((x))
1702#define ROT3_1(x) ((x).s201)
1703#define ROT3_2(x) ((x).s120)
1704#define ROT3_3(x) ((x))
1705
1706#define ROT4_0(x) ((x))
1707#define ROT4_1(x) ((x).s3012)
1708#define ROT4_2(x) ((x).s2301)
1709#define ROT4_3(x) ((x).s1230)
1710#define ROT4_4(x) ((x))
1711
1712#define ROT8_0(x) ((x))
1713#define ROT8_1(x) ((x).s70123456)
1714#define ROT8_2(x) ((x).s67012345)
1715#define ROT8_3(x) ((x).s56701234)
1716#define ROT8_4(x) ((x).s45670123)
1717#define ROT8_5(x) ((x).s34567012)
1718#define ROT8_6(x) ((x).s23456701)
1719#define ROT8_7(x) ((x).s12345670)
1720#define ROT8_8(x) ((x))
1721
1722#define ROT16_0(x) ((x))
1723#define ROT16_1(x) ((x).sF0123456789ABCDE)
1724#define ROT16_2(x) ((x).sEF0123456789ABCD)
1725#define ROT16_3(x) ((x).sDEF0123456789ABC)
1726#define ROT16_4(x) ((x).sCDEF0123456789AB)
1727#define ROT16_5(x) ((x).sBCDEF0123456789A)
1728#define ROT16_6(x) ((x).sABCDEF0123456789)
1729#define ROT16_7(x) ((x).s9ABCDEF012345678)
1730#define ROT16_8(x) ((x).s89ABCDEF01234567)
1731#define ROT16_9(x) ((x).s789ABCDEF0123456)
1732#define ROT16_10(x) ((x).s6789ABCDEF012345)
1733#define ROT16_11(x) ((x).s56789ABCDEF01234)
1734#define ROT16_12(x) ((x).s456789ABCDEF0123)
1735#define ROT16_13(x) ((x).s3456789ABCDEF012)
1736#define ROT16_14(x) ((x).s23456789ABCDEF01)
1737#define ROT16_15(x) ((x).s123456789ABCDEF0)
1738#define ROT16_16(x) ((x))
1739
1740
1741
1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
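// REVERSE and ROTATE are swizzle-based; e.g. REVERSE(x, 4) is x.s3210 and
// ROTATE(x, 4, 1) is ROT4_1(x), i.e. x.s3012 (rotate by one element).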
1744
1745
1746
1747#define V_OFFS1(dt) (dt##1)(0)
1748#define V_OFFS2(dt) (dt##2)(0, 1)
1749#define V_OFFS3(dt) (dt##3)(0, 1, 2)
1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
1753
1754
1755
1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758
1759
1760#define VLOAD_STR(size) vload##size
1761#define VLOAD(size) VLOAD_STR(size)
1762
1763
1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766
1767#define NO_LOAD(data, offs, ptr) \
1768    {                            \
1769    }
1770
1771
1772#define vload_partial_1_0 NO_LOAD
1773#define vload_partial_1_1 vload1
1774#define vload_partial_1_2 NO_LOAD
1775#define vload_partial_1_3 NO_LOAD
1776#define vload_partial_1_4 NO_LOAD
1777#define vload_partial_1_5 NO_LOAD
1778#define vload_partial_1_6 NO_LOAD
1779#define vload_partial_1_7 NO_LOAD
1780#define vload_partial_1_8 NO_LOAD
1781#define vload_partial_1_9 NO_LOAD
1782#define vload_partial_1_10 NO_LOAD
1783#define vload_partial_1_11 NO_LOAD
1784#define vload_partial_1_12 NO_LOAD
1785#define vload_partial_1_13 NO_LOAD
1786#define vload_partial_1_14 NO_LOAD
1787#define vload_partial_1_15 NO_LOAD
1788#define vload_partial_1_16 NO_LOAD
1789
1790#define vload_partial_2_0 NO_LOAD
1791#define vload_partial_2_1 vload_partial_1
1792#define vload_partial_2_2 vload_partial_2
1793#define vload_partial_2_3 NO_LOAD
1794#define vload_partial_2_4 NO_LOAD
1795#define vload_partial_2_5 NO_LOAD
1796#define vload_partial_2_6 NO_LOAD
1797#define vload_partial_2_7 NO_LOAD
1798#define vload_partial_2_8 NO_LOAD
1799#define vload_partial_2_9 NO_LOAD
1800#define vload_partial_2_10 NO_LOAD
1801#define vload_partial_2_11 NO_LOAD
1802#define vload_partial_2_12 NO_LOAD
1803#define vload_partial_2_13 NO_LOAD
1804#define vload_partial_2_14 NO_LOAD
1805#define vload_partial_2_15 NO_LOAD
1806#define vload_partial_2_16 NO_LOAD
1807
1808#define vload_partial_3_0 NO_LOAD
1809#define vload_partial_3_1 vload_partial_1
1810#define vload_partial_3_2 vload_partial_2
1811#define vload_partial_3_3 vload_partial_3
1812#define vload_partial_3_4 NO_LOAD
1813#define vload_partial_3_5 NO_LOAD
1814#define vload_partial_3_6 NO_LOAD
1815#define vload_partial_3_7 NO_LOAD
1816#define vload_partial_3_8 NO_LOAD
1817#define vload_partial_3_9 NO_LOAD
1818#define vload_partial_3_10 NO_LOAD
1819#define vload_partial_3_11 NO_LOAD
1820#define vload_partial_3_12 NO_LOAD
1821#define vload_partial_3_13 NO_LOAD
1822#define vload_partial_3_14 NO_LOAD
1823#define vload_partial_3_15 NO_LOAD
1824#define vload_partial_3_16 NO_LOAD
1825
1826#define vload_partial_4_0 NO_LOAD
1827#define vload_partial_4_1 vload_partial_1
1828#define vload_partial_4_2 vload_partial_2
1829#define vload_partial_4_3 vload_partial_3
1830#define vload_partial_4_4 vload_partial_4
1831#define vload_partial_4_5 NO_LOAD
1832#define vload_partial_4_6 NO_LOAD
1833#define vload_partial_4_7 NO_LOAD
1834#define vload_partial_4_8 NO_LOAD
1835#define vload_partial_4_9 NO_LOAD
1836#define vload_partial_4_10 NO_LOAD
1837#define vload_partial_4_11 NO_LOAD
1838#define vload_partial_4_12 NO_LOAD
1839#define vload_partial_4_13 NO_LOAD
1840#define vload_partial_4_14 NO_LOAD
1841#define vload_partial_4_15 NO_LOAD
1842#define vload_partial_4_16 NO_LOAD
1843
1844#define vload_partial_8_0 NO_LOAD
1845#define vload_partial_8_1 vload_partial_1
1846#define vload_partial_8_2 vload_partial_2
1847#define vload_partial_8_3 vload_partial_3
1848#define vload_partial_8_4 vload_partial_4
1849#define vload_partial_8_5 vload_partial_5
1850#define vload_partial_8_6 vload_partial_6
1851#define vload_partial_8_7 vload_partial_7
1852#define vload_partial_8_8 vload_partial_8
1853#define vload_partial_8_9 NO_LOAD
1854#define vload_partial_8_10 NO_LOAD
1855#define vload_partial_8_11 NO_LOAD
1856#define vload_partial_8_12 NO_LOAD
1857#define vload_partial_8_13 NO_LOAD
1858#define vload_partial_8_14 NO_LOAD
1859#define vload_partial_8_15 NO_LOAD
1860#define vload_partial_8_16 NO_LOAD
1861
1862#define vload_partial_16_0 NO_LOAD
1863#define vload_partial_16_1 vload_partial_1
1864#define vload_partial_16_2 vload_partial_2
1865#define vload_partial_16_3 vload_partial_3
1866#define vload_partial_16_4 vload_partial_4
1867#define vload_partial_16_5 vload_partial_5
1868#define vload_partial_16_6 vload_partial_6
1869#define vload_partial_16_7 vload_partial_7
1870#define vload_partial_16_8 vload_partial_8
1871#define vload_partial_16_9 vload_partial_9
1872#define vload_partial_16_10 vload_partial_10
1873#define vload_partial_16_11 vload_partial_11
1874#define vload_partial_16_12 vload_partial_12
1875#define vload_partial_16_13 vload_partial_13
1876#define vload_partial_16_14 vload_partial_14
1877#define vload_partial_16_15 vload_partial_15
1878#define vload_partial_16_16 vload_partial_16
1879
1880
1881#define vload_partial_1(DATA, OFFSET, PTR) \
1882    DATA.s0 = vload1(OFFSET, PTR);
1883
1884#define vload_partial_2(DATA, OFFSET, PTR) \
1885    DATA.s01 = vload2(OFFSET, PTR);
1886
1887#define vload_partial_3(DATA, OFFSET, PTR) \
1888    DATA.s012 = vload3(OFFSET, PTR);
1889
1890#define vload_partial_4(DATA, OFFSET, PTR) \
1891    DATA.s0123 = vload4(OFFSET, PTR);
1892
1893#define vload_partial_5(DATA, OFFSET, PTR)    \
1894    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895    DATA.s4 = vload1(OFFSET, PTR + 4);
1896
1897#define vload_partial_6(DATA, OFFSET, PTR)    \
1898    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900
1901#define vload_partial_7(DATA, OFFSET, PTR)    \
1902    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904
1905#define vload_partial_8(DATA, OFFSET, PTR) \
1906    DATA.s01234567 = vload8(OFFSET, PTR);
1907
1908#define vload_partial_9(DATA, OFFSET, PTR)        \
1909    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910    DATA.s8 = vload1(OFFSET, PTR + 8);
1911
1912#define vload_partial_10(DATA, OFFSET, PTR)       \
1913    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915
1916#define vload_partial_11(DATA, OFFSET, PTR)       \
1917    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919
1920#define vload_partial_12(DATA, OFFSET, PTR)       \
1921    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923
1924#define vload_partial_13(DATA, OFFSET, PTR)       \
1925    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927
1928#define vload_partial_14(DATA, OFFSET, PTR)       \
1929    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931
1932#define vload_partial_15(DATA, OFFSET, PTR)       \
1933    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935
1936#define vload_partial_16(DATA, OFFSET, PTR) \
1937    DATA = vload16(OFFSET, PTR);
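// The partial loads are composed from power-of-two pieces, e.g. vload_partial_7 is a
// vload4 of .s0123 followed by a vload3 of .s456 at PTR + 4. With illustrative arguments,
// VLOAD_PARTIAL(8, 3)(d, 0, p) resolves to vload_partial_3, i.e. d.s012 = vload3(0, p);
// size combinations that cannot occur map to NO_LOAD, which expands to an empty block.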
1938
1939
1940
1941#define PIXEL_UNIT4 1
1942#define PIXEL_UNIT8 2
1943#define PIXEL_UNIT16 4
1944
1945
1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948
1949
1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953
1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958#endif
1959
1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963
1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968#endif
1969
1970
1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973
1974
1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
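// The image2d path reads/writes RGBA texels of four components, so a vector of 4/8/16
// elements corresponds to 1/2/4 pixel units (see PIXEL_UNIT* above). For example,
// READ_IMAGE2D(float, 4, img, x, y) resolves to read_image2d_floatx4, i.e. four
// read_imagef calls on adjacent x coordinates packed into a float16.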
1977
1978#define VSTORE_STR(size) vstore##size
1979#define VSTORE(size) VSTORE_STR(size)
1980
1981#define float1 float
1982#define half1 half
1983#define char1 char
1984#define uchar1 uchar
1985#define short1 short
1986#define ushort1 ushort
1987#define int1 int
1988#define uint1 uint
1989#define long1 long
1990#define ulong1 ulong
1991#define double1 double
1992
1993#define vload1(OFFSET, PTR) *(OFFSET + PTR)
1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
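// A size-1 "vector" is just the scalar type (see the float1/half1/... aliases above),
// so vload1/vstore1 are defined as plain pointer dereferences.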
1995
1996
1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999
2000#define NO_STORE(data, offs, ptr) \
2001    {                             \
2002    }
2003
2004
2005#define vstore_partial_1_0 NO_STORE
2006#define vstore_partial_1_1 vstore1
2007#define vstore_partial_1_2 NO_STORE
2008#define vstore_partial_1_3 NO_STORE
2009#define vstore_partial_1_4 NO_STORE
2010#define vstore_partial_1_5 NO_STORE
2011#define vstore_partial_1_6 NO_STORE
2012#define vstore_partial_1_7 NO_STORE
2013#define vstore_partial_1_8 NO_STORE
2014#define vstore_partial_1_9 NO_STORE
2015#define vstore_partial_1_10 NO_STORE
2016#define vstore_partial_1_11 NO_STORE
2017#define vstore_partial_1_12 NO_STORE
2018#define vstore_partial_1_13 NO_STORE
2019#define vstore_partial_1_14 NO_STORE
2020#define vstore_partial_1_15 NO_STORE
2021#define vstore_partial_1_16 NO_STORE
2022
2023#define vstore_partial_2_0 NO_STORE
2024#define vstore_partial_2_1 vstore_partial_1
2025#define vstore_partial_2_2 vstore_partial_2
2026#define vstore_partial_2_3 NO_STORE
2027#define vstore_partial_2_4 NO_STORE
2028#define vstore_partial_2_5 NO_STORE
2029#define vstore_partial_2_6 NO_STORE
2030#define vstore_partial_2_7 NO_STORE
2031#define vstore_partial_2_8 NO_STORE
2032#define vstore_partial_2_9 NO_STORE
2033#define vstore_partial_2_10 NO_STORE
2034#define vstore_partial_2_11 NO_STORE
2035#define vstore_partial_2_12 NO_STORE
2036#define vstore_partial_2_13 NO_STORE
2037#define vstore_partial_2_14 NO_STORE
2038#define vstore_partial_2_15 NO_STORE
2039#define vstore_partial_2_16 NO_STORE
2040
2041#define vstore_partial_3_0 NO_STORE
2042#define vstore_partial_3_1 vstore_partial_1
2043#define vstore_partial_3_2 vstore_partial_2
2044#define vstore_partial_3_3 vstore_partial_3
2045#define vstore_partial_3_4 NO_STORE
2046#define vstore_partial_3_5 NO_STORE
2047#define vstore_partial_3_6 NO_STORE
2048#define vstore_partial_3_7 NO_STORE
2049#define vstore_partial_3_8 NO_STORE
2050#define vstore_partial_3_9 NO_STORE
2051#define vstore_partial_3_10 NO_STORE
2052#define vstore_partial_3_11 NO_STORE
2053#define vstore_partial_3_12 NO_STORE
2054#define vstore_partial_3_13 NO_STORE
2055#define vstore_partial_3_14 NO_STORE
2056#define vstore_partial_3_15 NO_STORE
2057#define vstore_partial_3_16 NO_STORE
2058
2059#define vstore_partial_4_0 NO_STORE
2060#define vstore_partial_4_1 vstore_partial_1
2061#define vstore_partial_4_2 vstore_partial_2
2062#define vstore_partial_4_3 vstore_partial_3
2063#define vstore_partial_4_4 vstore_partial_4
2064#define vstore_partial_4_5 NO_STORE
2065#define vstore_partial_4_6 NO_STORE
2066#define vstore_partial_4_7 NO_STORE
2067#define vstore_partial_4_8 NO_STORE
2068#define vstore_partial_4_9 NO_STORE
2069#define vstore_partial_4_10 NO_STORE
2070#define vstore_partial_4_11 NO_STORE
2071#define vstore_partial_4_12 NO_STORE
2072#define vstore_partial_4_13 NO_STORE
2073#define vstore_partial_4_14 NO_STORE
2074#define vstore_partial_4_15 NO_STORE
2075#define vstore_partial_4_16 NO_STORE
2076
2077#define vstore_partial_8_0 NO_STORE
2078#define vstore_partial_8_1 vstore_partial_1
2079#define vstore_partial_8_2 vstore_partial_2
2080#define vstore_partial_8_3 vstore_partial_3
2081#define vstore_partial_8_4 vstore_partial_4
2082#define vstore_partial_8_5 vstore_partial_5
2083#define vstore_partial_8_6 vstore_partial_6
2084#define vstore_partial_8_7 vstore_partial_7
2085#define vstore_partial_8_8 vstore_partial_8
2086#define vstore_partial_8_9 NO_STORE
2087#define vstore_partial_8_10 NO_STORE
2088#define vstore_partial_8_11 NO_STORE
2089#define vstore_partial_8_12 NO_STORE
2090#define vstore_partial_8_13 NO_STORE
2091#define vstore_partial_8_14 NO_STORE
2092#define vstore_partial_8_15 NO_STORE
2093#define vstore_partial_8_16 NO_STORE
2094
2095#define vstore_partial_16_0 NO_STORE
2096#define vstore_partial_16_1 vstore_partial_1
2097#define vstore_partial_16_2 vstore_partial_2
2098#define vstore_partial_16_3 vstore_partial_3
2099#define vstore_partial_16_4 vstore_partial_4
2100#define vstore_partial_16_5 vstore_partial_5
2101#define vstore_partial_16_6 vstore_partial_6
2102#define vstore_partial_16_7 vstore_partial_7
2103#define vstore_partial_16_8 vstore_partial_8
2104#define vstore_partial_16_9 vstore_partial_9
2105#define vstore_partial_16_10 vstore_partial_10
2106#define vstore_partial_16_11 vstore_partial_11
2107#define vstore_partial_16_12 vstore_partial_12
2108#define vstore_partial_16_13 vstore_partial_13
2109#define vstore_partial_16_14 vstore_partial_14
2110#define vstore_partial_16_15 vstore_partial_15
2111#define vstore_partial_16_16 vstore_partial_16
2112
2113
2114#define vstore_partial_1(DATA, OFFSET, PTR) \
2115    vstore1(DATA.s0, OFFSET, PTR);
2116
2117#define vstore_partial_2(DATA, OFFSET, PTR) \
2118    vstore2(DATA.s01, OFFSET, PTR);
2119
2120#define vstore_partial_3(DATA, OFFSET, PTR) \
2121    vstore3(DATA.s012, OFFSET, PTR);
2122
2123#define vstore_partial_4(DATA, OFFSET, PTR) \
2124    vstore4(DATA.s0123, OFFSET, PTR);
2125
2126#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128    vstore1(DATA.s4, OFFSET, PTR + 4);
2129
2130#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133
2134#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137
2138#define vstore_partial_8(DATA, OFFSET, PTR) \
2139    vstore8(DATA.s01234567, OFFSET, PTR);
2140
2141#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143    vstore1(DATA.s8, OFFSET, PTR + 8);
2144
2145#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148
2149#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152
2153#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156
2157#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160
2161#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164
2165#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168
2169#define vstore_partial_16(DATA, OFFSET, PTR) \
2170    vstore16(DATA, OFFSET, PTR);
2171
2172
2173
2174
2175
2176#define convert_float_sat convert_float
2177#define convert_float1_sat convert_float
2178#define convert_float2_sat convert_float2
2179#define convert_float3_sat convert_float3
2180#define convert_float4_sat convert_float4
2181#define convert_float8_sat convert_float8
2182#define convert_float16_sat convert_float16
2183#define convert_half_sat convert_half
2184#define convert_half1_sat convert_half
2185#define convert_half2_sat convert_half2
2186#define convert_half3_sat convert_half3
2187#define convert_half4_sat convert_half4
2188#define convert_half8_sat convert_half8
2189#define convert_half16_sat convert_half16
2190
2191#define convert_float1 convert_float
2192#define convert_half1 convert_half
2193#define convert_char1 convert_char
2194#define convert_uchar1 convert_uchar
2195#define convert_short1 convert_short
2196#define convert_ushort1 convert_ushort
2197#define convert_int1 convert_int
2198#define convert_uint1 convert_uint
2199#define convert_long1 convert_long
2200#define convert_ulong1 convert_ulong
2201#define convert_double1 convert_double
2202
2203#define convert_char1_sat convert_char_sat
2204#define convert_uchar1_sat convert_uchar_sat
2205#define convert_uchar2_sat convert_uchar2_sat
2206#define convert_uchar3_sat convert_uchar3_sat
2207#define convert_uchar4_sat convert_uchar4_sat
2208#define convert_uchar8_sat convert_uchar8_sat
2209#define convert_uchar16_sat convert_uchar16_sat
2210#define convert_short1_sat convert_short_sat
2211#define convert_ushort1_sat convert_ushort_sat
2212#define convert_int1_sat convert_int_sat
2213#define convert_uint1_sat convert_uint_sat
2214#define convert_long1_sat convert_long_sat
2215#define convert_ulong1_sat convert_ulong_sat
2216#define convert_double1_sat convert_double_sat
2217
2218#define VEC_DATA_TYPE_STR(type, size) type##size
2219#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220
2221#define CONVERT_STR(x, type) (convert_##type((x)))
2222#define CONVERT(x, type) CONVERT_STR(x, type)
2223
2224#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226
2227#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
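// Example (illustrative variable acc): CONVERT_SAT(acc, VEC_DATA_TYPE(uchar, 4)) resolves
// to convert_uchar4_sat(acc); the _STR level exists so the arguments are macro-expanded
// before token pasting.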
2229
2230#define select_vec_dt_uchar(size) uchar##size
2231#define select_vec_dt_char(size) char##size
2232#define select_vec_dt_ushort(size) ushort##size
2233#define select_vec_dt_short(size) short##size
2234#define select_vec_dt_half(size) short##size
2235#define select_vec_dt_uint(size) uint##size
2236#define select_vec_dt_int(size) int##size
2237#define select_vec_dt_float(size) int##size
2238#define select_vec_dt_ulong(size) ulong##size
2239#define select_vec_dt_long(size) long##size
2240
2241#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2244
2245#define signed_int_vec_dt_uchar(size) char##size
2246#define signed_int_vec_dt_char(size) char##size
2247#define signed_int_vec_dt_ushort(size) short##size
2248#define signed_int_vec_dt_short(size) short##size
2249#define signed_int_vec_dt_half(size) short##size
2250#define signed_int_vec_dt_uint(size) int##size
2251#define signed_int_vec_dt_int(size) int##size
2252#define signed_int_vec_dt_float(size) int##size
2253#define signed_int_vec_dt_ulong(size) long##size
2254#define signed_int_vec_dt_long(size) long##size
2255
2256#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259
2260#define sum_reduce_1(x) (x)
2261#define sum_reduce_2(x) ((x).s0) + ((x).s1)
2262#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
2263#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
2264#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
2265#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
2266
2267#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
2268#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269
2270#define prod_reduce_1(x) (x)
2271#define prod_reduce_2(x) ((x).s0) * ((x).s1)
2272#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
2273#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
2274#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
2275#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
2276
2277#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
2278#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279
2280#define max_reduce_1(x) (x)
2281#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286
2287#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
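// The *_REDUCE helpers fold a vector pairwise; e.g. SUM_REDUCE(v, 4) sums the four
// components v.s0 + v.s1 + v.s2 + v.s3, and MAX_REDUCE(v, 4) takes their maximum.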
2289
2290#define VECTOR_DECLARATION(name)     \
2291    __global uchar *name##_ptr,      \
2292    uint        name##_stride_x, \
2293    uint        name##_step_x,   \
2294    uint        name##_offset_first_element_in_bytes
2295
2296#define IMAGE_DECLARATION(name)      \
2297    __global uchar *name##_ptr,      \
2298    uint        name##_stride_x, \
2299    uint        name##_step_x,   \
2300    uint        name##_stride_y, \
2301    uint        name##_step_y,   \
2302    uint        name##_offset_first_element_in_bytes
2303
2304#define TENSOR3D_DECLARATION(name)   \
2305    __global uchar *name##_ptr,      \
2306    uint        name##_stride_x, \
2307    uint        name##_step_x,   \
2308    uint        name##_stride_y, \
2309    uint        name##_step_y,   \
2310    uint        name##_stride_z, \
2311    uint        name##_step_z,   \
2312    uint        name##_offset_first_element_in_bytes
2313
2314#define TENSOR4D_DECLARATION(name)   \
2315    __global uchar *name##_ptr,      \
2316    uint        name##_stride_x, \
2317    uint        name##_step_x,   \
2318    uint        name##_stride_y, \
2319    uint        name##_step_y,   \
2320    uint        name##_stride_z, \
2321    uint        name##_step_z,   \
2322    uint        name##_stride_w, \
2323    uint        name##_step_w,   \
2324    uint        name##_offset_first_element_in_bytes
2325
2326#define TENSOR5D_DECLARATION(name)   \
2327    __global uchar *name##_ptr,      \
2328    uint        name##_stride_x, \
2329    uint        name##_step_x,   \
2330    uint        name##_stride_y, \
2331    uint        name##_step_y,   \
2332    uint        name##_stride_z, \
2333    uint        name##_step_z,   \
2334    uint        name##_stride_w, \
2335    uint        name##_step_w,   \
2336    uint        name##_stride_v, \
2337    uint        name##_step_v,   \
2338    uint        name##_offset_first_element_in_bytes
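// The *_DECLARATION macros expand to the kernel argument list describing one tensor.
// For example, IMAGE_DECLARATION(src) declares src_ptr, src_stride_x, src_step_x,
// src_stride_y, src_step_y and src_offset_first_element_in_bytes.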
2339
2340#define CONVERT_TO_VECTOR_STRUCT(name) \
2341    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342
2343#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345
2346#define CONVERT_TO_IMAGE_STRUCT(name) \
2347    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348
2349#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351
2352#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2353    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2354
2355#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2356    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2357
2361#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2362    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2363                                 name##_stride_z, name##_step_z)
2364
2365#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2366    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2367
2368#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2369    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2370                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2371
2372#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2373    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2374
2375#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2376    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2377                           name##_stride_z, name##_step_z)
2378
2379
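// Lightweight buffer views: a global byte pointer, the byte offset of the first element
// and the byte stride of each dimension.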
2380typedef struct Vector
2381{
2382    __global uchar *ptr;
2383    int             offset_first_element_in_bytes;
2384    int             stride_x;
2385} Vector;
2386
2387
2388typedef struct Image
2389{
2390    __global uchar *ptr;
2391    int             offset_first_element_in_bytes;
2392    int             stride_x;
2393    int             stride_y;
2394} Image;
2395
2396
2397typedef struct Tensor3D
2398{
2399    __global uchar *ptr;
2400    int             offset_first_element_in_bytes;
2401    int             stride_x;
2402    int             stride_y;
2403    int             stride_z;
2404} Tensor3D;
2405
2406
2407typedef struct Tensor4D
2408{
2409    __global uchar *ptr;
2410    int             offset_first_element_in_bytes;
2411    int             stride_x;
2412    int             stride_y;
2413    int             stride_z;
2414    int             stride_w;
2415} Tensor4D;
2416
2417
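// The update_*_workitem_ptr() helpers below build one of the structs above and advance its
// pointer to the data owned by the current work-item (global id times step, per dimension).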
2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419{
2420    Vector vector =
2421    {
2422        .ptr                           = ptr,
2423        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424        .stride_x                      = stride_x,
2425    };
2426    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427    return vector;
2428}
2429
2430
2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432{
2433    Image img =
2434    {
2435        .ptr                           = ptr,
2436        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437        .stride_x                      = stride_x,
2438        .stride_y                      = stride_y
2439    };
2440    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441    return img;
2442}
2443
2444
2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446{
2447    Image img =
2448    {
2449        .ptr                           = ptr,
2450        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451        .stride_x                      = stride_x,
2452        .stride_y                      = stride_y
2453    };
2454    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455    return img;
2456}
2457
2458
2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460{
2461    Tensor3D tensor =
2462    {
2463        .ptr                           = ptr,
2464        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465        .stride_x                      = stride_x,
2466        .stride_y                      = stride_y,
2467        .stride_z                      = stride_z
2468    };
2469    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470    return tensor;
2471}
2472
2473
2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475{
2476    Tensor3D tensor =
2477    {
2478        .ptr                           = ptr,
2479        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480        .stride_x                      = stride_x,
2481        .stride_y                      = stride_y,
2482        .stride_z                      = stride_z
2483    };
2484    return tensor;
2485}
2486
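// For 4D tensors the third global id is split by mod_size: (id % mod_size) indexes z and
// (id / mod_size) indexes w.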
2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488                                             uint step_w,
2489                                             uint mod_size)
2490{
2491    Tensor4D tensor =
2492    {
2493        .ptr                           = ptr,
2494        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495        .stride_x                      = stride_x,
2496        .stride_y                      = stride_y,
2497        .stride_z                      = stride_z,
2498        .stride_w                      = stride_w
2499    };
2500
2501    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502    return tensor;
2503}
2504
2505
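// *_offset() helpers: return the byte address of element (x[, y[, z[, w]]]) relative to the
// view's current pointer.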
2506inline __global const uchar *vector_offset(const Vector *vec, int x)
2507{
2508    return vec->ptr + x * vec->stride_x;
2509}
2510
2511
2512inline __global uchar *offset(const Image *img, int x, int y)
2513{
2514    return img->ptr + x * img->stride_x + y * img->stride_y;
2515}
2516
2517
2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519{
2520    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521}
2522
2523
2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525{
2526    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527}
2528
2529
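// tensor3D_index2ptr(): convert a linear element index into (x, y, z) coordinates of a
// width x height x depth volume and return the corresponding byte address.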
2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531{
2532    uint num_elements = width * height;
2533
2534    const uint z = index / num_elements;
2535
2536    index %= num_elements;
2537
2538    const uint y = index / width;
2539
2540    index %= width;
2541
2542    const uint x = index;
2543
2544    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545}
2546
2547#endif
2548
2549
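// SCALAR_ACCESS(offset, n0, x): select the n0 consecutive components of vector x starting
// at component 'offset', e.g. SCALAR_ACCESS(4, 2, x) expands to ((x).s45).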
2550#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
2551#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
2552
2553
2554#define scalar_access_0_1(x) ((x).s0)
2555#define scalar_access_0_2(x) ((x).s01)
2556#define scalar_access_0_3(x) ((x).s012)
2557#define scalar_access_0_4(x) ((x).s0123)
2558#define scalar_access_0_8(x) ((x).s01234567)
2559#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
2560
2561
2562#define scalar_access_1_1(x) ((x).s1)
2563#define scalar_access_1_2(x) ((x).s12)
2564#define scalar_access_1_3(x) ((x).s123)
2565#define scalar_access_1_4(x) ((x).s1234)
2566#define scalar_access_1_8(x) ((x).s12345678)
2567
2568
2569#define scalar_access_2_1(x) ((x).s2)
2570#define scalar_access_2_2(x) ((x).s23)
2571#define scalar_access_2_3(x) ((x).s234)
2572#define scalar_access_2_4(x) ((x).s2345)
2573#define scalar_access_2_8(x) ((x).s23456789)
2574
2575
2576#define scalar_access_3_1(x) ((x).s3)
2577#define scalar_access_3_2(x) ((x).s34)
2578#define scalar_access_3_3(x) ((x).s345)
2579#define scalar_access_3_4(x) ((x).s3456)
2580#define scalar_access_3_8(x) ((x).s3456789A)
2581
2582
2583#define scalar_access_4_1(x) ((x).s4)
2584#define scalar_access_4_2(x) ((x).s45)
2585#define scalar_access_4_3(x) ((x).s456)
2586#define scalar_access_4_4(x) ((x).s4567)
2587#define scalar_access_4_8(x) ((x).s456789AB)
2588
2589
2590#define scalar_access_8_1(x) ((x).s8)
2591#define scalar_access_8_2(x) ((x).s89)
2592#define scalar_access_8_3(x) ((x).s89A)
2593#define scalar_access_8_4(x) ((x).s89AB)
2594#define scalar_access_8_8(x) ((x).s89ABCDEF)
2595
2596
2597#define scalar_access_12_1(x) ((x).sC)
2598#define scalar_access_12_2(x) ((x).sCD)
2599#define scalar_access_12_3(x) ((x).sCDE)
2600#define scalar_access_12_4(x) ((x).sCDEF)
2601
2602
2603#define scalar_access_16_1(x) ((x).sF)
2604
2605
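// LOAD_TENSOR_ROW_<n>: load n rows of N0 elements each into sub-slices (selected with
// SCALAR_ACCESS at column COL_OFFSET) of the already-declared vectors BASENAME0..BASENAME(n-1);
// Z##row adds a per-row offset.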
2606#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2607    ({})
2608
2609#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2610    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
2611
2612#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2613    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2614    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
2615
2616#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2617    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2618    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
2619
2620#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2621    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2622    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
2623
2624#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2625    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2626    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
2627
2628#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2629    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2630    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
2631
2632#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2633    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2634    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
2635
2636#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2637    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2638    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
2639
2640#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2641    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2642    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
2643
2644#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2645    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
2646    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
2647
2648#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2649    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2650    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
2651
2652#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2653    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2654    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
2655
2656#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2657    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2658    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
2659
2660#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2661    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2662    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
2663
2664#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2665    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2666    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
2667
2668#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
2669    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
2670    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
2671
2672
2673
2674#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2675#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676
2677
2678
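// LOAD_TENSOR_M0X<n>: load an M0 x n block, splitting column counts that are not a supported
// vector width into smaller loads at increasing column offsets (e.g. 5 = 4 + 1, 13 = 8 + 4 + 1).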
2679#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2680    ({})
2681
2682#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2683    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2684
2685#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2686    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2687
2688#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2689    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2690
2691#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2692    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2693
2694#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2695    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2696    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2697
2698#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2699    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2700    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2701
2702#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2703    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2704    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
2705
2706#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2707    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2708
2709#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2710    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
2711    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2712
2713#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2714    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2715    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2716
2717#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2718    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2719    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2720
2721#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2722    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
2723    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
2724
2725#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2726    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2727    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2728    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2729
2730#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2731    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2732    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2733    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2734
2735#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
2736    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
2737    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
2738    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
2739
2740#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
2741    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
2742
2743
2744
2745#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2746#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747
2748
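// LOAD_ROW_<n>: declare the N0-element vectors BASENAME0..BASENAME(n-1) and load each row
// from PTR + OFFSET + row * STRIDE_Y (+ the per-row offset Z##row).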
2749#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2750    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2751    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2752
2753#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2754    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2755    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2756    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2757
2758#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2759    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2760    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2761    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2762
2763#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2764    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2765    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2766    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2767
2768#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2769    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2770    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2771    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2772
2773#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2774    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2775    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2776    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2777
2778#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2779    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2780    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2781    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2782
2783#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2784    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2785    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2786    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2787
2788#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2789    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2790    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2791    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2792
2793#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2794    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2795    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2796    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2797
2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2799    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2800    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2801    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2802
2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2804    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2805    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2806    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807
2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812
2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817
2818#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822
2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827
2828
2829
2830
2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833
2834
2835
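// LOAD_ROW_PARTIAL_<n>: like LOAD_ROW_<n>, but load only LOAD_N0 <= N0 elements per row into
// the existing N0-wide variables using VLOAD_PARTIAL.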
2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839
2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844
2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849
2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854
2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859
2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864
2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869
2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874
2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879
2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884
2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889
2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894
2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899
2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904
2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909
2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914
2915
2916
2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919
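// Boundary-aware block loads: depending on the run-time conditions PARTIAL_COND_X/Y, load the
// full M0 x N0 block or the reduced PARTIAL_STORE_M0 / PARTIAL_STORE_N0 block at the borders.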
2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922    {                                                                                                                                                            \
2923        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924    }                                                                                                                                                            \
2925    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926    {                                                                                                                                                            \
2927        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928    }                                                                                                                                                            \
2929    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930    {                                                                                                                                                            \
2931        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932    }                                                                                                                                                            \
2933    else                                                                                                                                                         \
2934    {                                                                                                                                                            \
2935        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936    }
2937
2938#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939    if(!(PARTIAL_COND_X))                                                                                                \
2940    {                                                                                                                    \
2941        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942    }                                                                                                                    \
2943    else                                                                                                                 \
2944    {                                                                                                                    \
2945        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946    }
2947
2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949    if(!(PARTIAL_COND_Y))                                                                                                \
2950    {                                                                                                                    \
2951        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952    }                                                                                                                    \
2953    else                                                                                                                 \
2954    {                                                                                                                    \
2955        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956    }
2957
2958
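// Pick the boundary handling at compile time from PARTIAL_STORE_M0/N0: no guard, a guard in Y
// only, a guard in X only, or guards in both; the guarded variants zero-initialise the block first.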
2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960
2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963
2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965
2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969
2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971
2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975
2976#else
2977
2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981
2982#endif
2983
2984
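// LOAD_TEXTURE2D_ROW_<n>: read n rows of N0 elements from the OpenCL image IMG with
// READ_IMAGE2D, stepping the coordinates by (X_STEP_ROW, Y_STEP_ROW) per row.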
2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987
2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991
2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995
2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999
3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003
3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007
3008#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011
3012#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015
3016#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019
3020#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023
3024#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027
3028#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031
3032#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035
3036#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039
3040#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043
3044#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047
3048
3049
3050#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052
3053
3054
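// LOAD_ROW_INDIRECT_<n>: gather n rows using per-row Y indices (Y##row); rows whose mask
// Y_MASK##row is zero are set to 0 instead of being loaded.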
3055#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057    BASENAME##0;                                                                            \
3058    if(Y_MASK##0 != 0)                                                                      \
3059        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060    else                                                                                    \
3061        BASENAME##0 = 0;
3062
3063#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066    BASENAME##1;                                                                            \
3067    if(Y_MASK##1 != 0)                                                                      \
3068        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069    else                                                                                    \
3070        BASENAME##1 = 0;
3071
3072#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075    BASENAME##2;                                                                            \
3076    if(Y_MASK##2 != 0)                                                                      \
3077        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078    else                                                                                    \
3079        BASENAME##2 = 0;
3080
3081#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084    BASENAME##3;                                                                            \
3085    if(Y_MASK##3 != 0)                                                                      \
3086        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087    else                                                                                    \
3088        BASENAME##3 = 0;
3089
3090#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093    BASENAME##4;                                                                            \
3094    if(Y_MASK##4 != 0)                                                                      \
3095        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096    else                                                                                    \
3097        BASENAME##4 = 0;
3098
3099#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102    BASENAME##5;                                                                            \
3103    if(Y_MASK##5 != 0)                                                                      \
3104        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105    else                                                                                    \
3106        BASENAME##5 = 0;
3107
3108#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111    BASENAME##6;                                                                            \
3112    if(Y_MASK##6 != 0)                                                                      \
3113        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114    else                                                                                    \
3115        BASENAME##6 = 0;
3116
3117#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120    BASENAME##7;                                                                            \
3121    if(Y_MASK##7 != 0)                                                                      \
3122        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123    else                                                                                    \
3124        BASENAME##7 = 0;
3125
3126#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129    BASENAME##8;                                                                            \
3130    if(Y_MASK##8 != 0)                                                                      \
3131        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132    else                                                                                    \
3133        BASENAME##8 = 0;
3134
3135#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138    BASENAME##9;                                                                            \
3139    if(Y_MASK##9 != 0)                                                                      \
3140        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141    else                                                                                    \
3142        BASENAME##9 = 0;
3143
3144#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147    BASENAME##A;                                                                            \
3148    if(Y_MASK##A != 0)                                                                      \
3149        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150    else                                                                                    \
3151        BASENAME##A = 0;
3152
3153#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156    BASENAME##B;                                                                            \
3157    if(Y_MASK##B != 0)                                                                      \
3158        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159    else                                                                                    \
3160        BASENAME##B = 0;
3161
3162#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165    BASENAME##C;                                                                            \
3166    if(Y_MASK##C != 0)                                                                      \
3167        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168    else                                                                                    \
3169        BASENAME##C = 0;
3170
3171#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174    BASENAME##D;                                                                            \
3175    if(Y_MASK##D != 0)                                                                      \
3176        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177    else                                                                                    \
3178        BASENAME##D = 0;
3179
3180#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183    BASENAME##E;                                                                            \
3184    if(Y_MASK##E != 0)                                                                      \
3185        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186    else                                                                                    \
3187        BASENAME##E = 0;
3188
3189#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192    BASENAME##F;                                                                            \
3193    if(Y_MASK##F != 0)                                                                      \
3194        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195    else                                                                                    \
3196        BASENAME##F = 0;
3197
3198
3199#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201
3202
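// LOAD_ELEMENT_<n>: load one DATA_TYPE element per row (at PTR + OFFSET + row * STRIDE_Y)
// into BASENAME0..BASENAME(n-1).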
3203#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3204    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3205    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
3206
3207#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3208    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3209    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3210    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
3211
3212#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3213    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3214    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3215    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
3216
3217#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3218    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3219    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3220    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
3221
3222#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3223    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3224    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3225    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
3226
3227#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3228    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3229    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3230    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
3231
3232#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3233    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3234    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3235    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
3236
3237#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3238    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3239    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3240    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
3241
3242#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3243    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3244    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3245    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
3246
3247#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3248    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
3249    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3250    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
3251
3252#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3253    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3254    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3255    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
3256
3257#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3258    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3259    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3260    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
3261
3262#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3263    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3264    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3265    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
3266
3267#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3268    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3269    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3270    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
3271
3272#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3273    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3274    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3275    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
3276
3277#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3278    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3279    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3280    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
3281
3282
3283
3284
3285#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3286#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287
3288
3289
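// CALCULATE_Z_OFFSET(M0, ...) fills Z##0..Z##{M0-1} with the extra offset needed when a 2D tensor is
// re-interpreted as a 3D one (GEMM3D). For row i it computes
//   Z##i = min((i + Y) / HEIGHT_GEMM3D, DEPTH_GEMM3D - 1) * CROSS_PLANE_PAD * STRIDE_Y
// i.e. the number of planes crossed so far times the cross-plane padding, scaled by the row stride;
// the load/store macros then add Z##i to the row address.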
3290#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3291    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3292    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
3293    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
3294
3295#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3296    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3297    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3298    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
3299    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
3300
3301#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3302    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3303    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3304    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
3305    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
3306
3307#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3308    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3309    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3310    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
3311    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
3312
3313#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3314    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3315    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3316    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
3317    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
3318
3319#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3320    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3321    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3322    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
3323    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
3324
3325#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3326    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3327    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3328    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
3329    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
3330
3331#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3332    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3333    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3334    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
3335    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
3336
3337
3338
3339
3340#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3341#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342
3343
3344
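// SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) multiplies the N row variables BASENAME##0..BASENAME##{N-1}
// in place by (DATA_TYPE)SCALE. For example, SCALE_BLOCK(2, float, c, ALPHA) expands to:
//   c0 *= (float)ALPHA; c1 *= (float)ALPHA;
// (c and ALPHA are illustrative names; the GEMM kernels in this file use it for the alpha scaling.)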
3345#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
3346    BASENAME##0 *= (DATA_TYPE)SCALE;
3347
3348#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
3349    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
3350    BASENAME##1 *= (DATA_TYPE)SCALE;
3351
3352#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
3353    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
3354    BASENAME##2 *= (DATA_TYPE)SCALE;
3355
3356#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
3357    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
3358    BASENAME##3 *= (DATA_TYPE)SCALE;
3359
3360#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
3361    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
3362    BASENAME##4 *= (DATA_TYPE)SCALE;
3363
3364#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
3365    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
3366    BASENAME##5 *= (DATA_TYPE)SCALE;
3367
3368#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
3369    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
3370    BASENAME##6 *= (DATA_TYPE)SCALE;
3371
3372#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
3373    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
3374    BASENAME##7 *= (DATA_TYPE)SCALE;
3375
3376#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
3377    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
3378    BASENAME##8 *= (DATA_TYPE)SCALE;
3379
3380#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
3381    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
3382    BASENAME##9 *= (DATA_TYPE)SCALE;
3383
3384#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
3385    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
3386    BASENAME##A *= (DATA_TYPE)SCALE;
3387
3388#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
3389    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
3390    BASENAME##B *= (DATA_TYPE)SCALE;
3391
3392#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
3393    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
3394    BASENAME##C *= (DATA_TYPE)SCALE;
3395
3396#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
3397    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
3398    BASENAME##D *= (DATA_TYPE)SCALE;
3399
3400#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
3401    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
3402    BASENAME##E *= (DATA_TYPE)SCALE;
3403
3404#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
3405    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
3406    BASENAME##F *= (DATA_TYPE)SCALE;
3407
3408
3409
3410#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
3411#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412
3413
3414
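// The COLUMN_VECTOR*/COLUMN_VECTOR_SCALAR*/TRANSPOSE_K0XN0 helpers below transpose a K0 x N0 block held
// as K0 row vectors (X##0..X##{K0-1}, each N0 wide) into N0 column vectors (BASENAME##0..BASENAME##{N0-1},
// each K0 wide). Roughly, TRANSPOSE_K0XN0(4, 2, res, a, float) expands to:
//   float4 res0 = (float4)(a0.s0, a1.s0, a2.s0, a3.s0);
//   float4 res1 = (float4)(a0.s1, a1.s1, a2.s1, a3.s1);
// (res/a are illustrative names.) COLUMN_VECTOR_SCALAR* covers N0 = 1, where each X##i is a scalar
// and has no .sX components to select.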
3415#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
3416    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
3417#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
3418    VEC_DATA_TYPE(TYPE, 2)                         \
3419    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
3420#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
3421    VEC_DATA_TYPE(TYPE, 3)                         \
3422    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
3423#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
3424    VEC_DATA_TYPE(TYPE, 4)                         \
3425    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
3426#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
3427    VEC_DATA_TYPE(TYPE, 8)                         \
3428    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
3429#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
3430    VEC_DATA_TYPE(TYPE, 16)                         \
3431    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3432
3433
3434
3435#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
3436    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
3437#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
3438    VEC_DATA_TYPE(TYPE, 2)                                \
3439    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
3440#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
3441    VEC_DATA_TYPE(TYPE, 3)                                \
3442    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
3443#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
3444    VEC_DATA_TYPE(TYPE, 4)                                \
3445    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
3446#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
3447    VEC_DATA_TYPE(TYPE, 8)                                \
3448    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
3449#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
3450    VEC_DATA_TYPE(TYPE, 16)                                \
3451    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
3452
3453
3454
3455#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
3456    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
3457#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
3458    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
3459    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
3460#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
3461    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
3462    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
3463#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
3464    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
3465    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
3466#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
3467    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
3468    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
3469    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
3470    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
3471    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
3472#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
3473    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
3474    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
3475    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
3476    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
3477    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
3478    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
3479    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
3480    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
3481    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482
3483
3484
3485
3486#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
3487    CONCAT(COLUMN_VECTOR, K0)                          \
3488    (IDX_COL, BASENAME, BS, TYPE);
3489
3490
3491#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
3492    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
3493    (IDX_COL, BASENAME, BS, TYPE);
3494
3495
3496#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
3497    CONCAT(TRANSPOSE_K0X, N0)                       \
3498    (K0, BASENAME, BS, TYPE);
3499
3500
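// ADD_ROW_n/ADD_BLOCK add a bias block row by row: BASENAME##i += BIAS##i for each of the N rows.
// ADD_BLOCK_BROADCAST further down is the variant that reuses a single bias vector for every row.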
3501#define ADD_ROW_1(BASENAME, BIAS) \
3502    BASENAME##0 += BIAS##0;
3503
3504#define ADD_ROW_2(BASENAME, BIAS) \
3505    ADD_ROW_1(BASENAME, BIAS)     \
3506    BASENAME##1 += BIAS##1;
3507
3508#define ADD_ROW_3(BASENAME, BIAS) \
3509    ADD_ROW_2(BASENAME, BIAS)     \
3510    BASENAME##2 += BIAS##2;
3511
3512#define ADD_ROW_4(BASENAME, BIAS) \
3513    ADD_ROW_3(BASENAME, BIAS)     \
3514    BASENAME##3 += BIAS##3;
3515
3516#define ADD_ROW_5(BASENAME, BIAS) \
3517    ADD_ROW_4(BASENAME, BIAS)     \
3518    BASENAME##4 += BIAS##4;
3519
3520#define ADD_ROW_6(BASENAME, BIAS) \
3521    ADD_ROW_5(BASENAME, BIAS)     \
3522    BASENAME##5 += BIAS##5;
3523
3524#define ADD_ROW_7(BASENAME, BIAS) \
3525    ADD_ROW_6(BASENAME, BIAS)     \
3526    BASENAME##6 += BIAS##6;
3527
3528#define ADD_ROW_8(BASENAME, BIAS) \
3529    ADD_ROW_7(BASENAME, BIAS)     \
3530    BASENAME##7 += BIAS##7;
3531
3532#define ADD_ROW_9(BASENAME, BIAS) \
3533    ADD_ROW_8(BASENAME, BIAS)     \
3534    BASENAME##8 += BIAS##8;
3535
3536#define ADD_ROW_10(BASENAME, BIAS) \
3537    ADD_ROW_9(BASENAME, BIAS)      \
3538    BASENAME##9 += BIAS##9;
3539
3540#define ADD_ROW_11(BASENAME, BIAS) \
3541    ADD_ROW_10(BASENAME, BIAS)     \
3542    BASENAME##A += BIAS##A;
3543
3544#define ADD_ROW_12(BASENAME, BIAS) \
3545    ADD_ROW_11(BASENAME, BIAS)     \
3546    BASENAME##B += BIAS##B;
3547
3548#define ADD_ROW_13(BASENAME, BIAS) \
3549    ADD_ROW_12(BASENAME, BIAS)     \
3550    BASENAME##C += BIAS##C;
3551
3552#define ADD_ROW_14(BASENAME, BIAS) \
3553    ADD_ROW_13(BASENAME, BIAS)     \
3554    BASENAME##D += BIAS##D;
3555
3556#define ADD_ROW_15(BASENAME, BIAS) \
3557    ADD_ROW_14(BASENAME, BIAS)     \
3558    BASENAME##E += BIAS##E;
3559
3560#define ADD_ROW_16(BASENAME, BIAS) \
3561    ADD_ROW_15(BASENAME, BIAS)     \
3562    BASENAME##F += BIAS##F;
3563
3564
3565
3566
3567#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
3568#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569
3570
3571
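// ADD_ROW_BROADCAST_n/ADD_BLOCK_BROADCAST add the same bias vector to every row
// (BASENAME##i += BIAS), i.e. a single 1 x N0 bias broadcast over all M0 rows.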
3572#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
3573    BASENAME##0 += BIAS;
3574
3575#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
3576    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
3577    BASENAME##1 += BIAS;
3578
3579#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
3580    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
3581    BASENAME##2 += BIAS;
3582
3583#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
3584    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
3585    BASENAME##3 += BIAS;
3586
3587#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
3588    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
3589    BASENAME##4 += BIAS;
3590
3591#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
3592    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
3593    BASENAME##5 += BIAS;
3594
3595#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
3596    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
3597    BASENAME##6 += BIAS;
3598
3599#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
3600    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
3601    BASENAME##7 += BIAS;
3602
3603#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
3604    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
3605    BASENAME##8 += BIAS;
3606
3607#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
3608    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
3609    BASENAME##9 += BIAS;
3610
3611#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
3612    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
3613    BASENAME##A += BIAS;
3614
3615#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
3616    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
3617    BASENAME##B += BIAS;
3618
3619#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
3620    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
3621    BASENAME##C += BIAS;
3622
3623#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
3624    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
3625    BASENAME##D += BIAS;
3626
3627#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
3628    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
3629    BASENAME##E += BIAS;
3630
3631#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
3632    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
3633    BASENAME##F += BIAS;
3634
3635
3636#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
3637#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638
3639
3640
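// ACTIVATION_ROW_n/ACTIVATION_BLOCK apply ACTIVATION() to each of the N row vectors in place,
// with A_VAL/B_VAL as the activation parameters, e.g.
//   BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);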
3641#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3642    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);
3643
3644#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3645    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3646    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);
3647
3648#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3649    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3650    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);
3651
3652#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3653    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3654    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);
3655
3656#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3657    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3658    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);
3659
3660#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3661    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3662    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);
3663
3664#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3665    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3666    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);
3667
3668#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3669    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3670    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);
3671
3672#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3673    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3674    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);
3675
3676#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3677    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
3678    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
3679
3680#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3681    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3682    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);
3683
3684#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3685    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3686    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);
3687
3688#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3689    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3690    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);
3691
3692#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3693    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3694    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);
3695
3696#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3697    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3698    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);
3699
3700#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
3701    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
3702    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
3703
3704
3705
3706#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3707#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708
3709
3710
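// CONVERT_ROW_n/CONVERT_BLOCK declare M new vectors of width N and fill them with
// DST##i = CONVERT(SRC##i, VEC_DATA_TYPE(DATA_TYPE, N)), i.e. a plain (non-saturating) type
// conversion of every row; compare CONVERT_STORE_BLOCK further down, which converts with
// saturation while storing.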
3711#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3712    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3713    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
3714
3715#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3716    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3717    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3718    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
3719
3720#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3721    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3722    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3723    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
3724
3725#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3726    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3727    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3728    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
3729
3730#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3731    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3732    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3733    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
3734
3735#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3736    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3737    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3738    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
3739
3740#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3741    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3742    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3743    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
3744
3745#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3746    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3747    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3748    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
3749
3750#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3751    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3752    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
3753    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
3754
3755#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3756    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
3757    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3758    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
3759
3760#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3761    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3762    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3763    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
3764
3765#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3766    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3767    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3768    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
3769
3770#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3771    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3772    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3773    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
3774
3775#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3776    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3777    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3778    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
3779
3780#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3781    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3782    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3783    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
3784
3785#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
3786    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
3787    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
3788    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
3789
3790
3791
3792#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3793#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794
3795
3796#ifndef ARM_COMPUTE_REPEAT_H
3797#define ARM_COMPUTE_REPEAT_H
3798
3799
3800#ifndef ARM_COMPUTE_HELPER_H
3801#define ARM_COMPUTE_HELPER_H
3802
3803
3804
3805
3806#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3807    VSTORE(N0)                                                 \
3808    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3809
3810#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3811    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3812    VSTORE(N0)                                                 \
3813    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3814
3815#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3816    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3817    VSTORE(N0)                                                 \
3818    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3819
3820#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3821    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3822    VSTORE(N0)                                                 \
3823    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3824
3825#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3826    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3827    VSTORE(N0)                                                 \
3828    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3829
3830#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3831    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3832    VSTORE(N0)                                                 \
3833    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3834
3835#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3836    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3837    VSTORE(N0)                                                 \
3838    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3839
3840#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3841    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3842    VSTORE(N0)                                                 \
3843    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3844
3845#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3846    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3847    VSTORE(N0)                                                 \
3848    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3849
3850#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3851    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
3852    VSTORE(N0)                                                  \
3853    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3854
3855#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3856    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3857    VSTORE(N0)                                                  \
3858    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
3859
3860#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3861    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3862    VSTORE(N0)                                                  \
3863    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
3864
3865#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3866    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3867    VSTORE(N0)                                                  \
3868    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
3869
3870#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3871    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3872    VSTORE(N0)                                                  \
3873    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3874
3875#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3876    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3877    VSTORE(N0)                                                  \
3878    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3879
3880#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3881    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3882    VSTORE(N0)                                                  \
3883    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3884
3885
3886
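// CONVERT_STORE_ROW_n/CONVERT_STORE_BLOCK behave like STORE_ROW_n/STORE_BLOCK but pass each row
// through CONVERT_SAT() first, so values are saturating-cast to DATA_TYPE before the vstore
// (useful when the accumulators are wider than the output type, e.g. int accumulators stored as uchar).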
3887#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3888    VSTORE(N0)                                                         \
3889    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3890
3891#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3892    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3893    VSTORE(N0)                                                         \
3894    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3895
3896#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3897    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3898    VSTORE(N0)                                                         \
3899    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3900
3901#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3902    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3903    VSTORE(N0)                                                         \
3904    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3905
3906#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3907    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3908    VSTORE(N0)                                                         \
3909    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
3910
3911#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3912    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3913    VSTORE(N0)                                                         \
3914    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
3915
3916#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3917    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3918    VSTORE(N0)                                                         \
3919    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
3920
3921#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3922    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3923    VSTORE(N0)                                                         \
3924    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
3925
3926#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3927    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3928    VSTORE(N0)                                                         \
3929    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3930
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3935
3936#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3937    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3938    VSTORE(N0)                                                          \
3939    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
3940
3941#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3942    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3943    VSTORE(N0)                                                          \
3944    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
3945
3946#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3947    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3948    VSTORE(N0)                                                          \
3949    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
3950
3951#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3952    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3953    VSTORE(N0)                                                          \
3954    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3955
3956#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3957    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3958    VSTORE(N0)                                                          \
3959    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3960
3961#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3962    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3963    VSTORE(N0)                                                          \
3964    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3965
3966
3967
3968
3969#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3970#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3971
3972
3973
3974#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3975#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3976
3977
3978
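// STORE_ROW_PARTIAL_n/STORE_BLOCK_PARTIAL use VSTORE_PARTIAL(N0, STORE_N0) so that only the first
// STORE_N0 elements of each N0-wide row are written, and only STORE_M0 rows are emitted. They are
// the building blocks for the boundary-aware stores below, which handle leftover rows/columns at
// the right and bottom edges of the output block.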
3979#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3980    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3981    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3982
3983#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3984    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3985    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3986    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3987
3988#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3989    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3990    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3991    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3992
3993#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3994    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3995    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3996    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3997
3998#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3999    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4000    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4001    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
4002
4003#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4004    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4005    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4006    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
4007
4008#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4009    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4010    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4011    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
4012
4013#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4014    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4015    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4016    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
4017
4018#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4019    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4020    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4021    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
4022
4023#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4024    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
4025    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4026    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
4027
4028#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4029    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4030    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4031    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
4032
4033#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4034    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4035    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4036    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
4037
4038#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4039    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4040    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4041    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
4042
4043#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4044    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4045    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4046    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
4047
4048#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4049    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4050    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4051    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
4052
4053#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4054    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4055    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4056    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
4057
4058
4059
4060#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4061#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4062
4063#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4064    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
4065    {                                                                                                                                                     \
4066        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
4067    }                                                                                                                                                     \
4068    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
4069    {                                                                                                                                                     \
4070        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4071    }                                                                                                                                                     \
4072    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
4073    {                                                                                                                                                     \
4074        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4075    }                                                                                                                                                     \
4076    else                                                                                                                                                  \
4077    {                                                                                                                                                     \
4078        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
4079    }
4080
4081#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
4082    if(!(PARTIAL_COND_X))                                                                                         \
4083    {                                                                                                             \
4084        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4085    }                                                                                                             \
4086    else                                                                                                          \
4087    {                                                                                                             \
4088        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4089    }
4090
4091#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
4092    if(!(PARTIAL_COND_Y))                                                                                         \
4093    {                                                                                                             \
4094        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4095    }                                                                                                             \
4096    else                                                                                                          \
4097    {                                                                                                             \
4098        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4099    }
4100
4101
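// When PARTIAL_STORE_M0/PARTIAL_STORE_N0 are provided as build options, STORE_BLOCK_BOUNDARY_AWARE
// is mapped at compile time to the cheapest correct variant: a full STORE_BLOCK when both leftovers
// are 0, STORE_BLOCK_PARTIAL_IN_Y or STORE_BLOCK_PARTIAL_IN_X when only M or only N has a leftover,
// and STORE_BLOCK_PARTIAL_IN_X_AND_Y when both do. The PARTIAL_COND_Y/PARTIAL_COND_X arguments then
// select the partial path at run time only for the work-items that actually touch the tensor edge.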
4102#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
4103
4104
4105#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
4106
4107#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4108    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4109
4110#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
4111
4112#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4113    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
4114
4115#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
4116
4117#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4118    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
4119
4120#else
4121
4122#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4123    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
4124
4125#endif
4126
4127#endif
4128
4129
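// COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) maps a work-item index y to its first output row.
// When M is not a multiple of M0, every block after the first is shifted up by
// (M0 - PARTIAL_STORE_M0) % M0, so the first block is the only partial one and the last block ends
// exactly at the tensor boundary; max(0, ...) keeps the first block anchored at row 0.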
4130#if defined(PARTIAL_STORE_M0)
4131
4132#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
4133    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
4134#else
4135#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
4136    ((uint)(y * M0))
4137#endif
4138
4139
4140
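// STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) is the single-row
// convenience wrapper: it forwards to STORE_BLOCK_PARTIAL_IN_X with M0 = 1 and a zero Y stride and
// Z offset, storing either all vec_size elements or only `leftover` of them depending on `cond`.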
4141#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
4142    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
4143
4144
4145#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4146#pragma OPENCL EXTENSION cl_khr_fp16 : enable
4147#endif
4148
4149#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
4150#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
4151#endif
4152
4153#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
4154#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
4155#endif
4156
4157#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
4158#pragma OPENCL EXTENSION cl_arm_printf : enable
4159#endif
4160
4161#define GPU_ARCH_MIDGARD 0x100
4162#define GPU_ARCH_BIFROST 0x200
4163#define GPU_ARCH_VALHALL 0x300
4164
4165
4166#define CONCAT(a, b) a##b
4167
4168
4169#define EXPAND(x) x
4170
4171
4172#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
4173
4174
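// REVERSE(x, s) and ROTATE(x, s, n) are swizzle helpers for OpenCL vectors of size s: REVs reverses
// the component order and ROTs_n rotates the components right by n positions (ROTs_0 and ROTs_s are
// the identity). Both are built purely from .sXYZ component selectors, so no memory traffic is involved.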
4175#define REV1(x) ((x))
4176#define REV2(x) ((x).s10)
4177#define REV3(x) ((x).s210)
4178#define REV4(x) ((x).s3210)
4179#define REV8(x) ((x).s76543210)
4180#define REV16(x) ((x).sFEDCBA9876543210)
4181
4182
4183
4184#define REVERSE_STR(x, s) REV##s((x))
4185#define REVERSE(x, s) REVERSE_STR(x, s)
4186
4187
4188
4189#define ROT1_0(x) ((x))
4190#define ROT1_1(x) ((x))
4191
4192#define ROT2_0(x) ((x))
4193#define ROT2_1(x) ((x).s10)
4194#define ROT2_2(x) ((x))
4195
4196#define ROT3_0(x) ((x))
4197#define ROT3_1(x) ((x).s201)
4198#define ROT3_2(x) ((x).s120)
4199#define ROT3_3(x) ((x))
4200
4201#define ROT4_0(x) ((x))
4202#define ROT4_1(x) ((x).s3012)
4203#define ROT4_2(x) ((x).s2301)
4204#define ROT4_3(x) ((x).s1230)
4205#define ROT4_4(x) ((x))
4206
4207#define ROT8_0(x) ((x))
4208#define ROT8_1(x) ((x).s70123456)
4209#define ROT8_2(x) ((x).s67012345)
4210#define ROT8_3(x) ((x).s56701234)
4211#define ROT8_4(x) ((x).s45670123)
4212#define ROT8_5(x) ((x).s34567012)
4213#define ROT8_6(x) ((x).s23456701)
4214#define ROT8_7(x) ((x).s12345670)
4215#define ROT8_8(x) ((x))
4216
4217#define ROT16_0(x) ((x))
4218#define ROT16_1(x) ((x).sF0123456789ABCDE)
4219#define ROT16_2(x) ((x).sEF0123456789ABCD)
4220#define ROT16_3(x) ((x).sDEF0123456789ABC)
4221#define ROT16_4(x) ((x).sCDEF0123456789AB)
4222#define ROT16_5(x) ((x).sBCDEF0123456789A)
4223#define ROT16_6(x) ((x).sABCDEF0123456789)
4224#define ROT16_7(x) ((x).s9ABCDEF012345678)
4225#define ROT16_8(x) ((x).s89ABCDEF01234567)
4226#define ROT16_9(x) ((x).s789ABCDEF0123456)
4227#define ROT16_10(x) ((x).s6789ABCDEF012345)
4228#define ROT16_11(x) ((x).s56789ABCDEF01234)
4229#define ROT16_12(x) ((x).s456789ABCDEF0123)
4230#define ROT16_13(x) ((x).s3456789ABCDEF012)
4231#define ROT16_14(x) ((x).s23456789ABCDEF01)
4232#define ROT16_15(x) ((x).s123456789ABCDEF0)
4233#define ROT16_16(x) ((x))
4234
4235
4236
4237#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
4238#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
4239
4240
4241
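// VEC_OFFS(dt, s) builds the constant vector (0, 1, ..., s-1) of type dt##s,
// e.g. VEC_OFFS(int, 4) -> (int4)(0, 1, 2, 3); handy for turning a scalar base index into a vector
// of consecutive offsets.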
4242#define V_OFFS1(dt) (dt##1)(0)
4243#define V_OFFS2(dt) (dt##2)(0, 1)
4244#define V_OFFS3(dt) (dt##3)(0, 1, 2)
4245#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
4246#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
4247#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
4248
4249
4250
4251#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
4252#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
4253
4254
4255#define VLOAD_STR(size) vload##size
4256#define VLOAD(size) VLOAD_STR(size)
4257
4258
4259#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
4260#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
4261
4262#define NO_LOAD(data, offs, ptr) \
4263    {                            \
4264    }
4265
4266
4267#define vload_partial_1_0 NO_LOAD
4268#define vload_partial_1_1 vload1
4269#define vload_partial_1_2 NO_LOAD
4270#define vload_partial_1_3 NO_LOAD
4271#define vload_partial_1_4 NO_LOAD
4272#define vload_partial_1_5 NO_LOAD
4273#define vload_partial_1_6 NO_LOAD
4274#define vload_partial_1_7 NO_LOAD
4275#define vload_partial_1_8 NO_LOAD
4276#define vload_partial_1_9 NO_LOAD
4277#define vload_partial_1_10 NO_LOAD
4278#define vload_partial_1_11 NO_LOAD
4279#define vload_partial_1_12 NO_LOAD
4280#define vload_partial_1_13 NO_LOAD
4281#define vload_partial_1_14 NO_LOAD
4282#define vload_partial_1_15 NO_LOAD
4283#define vload_partial_1_16 NO_LOAD
4284
4285#define vload_partial_2_0 NO_LOAD
4286#define vload_partial_2_1 vload_partial_1
4287#define vload_partial_2_2 vload_partial_2
4288#define vload_partial_2_3 NO_LOAD
4289#define vload_partial_2_4 NO_LOAD
4290#define vload_partial_2_5 NO_LOAD
4291#define vload_partial_2_6 NO_LOAD
4292#define vload_partial_2_7 NO_LOAD
4293#define vload_partial_2_8 NO_LOAD
4294#define vload_partial_2_9 NO_LOAD
4295#define vload_partial_2_10 NO_LOAD
4296#define vload_partial_2_11 NO_LOAD
4297#define vload_partial_2_12 NO_LOAD
4298#define vload_partial_2_13 NO_LOAD
4299#define vload_partial_2_14 NO_LOAD
4300#define vload_partial_2_15 NO_LOAD
4301#define vload_partial_2_16 NO_LOAD
4302
4303#define vload_partial_3_0 NO_LOAD
4304#define vload_partial_3_1 vload_partial_1
4305#define vload_partial_3_2 vload_partial_2
4306#define vload_partial_3_3 vload_partial_3
4307#define vload_partial_3_4 NO_LOAD
4308#define vload_partial_3_5 NO_LOAD
4309#define vload_partial_3_6 NO_LOAD
4310#define vload_partial_3_7 NO_LOAD
4311#define vload_partial_3_8 NO_LOAD
4312#define vload_partial_3_9 NO_LOAD
4313#define vload_partial_3_10 NO_LOAD
4314#define vload_partial_3_11 NO_LOAD
4315#define vload_partial_3_12 NO_LOAD
4316#define vload_partial_3_13 NO_LOAD
4317#define vload_partial_3_14 NO_LOAD
4318#define vload_partial_3_15 NO_LOAD
4319#define vload_partial_3_16 NO_LOAD
4320
4321#define vload_partial_4_0 NO_LOAD
4322#define vload_partial_4_1 vload_partial_1
4323#define vload_partial_4_2 vload_partial_2
4324#define vload_partial_4_3 vload_partial_3
4325#define vload_partial_4_4 vload_partial_4
4326#define vload_partial_4_5 NO_LOAD
4327#define vload_partial_4_6 NO_LOAD
4328#define vload_partial_4_7 NO_LOAD
4329#define vload_partial_4_8 NO_LOAD
4330#define vload_partial_4_9 NO_LOAD
4331#define vload_partial_4_10 NO_LOAD
4332#define vload_partial_4_11 NO_LOAD
4333#define vload_partial_4_12 NO_LOAD
4334#define vload_partial_4_13 NO_LOAD
4335#define vload_partial_4_14 NO_LOAD
4336#define vload_partial_4_15 NO_LOAD
4337#define vload_partial_4_16 NO_LOAD
4338
4339#define vload_partial_8_0 NO_LOAD
4340#define vload_partial_8_1 vload_partial_1
4341#define vload_partial_8_2 vload_partial_2
4342#define vload_partial_8_3 vload_partial_3
4343#define vload_partial_8_4 vload_partial_4
4344#define vload_partial_8_5 vload_partial_5
4345#define vload_partial_8_6 vload_partial_6
4346#define vload_partial_8_7 vload_partial_7
4347#define vload_partial_8_8 vload_partial_8
4348#define vload_partial_8_9 NO_LOAD
4349#define vload_partial_8_10 NO_LOAD
4350#define vload_partial_8_11 NO_LOAD
4351#define vload_partial_8_12 NO_LOAD
4352#define vload_partial_8_13 NO_LOAD
4353#define vload_partial_8_14 NO_LOAD
4354#define vload_partial_8_15 NO_LOAD
4355#define vload_partial_8_16 NO_LOAD
4356
4357#define vload_partial_16_0 NO_LOAD
4358#define vload_partial_16_1 vload_partial_1
4359#define vload_partial_16_2 vload_partial_2
4360#define vload_partial_16_3 vload_partial_3
4361#define vload_partial_16_4 vload_partial_4
4362#define vload_partial_16_5 vload_partial_5
4363#define vload_partial_16_6 vload_partial_6
4364#define vload_partial_16_7 vload_partial_7
4365#define vload_partial_16_8 vload_partial_8
4366#define vload_partial_16_9 vload_partial_9
4367#define vload_partial_16_10 vload_partial_10
4368#define vload_partial_16_11 vload_partial_11
4369#define vload_partial_16_12 vload_partial_12
4370#define vload_partial_16_13 vload_partial_13
4371#define vload_partial_16_14 vload_partial_14
4372#define vload_partial_16_15 vload_partial_15
4373#define vload_partial_16_16 vload_partial_16
4374
4375
4376#define vload_partial_1(DATA, OFFSET, PTR) \
4377    DATA.s0 = vload1(OFFSET, PTR);
4378
4379#define vload_partial_2(DATA, OFFSET, PTR) \
4380    DATA.s01 = vload2(OFFSET, PTR);
4381
4382#define vload_partial_3(DATA, OFFSET, PTR) \
4383    DATA.s012 = vload3(OFFSET, PTR);
4384
4385#define vload_partial_4(DATA, OFFSET, PTR) \
4386    DATA.s0123 = vload4(OFFSET, PTR);
4387
4388#define vload_partial_5(DATA, OFFSET, PTR)    \
4389    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4390    DATA.s4 = vload1(OFFSET, PTR + 4);
4391
4392#define vload_partial_6(DATA, OFFSET, PTR)    \
4393    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4394    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
4395
4396#define vload_partial_7(DATA, OFFSET, PTR)    \
4397    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4398    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
4399
4400#define vload_partial_8(DATA, OFFSET, PTR) \
4401    DATA.s01234567 = vload8(OFFSET, PTR);
4402
4403#define vload_partial_9(DATA, OFFSET, PTR)        \
4404    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4405    DATA.s8 = vload1(OFFSET, PTR + 8);
4406
4407#define vload_partial_10(DATA, OFFSET, PTR)       \
4408    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4409    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
4410
4411#define vload_partial_11(DATA, OFFSET, PTR)       \
4412    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4413    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
4414
4415#define vload_partial_12(DATA, OFFSET, PTR)       \
4416    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4417    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
4418
4419#define vload_partial_13(DATA, OFFSET, PTR)       \
4420    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4421    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
4422
4423#define vload_partial_14(DATA, OFFSET, PTR)       \
4424    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4425    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
4426
4427#define vload_partial_15(DATA, OFFSET, PTR)       \
4428    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4429    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
4430
4431#define vload_partial_16(DATA, OFFSET, PTR) \
4432    DATA = vload16(OFFSET, PTR);
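// VLOAD_PARTIAL(size, load_size) dispatches through the tables above to the matching
// vload_partial_<load_size> implementation, loading only the first load_size lanes of a
// <size>-wide destination; unsupported combinations resolve to NO_LOAD (an empty statement).
// For example, VLOAD_PARTIAL(4, 3)(data, 0, ptr) expands to vload_partial_3(data, 0, ptr),
// i.e. data.s012 = vload3(0, ptr);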
4433
4434
4435
4436#define PIXEL_UNIT4 1
4437#define PIXEL_UNIT8 2
4438#define PIXEL_UNIT16 4
4439
4440
4441#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
4442#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
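// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT maps a vector size to the number of 4-channel texels it spans,
// since each image read below returns one float4/half4. For example,
// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(8) expands to PIXEL_UNIT8, i.e. 2 texels.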
4443
4444
4445#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
4446#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
4447#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
4448
4449#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4450#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
4451#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
4452#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
4453#endif
4454
4455#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
4456#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
4457#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4458
4459#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4460#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
4461#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
4462#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4463#endif
4464
4465
4466#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
4467#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
4468
4469
4470#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
4471#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
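// READ_IMAGE2D / WRITE_IMAGE2D select one of the helpers above by data type and pixel-unit count.
// For example, READ_IMAGE2D(float, 2, img, x, y) expands to read_image2d_floatx2(img, x, y),
// which packs two adjacent texels into a float8; the half variants exist only when cl_khr_fp16
// is enabled.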
4472
4473#define VSTORE_STR(size) vstore##size
4474#define VSTORE(size) VSTORE_STR(size)
4475
4476#define float1 float
4477#define half1 half
4478#define char1 char
4479#define uchar1 uchar
4480#define short1 short
4481#define ushort1 ushort
4482#define int1 int
4483#define uint1 uint
4484#define long1 long
4485#define ulong1 ulong
4486#define double1 double
4487
4488#define vload1(OFFSET, PTR) *(OFFSET + PTR)
4489#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
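// OpenCL only provides vloadn/vstoren built-ins for n >= 2, so the scalar cases are emulated with
// a plain pointer dereference: vload1(offs, p) reads *(p + offs) and vstore1(x, offs, p) writes
// *(p + offs) = x.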
4490
4491
4492#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
4493#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
4494
4495#define NO_STORE(data, offs, ptr) \
4496    {                             \
4497    }
4498
4499
4500#define vstore_partial_1_0 NO_STORE
4501#define vstore_partial_1_1 vstore1
4502#define vstore_partial_1_2 NO_STORE
4503#define vstore_partial_1_3 NO_STORE
4504#define vstore_partial_1_4 NO_STORE
4505#define vstore_partial_1_5 NO_STORE
4506#define vstore_partial_1_6 NO_STORE
4507#define vstore_partial_1_7 NO_STORE
4508#define vstore_partial_1_8 NO_STORE
4509#define vstore_partial_1_9 NO_STORE
4510#define vstore_partial_1_10 NO_STORE
4511#define vstore_partial_1_11 NO_STORE
4512#define vstore_partial_1_12 NO_STORE
4513#define vstore_partial_1_13 NO_STORE
4514#define vstore_partial_1_14 NO_STORE
4515#define vstore_partial_1_15 NO_STORE
4516#define vstore_partial_1_16 NO_STORE
4517
4518#define vstore_partial_2_0 NO_STORE
4519#define vstore_partial_2_1 vstore_partial_1
4520#define vstore_partial_2_2 vstore_partial_2
4521#define vstore_partial_2_3 NO_STORE
4522#define vstore_partial_2_4 NO_STORE
4523#define vstore_partial_2_5 NO_STORE
4524#define vstore_partial_2_6 NO_STORE
4525#define vstore_partial_2_7 NO_STORE
4526#define vstore_partial_2_8 NO_STORE
4527#define vstore_partial_2_9 NO_STORE
4528#define vstore_partial_2_10 NO_STORE
4529#define vstore_partial_2_11 NO_STORE
4530#define vstore_partial_2_12 NO_STORE
4531#define vstore_partial_2_13 NO_STORE
4532#define vstore_partial_2_14 NO_STORE
4533#define vstore_partial_2_15 NO_STORE
4534#define vstore_partial_2_16 NO_STORE
4535
4536#define vstore_partial_3_0 NO_STORE
4537#define vstore_partial_3_1 vstore_partial_1
4538#define vstore_partial_3_2 vstore_partial_2
4539#define vstore_partial_3_3 vstore_partial_3
4540#define vstore_partial_3_4 NO_STORE
4541#define vstore_partial_3_5 NO_STORE
4542#define vstore_partial_3_6 NO_STORE
4543#define vstore_partial_3_7 NO_STORE
4544#define vstore_partial_3_8 NO_STORE
4545#define vstore_partial_3_9 NO_STORE
4546#define vstore_partial_3_10 NO_STORE
4547#define vstore_partial_3_11 NO_STORE
4548#define vstore_partial_3_12 NO_STORE
4549#define vstore_partial_3_13 NO_STORE
4550#define vstore_partial_3_14 NO_STORE
4551#define vstore_partial_3_15 NO_STORE
4552#define vstore_partial_3_16 NO_STORE
4553
4554#define vstore_partial_4_0 NO_STORE
4555#define vstore_partial_4_1 vstore_partial_1
4556#define vstore_partial_4_2 vstore_partial_2
4557#define vstore_partial_4_3 vstore_partial_3
4558#define vstore_partial_4_4 vstore_partial_4
4559#define vstore_partial_4_5 NO_STORE
4560#define vstore_partial_4_6 NO_STORE
4561#define vstore_partial_4_7 NO_STORE
4562#define vstore_partial_4_8 NO_STORE
4563#define vstore_partial_4_9 NO_STORE
4564#define vstore_partial_4_10 NO_STORE
4565#define vstore_partial_4_11 NO_STORE
4566#define vstore_partial_4_12 NO_STORE
4567#define vstore_partial_4_13 NO_STORE
4568#define vstore_partial_4_14 NO_STORE
4569#define vstore_partial_4_15 NO_STORE
4570#define vstore_partial_4_16 NO_STORE
4571
4572#define vstore_partial_8_0 NO_STORE
4573#define vstore_partial_8_1 vstore_partial_1
4574#define vstore_partial_8_2 vstore_partial_2
4575#define vstore_partial_8_3 vstore_partial_3
4576#define vstore_partial_8_4 vstore_partial_4
4577#define vstore_partial_8_5 vstore_partial_5
4578#define vstore_partial_8_6 vstore_partial_6
4579#define vstore_partial_8_7 vstore_partial_7
4580#define vstore_partial_8_8 vstore_partial_8
4581#define vstore_partial_8_9 NO_STORE
4582#define vstore_partial_8_10 NO_STORE
4583#define vstore_partial_8_11 NO_STORE
4584#define vstore_partial_8_12 NO_STORE
4585#define vstore_partial_8_13 NO_STORE
4586#define vstore_partial_8_14 NO_STORE
4587#define vstore_partial_8_15 NO_STORE
4588#define vstore_partial_8_16 NO_STORE
4589
4590#define vstore_partial_16_0 NO_STORE
4591#define vstore_partial_16_1 vstore_partial_1
4592#define vstore_partial_16_2 vstore_partial_2
4593#define vstore_partial_16_3 vstore_partial_3
4594#define vstore_partial_16_4 vstore_partial_4
4595#define vstore_partial_16_5 vstore_partial_5
4596#define vstore_partial_16_6 vstore_partial_6
4597#define vstore_partial_16_7 vstore_partial_7
4598#define vstore_partial_16_8 vstore_partial_8
4599#define vstore_partial_16_9 vstore_partial_9
4600#define vstore_partial_16_10 vstore_partial_10
4601#define vstore_partial_16_11 vstore_partial_11
4602#define vstore_partial_16_12 vstore_partial_12
4603#define vstore_partial_16_13 vstore_partial_13
4604#define vstore_partial_16_14 vstore_partial_14
4605#define vstore_partial_16_15 vstore_partial_15
4606#define vstore_partial_16_16 vstore_partial_16
4607
4608
4609#define vstore_partial_1(DATA, OFFSET, PTR) \
4610    vstore1(DATA.s0, OFFSET, PTR);
4611
4612#define vstore_partial_2(DATA, OFFSET, PTR) \
4613    vstore2(DATA.s01, OFFSET, PTR);
4614
4615#define vstore_partial_3(DATA, OFFSET, PTR) \
4616    vstore3(DATA.s012, OFFSET, PTR);
4617
4618#define vstore_partial_4(DATA, OFFSET, PTR) \
4619    vstore4(DATA.s0123, OFFSET, PTR);
4620
4621#define vstore_partial_5(DATA, OFFSET, PTR)    \
4622    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4623    vstore1(DATA.s4, OFFSET, PTR + 4);
4624
4625#define vstore_partial_6(DATA, OFFSET, PTR)    \
4626    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4627    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
4628
4629#define vstore_partial_7(DATA, OFFSET, PTR)    \
4630    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4631    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
4632
4633#define vstore_partial_8(DATA, OFFSET, PTR) \
4634    vstore8(DATA.s01234567, OFFSET, PTR);
4635
4636#define vstore_partial_9(DATA, OFFSET, PTR)        \
4637    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4638    vstore1(DATA.s8, OFFSET, PTR + 8);
4639
4640#define vstore_partial_10(DATA, OFFSET, PTR)       \
4641    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4642    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
4643
4644#define vstore_partial_11(DATA, OFFSET, PTR)       \
4645    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4646    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
4647
4648#define vstore_partial_12(DATA, OFFSET, PTR)       \
4649    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4650    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
4651
4652#define vstore_partial_13(DATA, OFFSET, PTR)       \
4653    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4654    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
4655
4656#define vstore_partial_14(DATA, OFFSET, PTR)       \
4657    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4658    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
4659
4660#define vstore_partial_15(DATA, OFFSET, PTR)       \
4661    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4662    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
4663
4664#define vstore_partial_16(DATA, OFFSET, PTR) \
4665    vstore16(DATA, OFFSET, PTR);
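// VSTORE_PARTIAL mirrors VLOAD_PARTIAL for stores: VSTORE_PARTIAL(size, store_size) resolves to
// vstore_partial_<store_size>, which writes only the first store_size lanes (larger counts are
// composed from vstore8/vstore4 plus a smaller tail store). For example,
// VSTORE_PARTIAL(8, 5)(data, 0, ptr) stores data.s0123 with vstore4 and then data.s4 as a scalar
// at ptr + 4.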
4666
4667
4668
4669
4670
4671#define convert_float_sat convert_float
4672#define convert_float1_sat convert_float
4673#define convert_float2_sat convert_float2
4674#define convert_float3_sat convert_float3
4675#define convert_float4_sat convert_float4
4676#define convert_float8_sat convert_float8
4677#define convert_float16_sat convert_float16
4678#define convert_half_sat convert_float
4679#define convert_half1_sat convert_half
4680#define convert_half2_sat convert_half2
4681#define convert_half3_sat convert_half3
4682#define convert_half4_sat convert_half4
4683#define convert_half8_sat convert_half8
4684#define convert_half16_sat convert_half16
4685
4686#define convert_float1 convert_float
4687#define convert_half1 convert_half
4688#define convert_char1 convert_char
4689#define convert_uchar1 convert_uchar
4690#define convert_short1 convert_short
4691#define convert_ushort1 convert_ushort
4692#define convert_int1 convert_int
4693#define convert_uint1 convert_uint
4694#define convert_long1 convert_long
4695#define convert_ulong1 convert_ulong
4696#define convert_double1 convert_double
4697
4698#define convert_char1_sat convert_char_sat
4699#define convert_uchar1_sat convert_uchar_sat
4700#define convert_uchar2_sat convert_uchar2_sat
4701#define convert_uchar3_sat convert_uchar3_sat
4702#define convert_uchar4_sat convert_uchar4_sat
4703#define convert_uchar8_sat convert_uchar8_sat
4704#define convert_uchar16_sat convert_uchar16_sat
4705#define convert_short1_sat convert_short_sat
4706#define convert_ushort1_sat convert_ushort_sat
4707#define convert_int1_sat convert_int_sat
4708#define convert_uint1_sat convert_uint_sat
4709#define convert_long1_sat convert_long_sat
4710#define convert_ulong1_sat convert_ulong_sat
4711#define convert_double1_sat convert_double_sat
4712
4713#define VEC_DATA_TYPE_STR(type, size) type##size
4714#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
4715
4716#define CONVERT_STR(x, type) (convert_##type((x)))
4717#define CONVERT(x, type) CONVERT_STR(x, type)
4718
4719#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
4720#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
4721
4722#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
4723#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
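// CONVERT, CONVERT_SAT and CONVERT_SAT_ROUND wrap the standard OpenCL convert_<type>[_sat][_round]
// built-ins so the destination type can itself be produced by other macros. For example,
// CONVERT_SAT(acc, uchar8) expands to (convert_uchar8_sat((acc))) and
// CONVERT_SAT_ROUND(acc, int4, rte) expands to (convert_int4_sat_rte((acc))).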
4724
4725#define select_vec_dt_uchar(size) uchar##size
4726#define select_vec_dt_char(size) char##size
4727#define select_vec_dt_ushort(size) ushort##size
4728#define select_vec_dt_short(size) short##size
4729#define select_vec_dt_half(size) short##size
4730#define select_vec_dt_uint(size) uint##size
4731#define select_vec_dt_int(size) int##size
4732#define select_vec_dt_float(size) int##size
4733#define select_vec_dt_ulong(size) ulong##size
4734#define select_vec_dt_long(size) long##size
4735
4736#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
4737#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
4738#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
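// select() in OpenCL needs an integer mask whose elements match the width of the operands, so
// SELECT_VEC_DATA_TYPE returns that companion type. For example, SELECT_VEC_DATA_TYPE(float, 4)
// expands to int4 and SELECT_VEC_DATA_TYPE(half, 8) to short8.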
4739
4740#define signed_int_vec_dt_uchar(size) char##size
4741#define signed_int_vec_dt_char(size) char##size
4742#define signed_int_vec_dt_ushort(size) short##size
4743#define signed_int_vec_dt_short(size) short##size
4744#define signed_int_vec_dt_half(size) short##size
4745#define signed_int_vec_dt_uint(size) int##size
4746#define signed_int_vec_dt_int(size) int##size
4747#define signed_int_vec_dt_float(size) int##size
4748#define signed_int_vec_dt_ulong(size) long##size
4749#define signed_int_vec_dt_long(size) long##size
4750
4751#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
4752#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
4753#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
4754
4755#define sum_reduce_1(x) (x)
4756#define sum_reduce_2(x) ((x).s0) + ((x).s1)
4757#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
4758#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
4759#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
4760#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
4761
4762#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
4763#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
4764
4765#define prod_reduce_1(x) (x)
4766#define prod_reduce_2(x) ((x).s0) * ((x).s1)
4767#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
4768#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
4769#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
4770#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
4771
4772#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
4773#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
4774
4775#define max_reduce_1(x) (x)
4776#define max_reduce_2(x) max(((x).s0), ((x).s1))
4777#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
4778#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
4779#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
4780#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
4781
4782#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
4783#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
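// SUM_REDUCE, PROD_REDUCE and MAX_REDUCE collapse a vector to a scalar with a tree of pairwise
// operations. For example, SUM_REDUCE(v, 4) on a float4 expands to
// (((v).s01).s0) + (((v).s01).s1) + (((v).s23).s0) + (((v).s23).s1), i.e. v.s0 + v.s1 + v.s2 + v.s3.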
4784
4785#define VECTOR_DECLARATION(name)     \
4786    __global uchar *name##_ptr,      \
4787    uint        name##_stride_x, \
4788    uint        name##_step_x,   \
4789    uint        name##_offset_first_element_in_bytes
4790
4791#define IMAGE_DECLARATION(name)      \
4792    __global uchar *name##_ptr,      \
4793    uint        name##_stride_x, \
4794    uint        name##_step_x,   \
4795    uint        name##_stride_y, \
4796    uint        name##_step_y,   \
4797    uint        name##_offset_first_element_in_bytes
4798
4799#define TENSOR3D_DECLARATION(name)   \
4800    __global uchar *name##_ptr,      \
4801    uint        name##_stride_x, \
4802    uint        name##_step_x,   \
4803    uint        name##_stride_y, \
4804    uint        name##_step_y,   \
4805    uint        name##_stride_z, \
4806    uint        name##_step_z,   \
4807    uint        name##_offset_first_element_in_bytes
4808
4809#define TENSOR4D_DECLARATION(name)   \
4810    __global uchar *name##_ptr,      \
4811    uint        name##_stride_x, \
4812    uint        name##_step_x,   \
4813    uint        name##_stride_y, \
4814    uint        name##_step_y,   \
4815    uint        name##_stride_z, \
4816    uint        name##_step_z,   \
4817    uint        name##_stride_w, \
4818    uint        name##_step_w,   \
4819    uint        name##_offset_first_element_in_bytes
4820
4821#define TENSOR5D_DECLARATION(name)   \
4822    __global uchar *name##_ptr,      \
4823    uint        name##_stride_x, \
4824    uint        name##_step_x,   \
4825    uint        name##_stride_y, \
4826    uint        name##_step_y,   \
4827    uint        name##_stride_z, \
4828    uint        name##_step_z,   \
4829    uint        name##_stride_w, \
4830    uint        name##_step_w,   \
4831    uint        name##_stride_v, \
4832    uint        name##_step_v,   \
4833    uint        name##_offset_first_element_in_bytes
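// The *_DECLARATION macros expand to the flat list of kernel arguments used for a tensor of the
// given rank: a byte pointer, a stride/step pair per dimension and the byte offset of the first
// element. For example, VECTOR_DECLARATION(src) declares src_ptr, src_stride_x, src_step_x and
// src_offset_first_element_in_bytes.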
4834
4835#define CONVERT_TO_VECTOR_STRUCT(name) \
4836    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
4837
4838#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
4839    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
4840
4841#define CONVERT_TO_IMAGE_STRUCT(name) \
4842    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
4843
4844#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
4845    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
4846
4847#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
4848    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
4849
4850#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
4851    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
4852
4856#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
4857    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4858                                 name##_stride_z, name##_step_z)
4859
4860#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
4861    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
4862
4863#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
4864    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4865                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
4866
4867#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
4868    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
4869
4870#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
4871    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4872                           name##_stride_z, name##_step_z)
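// The CONVERT_TO_*_STRUCT macros pair with the *_DECLARATION arguments above and call the
// update_*_workitem_ptr helpers below to build a Vector/Image/Tensor3D/Tensor4D view whose ptr is
// already advanced to this work item's block. The _NO_STEP variants pass 0 for the per-work-item
// steps (except that the tensor3D-to-image variant keeps step_z), and _NO_UPDATE_PTR keeps the raw
// base pointer.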
4873
4874
4875typedef struct Vector
4876{
4877    __global uchar *ptr;
4878    int             offset_first_element_in_bytes;
4879    int             stride_x;
4880} Vector;
4881
4882
4883typedef struct Image
4884{
4885    __global uchar *ptr;
4886    int             offset_first_element_in_bytes;
4887    int             stride_x;
4888    int             stride_y;
4889} Image;
4890
4891
4892typedef struct Tensor3D
4893{
4894    __global uchar *ptr;
4895    int             offset_first_element_in_bytes;
4896    int             stride_x;
4897    int             stride_y;
4898    int             stride_z;
4899} Tensor3D;
4900
4901
4902typedef struct Tensor4D
4903{
4904    __global uchar *ptr;
4905    int             offset_first_element_in_bytes;
4906    int             stride_x;
4907    int             stride_y;
4908    int             stride_z;
4909    int             stride_w;
4910} Tensor4D;
4911
4912
4913inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
4914{
4915    Vector vector =
4916    {
4917        .ptr                           = ptr,
4918        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4919        .stride_x                      = stride_x,
4920    };
4921    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
4922    return vector;
4923}
4924
4925
4926inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
4927{
4928    Image img =
4929    {
4930        .ptr                           = ptr,
4931        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4932        .stride_x                      = stride_x,
4933        .stride_y                      = stride_y
4934    };
4935    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
4936    return img;
4937}
4938
4939
4940inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4941{
4942    Image img =
4943    {
4944        .ptr                           = ptr,
4945        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4946        .stride_x                      = stride_x,
4947        .stride_y                      = stride_y
4948    };
4949    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4950    return img;
4951}
4952
4953
4954inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4955{
4956    Tensor3D tensor =
4957    {
4958        .ptr                           = ptr,
4959        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4960        .stride_x                      = stride_x,
4961        .stride_y                      = stride_y,
4962        .stride_z                      = stride_z
4963    };
4964    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4965    return tensor;
4966}
4967
4968
4969inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4970{
4971    Tensor3D tensor =
4972    {
4973        .ptr                           = ptr,
4974        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4975        .stride_x                      = stride_x,
4976        .stride_y                      = stride_y,
4977        .stride_z                      = stride_z
4978    };
4979    return tensor;
4980}
4981
4982inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
4983                                             uint step_w,
4984                                             uint mod_size)
4985{
4986    Tensor4D tensor =
4987    {
4988        .ptr                           = ptr,
4989        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4990        .stride_x                      = stride_x,
4991        .stride_y                      = stride_y,
4992        .stride_z                      = stride_z,
4993        .stride_w                      = stride_w
4994    };
4995
4996    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
4997    return tensor;
4998}
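// For the 4D case the third global id carries both z and w: the helper splits it with mod_size,
// using get_global_id(2) % mod_size as the z index and get_global_id(2) / mod_size as the w index
// when advancing the pointer.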
4999
5000
5001inline __global const uchar *vector_offset(const Vector *vec, int x)
5002{
5003    return vec->ptr + x * vec->stride_x;
5004}
5005
5006
5007inline __global uchar *offset(const Image *img, int x, int y)
5008{
5009    return img->ptr + x * img->stride_x + y * img->stride_y;
5010}
5011
5012
5013inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
5014{
5015    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
5016}
5017
5018
5019inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
5020{
5021    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
5022}
5023
5024
5025inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
5026{
5027    uint num_elements = width * height;
5028
5029    const uint z = index / num_elements;
5030
5031    index %= num_elements;
5032
5033    const uint y = index / width;
5034
5035    index %= width;
5036
5037    const uint x = index;
5038
5039    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
5040}
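// tensor3D_index2ptr turns a linear element index into x/y/z coordinates: z = index / (width * height),
// then y = remainder / width and x = remainder % width. For example, with width = 4 and height = 3,
// index 14 maps to (x, y, z) = (2, 0, 1).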
5041
5042#endif
5043
5044
5045
5046#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
5047#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
5048    P_X##_DEF(1, P_A, P_B, P_C);       \
5049    REPEAT_3_1(P_X, P_A, P_B, P_C)
5050#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
5051    P_X##_DEF(2, P_A, P_B, P_C);       \
5052    REPEAT_3_2(P_X, P_A, P_B, P_C)
5053#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
5054    P_X##_DEF(3, P_A, P_B, P_C);       \
5055    REPEAT_3_3(P_X, P_A, P_B, P_C)
5056#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
5057    P_X##_DEF(4, P_A, P_B, P_C);       \
5058    REPEAT_3_4(P_X, P_A, P_B, P_C)
5059#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
5060    P_X##_DEF(5, P_A, P_B, P_C);       \
5061    REPEAT_3_5(P_X, P_A, P_B, P_C)
5062#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
5063    P_X##_DEF(6, P_A, P_B, P_C);       \
5064    REPEAT_3_6(P_X, P_A, P_B, P_C)
5065#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
5066    P_X##_DEF(7, P_A, P_B, P_C);       \
5067    REPEAT_3_7(P_X, P_A, P_B, P_C)
5068#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
5069    P_X##_DEF(8, P_A, P_B, P_C);       \
5070    REPEAT_3_8(P_X, P_A, P_B, P_C)
5071#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
5072    P_X##_DEF(9, P_A, P_B, P_C);        \
5073    REPEAT_3_9(P_X, P_A, P_B, P_C)
5074#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
5075    P_X##_DEF(A, P_A, P_B, P_C);        \
5076    REPEAT_3_10(P_X, P_A, P_B, P_C)
5077#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
5078    P_X##_DEF(B, P_A, P_B, P_C);        \
5079    REPEAT_3_11(P_X, P_A, P_B, P_C)
5080#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
5081    P_X##_DEF(C, P_A, P_B, P_C);        \
5082    REPEAT_3_12(P_X, P_A, P_B, P_C)
5083#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
5084    P_X##_DEF(D, P_A, P_B, P_C);        \
5085    REPEAT_3_13(P_X, P_A, P_B, P_C)
5086#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
5087    P_X##_DEF(E, P_A, P_B, P_C);        \
5088    REPEAT_3_14(P_X, P_A, P_B, P_C)
5089#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
5090    P_X##_DEF(F, P_A, P_B, P_C);        \
5091    REPEAT_3_15(P_X, P_A, P_B, P_C)
5092
5093#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
5094#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
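// REPEAT_3_N(P_NUM, P_OP, ...) expands P_OP##_DEF once per index from P_NUM - 1 down to 0, passing
// the same three extra parameters each time; indices above 9 are emitted as the hex digits A..F so
// they can be pasted onto variable names.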
5095
5096
5097#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
5098#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
5099    P_X##_DEF(1, P_A, P_B, P_C, P_D);       \
5100    REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
5101#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
5102    P_X##_DEF(2, P_A, P_B, P_C, P_D);       \
5103    REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
5104#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
5105    P_X##_DEF(3, P_A, P_B, P_C, P_D);       \
5106    REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
5107#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
5108    P_X##_DEF(4, P_A, P_B, P_C, P_D);       \
5109    REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
5110#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
5111    P_X##_DEF(5, P_A, P_B, P_C, P_D);       \
5112    REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
5113#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
5114    P_X##_DEF(6, P_A, P_B, P_C, P_D);       \
5115    REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
5116#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
5117    P_X##_DEF(7, P_A, P_B, P_C, P_D);       \
5118    REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
5119#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
5120    P_X##_DEF(8, P_A, P_B, P_C, P_D);       \
5121    REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
5122#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
5123    P_X##_DEF(9, P_A, P_B, P_C, P_D);        \
5124    REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
5125#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
5126    P_X##_DEF(A, P_A, P_B, P_C, P_D);        \
5127    REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
5128#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
5129    P_X##_DEF(B, P_A, P_B, P_C, P_D);        \
5130    REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
5131#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
5132    P_X##_DEF(C, P_A, P_B, P_C, P_D);        \
5133    REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
5134#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
5135    P_X##_DEF(D, P_A, P_B, P_C, P_D);        \
5136    REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
5137#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
5138    P_X##_DEF(E, P_A, P_B, P_C, P_D);        \
5139    REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
5140#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
5141    P_X##_DEF(F, P_A, P_B, P_C, P_D);        \
5142    REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
5143
5144#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D)
5145#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
5146
5147
5148#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
5149#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
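// For example, REPEAT_VAR_INIT_TO_CONST(3, uint, zout, 0) expands (via REPEAT_3_3) to
//     uint zout2 = 0; uint zout1 = 0; uint zout0 = 0;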
5150
5151
5152#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
5153#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
5154
5155
5156#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
5157#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
5158
5159
5160#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
5161#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
5162
5163
5164#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
5165#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
5166
5167
5168#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
5169#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
5170
5171
5172#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
5173#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
5174
5175
5176#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
5177#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
5178
5179
5180#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
5181#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
5182
5183
5184#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
5185#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
5186
5187
5188#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
5189#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
5190
5191
5192#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                     \
5193    ({                                                                                                        \
5194        VEC_DATA_TYPE(int, N0)                                                                                \
5195        VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
5196        VEC_DATA_TYPE(int, N0)                                                                                \
5197        VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
5198        VAR##ID           = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0);                     \
5199    })
5200#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
5201
5202#endif
5203
5204#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
5205
5206#define CONCAT(a, b) a##b
5207
5208#define ARM_DOT1(a, b, c) \
5209    ({                    \
5210        c = fma(a, b, c); \
5211    })
5212#define ARM_DOT2(a, b, c)       \
5213    ({                          \
5214        c = fma(a.s0, b.s0, c); \
5215        c = fma(a.s1, b.s1, c); \
5216    })
5217#define ARM_DOT3(a, b, c)           \
5218    ({                              \
5219        ARM_DOT2(a, b, c);          \
5220        c = fma((a.s2), (b.s2), c); \
5221    })
5222#define ARM_DOT4(a, b, c)           \
5223    ({                              \
5224        ARM_DOT3(a, b, c);          \
5225        c = fma((a.s3), (b.s3), c); \
5226    })
5227#define ARM_DOT8(a, b, c)            \
5228    ({                               \
5229        ARM_DOT4((a.lo), (b.lo), c); \
5230        ARM_DOT4((a.hi), (b.hi), c); \
5231    })
5232#define ARM_DOT16(a, b, c)           \
5233    ({                               \
5234        ARM_DOT8((a.lo), (b.lo), c); \
5235        ARM_DOT8((a.hi), (b.hi), c); \
5236    })
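// ARM_DOT<K> accumulates the dot product of two K-element vectors into the scalar c using chained
// fma calls; the 8- and 16-wide forms recurse on the .lo/.hi halves. For example, ARM_DOT4(a, b, c)
// with float4 a, b performs c = fma(a.s0, b.s0, c); ... c = fma(a.s3, b.s3, c);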
5237
5238#if N0 == 2
5239#define ARM_DOT_K0XN0(k0, a, b, c) \
5240    ({                             \
5241        CONCAT(ARM_DOT, k0)        \
5242        ((a), (b##0), (c.s0));     \
5243        CONCAT(ARM_DOT, k0)        \
5244        ((a), (b##1), (c.s1));     \
5245    })
5246#elif N0 == 3
5247#define ARM_DOT_K0XN0(k0, a, b, c) \
5248    ({                             \
5249        CONCAT(ARM_DOT, k0)        \
5250        ((a), (b##0), (c.s0));     \
5251        CONCAT(ARM_DOT, k0)        \
5252        ((a), (b##1), (c.s1));     \
5253        CONCAT(ARM_DOT, k0)        \
5254        ((a), (b##2), (c.s2));     \
5255    })
5256#elif N0 == 4
5257#define ARM_DOT_K0XN0(k0, a, b, c) \
5258    ({                             \
5259        CONCAT(ARM_DOT, k0)        \
5260        ((a), (b##0), (c.s0));     \
5261        CONCAT(ARM_DOT, k0)        \
5262        ((a), (b##1), (c.s1));     \
5263        CONCAT(ARM_DOT, k0)        \
5264        ((a), (b##2), (c.s2));     \
5265        CONCAT(ARM_DOT, k0)        \
5266        ((a), (b##3), (c.s3));     \
5267    })
5268#elif N0 == 8
5269#define ARM_DOT_K0XN0(k0, a, b, c) \
5270    ({                             \
5271        CONCAT(ARM_DOT, k0)        \
5272        ((a), (b##0), (c.s0));     \
5273        CONCAT(ARM_DOT, k0)        \
5274        ((a), (b##1), (c.s1));     \
5275        CONCAT(ARM_DOT, k0)        \
5276        ((a), (b##2), (c.s2));     \
5277        CONCAT(ARM_DOT, k0)        \
5278        ((a), (b##3), (c.s3));     \
5279        CONCAT(ARM_DOT, k0)        \
5280        ((a), (b##4), (c.s4));     \
5281        CONCAT(ARM_DOT, k0)        \
5282        ((a), (b##5), (c.s5));     \
5283        CONCAT(ARM_DOT, k0)        \
5284        ((a), (b##6), (c.s6));     \
5285        CONCAT(ARM_DOT, k0)        \
5286        ((a), (b##7), (c.s7));     \
5287    })
5288#elif N0 == 16
5289#define ARM_DOT_K0XN0(k0, a, b, c) \
5290    ({                             \
5291        CONCAT(ARM_DOT, k0)        \
5292        ((a), (b##0), (c.s0));     \
5293        CONCAT(ARM_DOT, k0)        \
5294        ((a), (b##1), (c.s1));     \
5295        CONCAT(ARM_DOT, k0)        \
5296        ((a), (b##2), (c.s2));     \
5297        CONCAT(ARM_DOT, k0)        \
5298        ((a), (b##3), (c.s3));     \
5299        CONCAT(ARM_DOT, k0)        \
5300        ((a), (b##4), (c.s4));     \
5301        CONCAT(ARM_DOT, k0)        \
5302        ((a), (b##5), (c.s5));     \
5303        CONCAT(ARM_DOT, k0)        \
5304        ((a), (b##6), (c.s6));     \
5305        CONCAT(ARM_DOT, k0)        \
5306        ((a), (b##7), (c.s7));     \
5307        CONCAT(ARM_DOT, k0)        \
5308        ((a), (b##8), (c.s8));     \
5309        CONCAT(ARM_DOT, k0)        \
5310        ((a), (b##9), (c.s9));     \
5311        CONCAT(ARM_DOT, k0)        \
5312        ((a), (b##A), (c.sA));     \
5313        CONCAT(ARM_DOT, k0)        \
5314        ((a), (b##B), (c.sB));     \
5315        CONCAT(ARM_DOT, k0)        \
5316        ((a), (b##C), (c.sC));     \
5317        CONCAT(ARM_DOT, k0)        \
5318        ((a), (b##D), (c.sD));     \
5319        CONCAT(ARM_DOT, k0)        \
5320        ((a), (b##E), (c.sE));     \
5321        CONCAT(ARM_DOT, k0)        \
5322        ((a), (b##F), (c.sF));     \
5323    })
5324#else
5325#error "N0 value not supported"
5326#endif
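// ARM_DOT_K0XN0(k0, a, b, c) multiplies one K0-wide row of the LHS block (a) against the N0
// K0-wide vectors b0 ... b<N0-1> taken from the reshaped, transposed RHS block, accumulating one
// partial result per lane of c. With N0 == 2 and k0 == 4 it expands to
//     ARM_DOT4((a), (b0), (c.s0)); ARM_DOT4((a), (b1), (c.s1));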
5327
5328#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
5329
5330__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
5331                                          IMAGE_DECLARATION(rhs),
5332#if defined(BETA)
5333                                          IMAGE_DECLARATION(bias),
5334#endif
5335                                          IMAGE_DECLARATION(dst),
5336                                          uint lhs_stride_z,
5337                                          uint rhs_stride_z,
5338#if defined(BETA)
5339                                          uint bias_stride_z,
5340#endif
5341                                          uint dst_stride_z
5342#if defined(REINTERPRET_INPUT_AS_3D)
5343                                          ,
5344                                          uint lhs_cross_plane_pad
5345#endif
5346#if defined(REINTERPRET_OUTPUT_AS_3D)
5347                                          ,
5348                                          uint dst_cross_plane_pad
5349#endif
5350                                          ,
5351                                          const int M,
5352                                          const int N,
5353                                          const int K)
5354{
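    // Each work item of this kernel computes an M0 x N0 block of the destination matrix:
    // x selects the block of N0 columns, y the block of M0 rows and z the batch. K is traversed in
    // K0-wide steps below, followed by a scalar tail loop for the remaining K % K0 iterations.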
5355
5356#define RHS_BLOCK_SIZE ((K0) * (N0))
5357
5358
5359#if defined(RHS_INTERLEAVE)
5360#define RHS_OFFSET_X (K0)
5361#define RHS_STEP_X ((K0) * (H0))
5362#define RHS_STEP_LOOP (1)
5363#else
5364#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5365#define RHS_STEP_X (K0)
5366#define RHS_STEP_LOOP (H0)
5367#endif
5368
5369    uint x = get_global_id(0);
5370    uint y = get_global_id(1);
5371    uint z = get_global_id(2);
5372
5373    const bool cond_y = y == 0;
5374    const bool cond_x = ((x + 1) * N0 >= N);
5375
5376#if defined(DUMMY_WORK_ITEMS)
5377    if((x * N0 >= N) || (y * M0 >= M))
5378    {
5379        return;
5380    }
5381#endif
5382
5383
5384    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5385
5386
5387    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
5388
5389#if defined(MATRIX_B_DEPTH)
5390
5391    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
5392#else
5393    rhs_offset += z * rhs_stride_z;
5394#endif
5395
5396    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
5397    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
5398
5399#if defined(REINTERPRET_INPUT_AS_3D)
5400
5401    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
5402
5403
5404
5405    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
5406
5407#else
5408
5409
5410    lhs_offset += z * lhs_stride_z;
5411
5412#endif
5413
5414
5415    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
5416
5417    int i = 0;
5418    for(; i <= (K - K0); i += K0)
5419    {
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5431
5432
5433        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
5434
5435
5436        ARM_DOT_K0XN0(K0, a0, b, c0);
5437#if M0 > 1
5438        ARM_DOT_K0XN0(K0, a1, b, c1);
5439#endif
5440#if M0 > 2
5441        ARM_DOT_K0XN0(K0, a2, b, c2);
5442#endif
5443#if M0 > 3
5444        ARM_DOT_K0XN0(K0, a3, b, c3);
5445#endif
5446#if M0 > 4
5447        ARM_DOT_K0XN0(K0, a4, b, c4);
5448#endif
5449#if M0 > 5
5450        ARM_DOT_K0XN0(K0, a5, b, c5);
5451#endif
5452#if M0 > 6
5453        ARM_DOT_K0XN0(K0, a6, b, c6);
5454#endif
5455#if M0 > 7
5456        ARM_DOT_K0XN0(K0, a7, b, c7);
5457#endif
5458
5459        lhs_offset += K0 * sizeof(DATA_TYPE);
5460        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
5461    }
5462
5463
5464    for(; i < K; ++i)
5465    {
5466
5467        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5468
5469
5470        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
5471
5472
5473        ARM_DOT_K0XN0(1, a0, b, c0);
5474#if M0 > 1
5475        ARM_DOT_K0XN0(1, a1, b, c1);
5476#endif
5477#if M0 > 2
5478        ARM_DOT_K0XN0(1, a2, b, c2);
5479#endif
5480#if M0 > 3
5481        ARM_DOT_K0XN0(1, a3, b, c3);
5482#endif
5483#if M0 > 4
5484        ARM_DOT_K0XN0(1, a4, b, c4);
5485#endif
5486#if M0 > 5
5487        ARM_DOT_K0XN0(1, a5, b, c5);
5488#endif
5489#if M0 > 6
5490        ARM_DOT_K0XN0(1, a6, b, c6);
5491#endif
5492#if M0 > 7
5493        ARM_DOT_K0XN0(1, a7, b, c7);
5494#endif
5495
5496        lhs_offset += sizeof(DATA_TYPE);
5497        rhs_offset += sizeof(DATA_TYPE);
5498    }
5499
5500    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
5501
5502    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
5503
5504#if defined(REINTERPRET_OUTPUT_AS_3D)
5505
5506
5507    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
5508
5509
5510
5511    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
5512
5513#else
5514
5515
5516    dst_addr += z * dst_stride_z;
5517
5518#endif
5519
5520
5521#if defined(ALPHA)
5522    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
5523#endif
5524
5525
5526#if defined(BETA)
5527#if defined(BROADCAST_BIAS)
5528    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
5529
5530    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
5531
5532#ifndef UNIT_BETA
5533    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
5534#endif
5535
5536
5537    ADD_BLOCK_BROADCAST(M0, c, bias0);
5538
5539#else
5540    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
5541
5542    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5543
5544#ifndef UNIT_BETA
5545    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
5546#endif
5547
5548
5549    ADD_BLOCK(M0, c, bias);
5550
5551#endif
5552#endif
5553
5554#if defined(ACTIVATION_TYPE)
5555    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
5556#endif
5557
5558
5559    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5560
5561#undef RHS_BLOCK_SIZE
5562#undef RHS_OFFSET_X
5563#undef RHS_STEP_X
5564#undef RHS_STEP_LOOP
5565}
5566#endif
5567
5568#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
5569
5570__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
5571                                                  __read_only image2d_t rhs_img,
5572#if defined(BETA)
5573                                                  IMAGE_DECLARATION(bias),
5574#endif
5575                                                  IMAGE_DECLARATION(dst),
5576                                                  uint lhs_stride_z,
5577                                                  uint rhs_stride_z,
5578#if defined(BETA)
5579                                                  uint bias_stride_z,
5580#endif
5581                                                  uint dst_stride_z
5582#if defined(REINTERPRET_INPUT_AS_3D)
5583                                                  ,
5584                                                  uint lhs_cross_plane_pad
5585#endif
5586#if defined(REINTERPRET_OUTPUT_AS_3D)
5587                                                  ,
5588                                                  uint dst_cross_plane_pad
5589#endif
5590                                                  ,
5591                                                  const int M,
5592                                                  const int N,
5593                                                  const int K)
5594{
5595
5596#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
5597
5598    const uint LEFTOVER_K = K % K0;
5599
5600
5601#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
5602
5603
5604#if defined(RHS_INTERLEAVE)
5605#define RHS_OFFSET_X (PIXEL_UNIT)
5606#define RHS_STEP_X (PIXEL_UNIT * (H0))
5607#define RHS_STEP_LOOP (1)
5608#else
5609#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5610#define RHS_STEP_X PIXEL_UNIT
5611#define RHS_STEP_LOOP (H0)
5612#endif
5613
5614    uint x = get_global_id(0);
5615    uint y = get_global_id(1);
5616    uint z = get_global_id(2);
5617
5618    const bool cond_y = y == 0;
5619    const bool cond_x = ((x + 1) * N0 >= N);
5620
5621#if defined(DUMMY_WORK_ITEMS)
5622    if((x * N0 >= N) || (y * M0 >= M))
5623    {
5624        return;
5625    }
5626#endif
5627
5628
5629    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5630
5631#if defined(MATRIX_B_DEPTH)
5632
5633    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
5634#else
5635    const uint z_rhs = get_global_id(2);
5636#endif
5637
5638
5639    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
5640    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
5641
5642    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
5643    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
5644
5645#if defined(REINTERPRET_INPUT_AS_3D)
5646
5647    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
5648
5649
5650
5651    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
5652
5653#else
5654
5655
5656    lhs_offset += z * lhs_stride_z;
5657
5658#endif
5659
5660
5661    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
5662
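    // Main loop: process K in K0-wide steps. Each iteration loads an M0 x K0 LHS block and, for every
    // output column, K0 RHS values from the texture (the RHS blocks are stored transposed).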
5663    int i = 0;
5664    for(; i <= (K - K0); i += K0)
5665    {
5666
5667        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
5668
5669
5670        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
5671        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
5672
5673
5674        ARM_DOT_K0XN0(K0, a0, b, c0);
5675#if M0 > 1
5676        ARM_DOT_K0XN0(K0, a1, b, c1);
5677#endif
5678#if M0 > 2
5679        ARM_DOT_K0XN0(K0, a2, b, c2);
5680#endif
5681#if M0 > 3
5682        ARM_DOT_K0XN0(K0, a3, b, c3);
5683#endif
5684#if M0 > 4
5685        ARM_DOT_K0XN0(K0, a4, b, c4);
5686#endif
5687#if M0 > 5
5688        ARM_DOT_K0XN0(K0, a5, b, c5);
5689#endif
5690#if M0 > 6
5691        ARM_DOT_K0XN0(K0, a6, b, c6);
5692#endif
5693#if M0 > 7
5694        ARM_DOT_K0XN0(K0, a7, b, c7);
5695#endif
5696
5697        lhs_offset += K0 * sizeof(DATA_TYPE);
5698        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
5699    }
5700
5701    if(LEFTOVER_K != 0)
5702    {
5703
5704
5705
5706
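        // Leftover K columns: load the remaining LEFTOVER_K LHS values into zero-initialized K0-wide
        // vectors through a union, so the full-width ARM_DOT_K0XN0 macro can be reused below.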
5707        union UNION_VEC_TYPE
5708        {
5709            DATA_TYPE s[K0];
5710            VEC_DATA_TYPE(DATA_TYPE, K0)
5711            v;
5712        };
5713
5714        union UNION_VEC_TYPE a0 = {.v = 0 };
5715#if M0 > 1
5716        union UNION_VEC_TYPE a1 = {.v = 0 };
5717#endif
5718#if M0 > 2
5719        union UNION_VEC_TYPE a2 = {.v = 0 };
5720#endif
5721#if M0 > 3
5722        union UNION_VEC_TYPE a3 = {.v = 0 };
5723#endif
5724#if M0 > 4
5725        union UNION_VEC_TYPE a4 = {.v = 0 };
5726#endif
5727#if M0 > 5
5728        union UNION_VEC_TYPE a5 = {.v = 0 };
5729#endif
5730#if M0 > 6
5731        union UNION_VEC_TYPE a6 = {.v = 0 };
5732#endif
5733#if M0 > 7
5734        union UNION_VEC_TYPE a7 = {.v = 0 };
5735#endif
5736
5737        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
5738
5739
5740        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
5741
5742
5743        for(int k = 0; k < LEFTOVER_K; ++k)
5744        {
5745            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
5746#if M0 > 1
5747            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
5748#endif
5749#if M0 > 2
5750            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
5751#endif
5752#if M0 > 3
5753            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
5754#endif
5755#if M0 > 4
5756            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
5757#endif
5758#if M0 > 5
5759            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
5760#endif
5761#if M0 > 6
5762            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
5763#endif
5764#if M0 > 7
5765            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
5766#endif
5767
5768            lhs_offset += sizeof(DATA_TYPE);
5769        }
5770
5771
5772        ARM_DOT_K0XN0(K0, a0.v, b, c0);
5773#if M0 > 1
5774        ARM_DOT_K0XN0(K0, a1.v, b, c1);
5775#endif
5776#if M0 > 2
5777        ARM_DOT_K0XN0(K0, a2.v, b, c2);
5778#endif
5779#if M0 > 3
5780        ARM_DOT_K0XN0(K0, a3.v, b, c3);
5781#endif
5782#if M0 > 4
5783        ARM_DOT_K0XN0(K0, a4.v, b, c4);
5784#endif
5785#if M0 > 5
5786        ARM_DOT_K0XN0(K0, a5.v, b, c5);
5787#endif
5788#if M0 > 6
5789        ARM_DOT_K0XN0(K0, a6.v, b, c6);
5790#endif
5791#if M0 > 7
5792        ARM_DOT_K0XN0(K0, a7.v, b, c7);
5793#endif
5794    }
5795
5796    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
5797
5798    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
5799
5800#if defined(REINTERPRET_OUTPUT_AS_3D)
5801
5802
5803    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
5804
5805
5806
5807    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
5808
5809#else
5810
5811
5812    dst_addr += z * dst_stride_z;
5813
5814#endif
5815
5816
5817#if defined(ALPHA)
5818    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
5819#endif
5820
5821
5822#if defined(BETA)
5823#if defined(BROADCAST_BIAS)
5824    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
5825
5826    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
5827
5828#ifndef UNIT_BETA
5829    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
5830#endif
5831
5832
5833    ADD_BLOCK_BROADCAST(M0, c, bias0);
5834
5835#else
5836    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
5837
5838    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5839
5840#ifndef UNIT_BETA
5841    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
5842#endif
5843
5844
5845    ADD_BLOCK(M0, c, bias);
5846
5847#endif
5848#endif
5849
5850#if defined(ACTIVATION_TYPE)
5851    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
5852#endif
5853
5854
5855    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
5856
5857#undef RHS_BLOCK_SIZE
5858#undef RHS_OFFSET_X
5859#undef RHS_STEP_X
5860#undef RHS_STEP_LOOP
5861#undef PIXEL_UNIT
5862}
5863#endif
5864
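// VFMA(a, b, c): c = fma(a, b, c) on N0-wide vectors. VFMA_M0xN0(i, a, b, c) broadcasts element i of each
// of the M0 LHS row vectors a0..a<M0-1> and accumulates it against the N0-wide RHS vector b into c0..c<M0-1>.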
5865#define VFMA(a, b, c)     \
5866    ({                    \
5867        c = fma(a, b, c); \
5868    })
5869
5870#if M0 == 1
5871#define VFMA_M0xN0(i, a, b, c)                                        \
5872    ({                                                                \
5873        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5874    })
5875#elif M0 == 2
5876#define VFMA_M0xN0(i, a, b, c)                                        \
5877    ({                                                                \
5878        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5879        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5880    })
5881#elif M0 == 3
5882#define VFMA_M0xN0(i, a, b, c)                                        \
5883    ({                                                                \
5884        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5885        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5886        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5887    })
5888#elif M0 == 4
5889#define VFMA_M0xN0(i, a, b, c)                                        \
5890    ({                                                                \
5891        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5892        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5893        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5894        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5895    })
5896#elif M0 == 5
5897#define VFMA_M0xN0(i, a, b, c)                                        \
5898    ({                                                                \
5899        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5900        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5901        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5902        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5903        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5904    })
5905#elif M0 == 6
5906#define VFMA_M0xN0(i, a, b, c)                                        \
5907    ({                                                                \
5908        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5909        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5910        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5911        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5912        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5913        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5914    })
5915#elif M0 == 7
5916#define VFMA_M0xN0(i, a, b, c)                                        \
5917    ({                                                                \
5918        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5919        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5920        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5921        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5922        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5923        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5924        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
5925    })
5926#elif M0 == 8
5927#define VFMA_M0xN0(i, a, b, c)                                        \
5928    ({                                                                \
5929        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
5930        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
5931        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
5932        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
5933        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
5934        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
5935        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
5936        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
5937    })
5938#else
5939#error "M0 not supported"
5940#endif
5941
5942#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
5943
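// Computes dst = ALPHA * (LHS * RHS) + BETA * bias, optionally followed by an activation. Only the RHS
// is reshaped (blocked, not transposed) and it is read from a buffer.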
5944__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
5945                                           IMAGE_DECLARATION(rhs),
5946#if defined(BETA)
5947                                           IMAGE_DECLARATION(bias),
5948#endif
5949                                           IMAGE_DECLARATION(dst),
5950                                           uint lhs_stride_z,
5951                                           uint rhs_stride_z,
5952#if defined(BETA)
5953                                           uint bias_stride_z,
5954#endif
5955                                           uint dst_stride_z
5956#if defined(REINTERPRET_INPUT_AS_3D)
5957                                           ,
5958                                           uint lhs_cross_plane_pad
5959#endif
5960#if defined(REINTERPRET_OUTPUT_AS_3D)
5961                                           ,
5962                                           uint dst_cross_plane_pad
5963#endif
5964                                           ,
5965                                           const int M,
5966                                           const int N,
5967                                           const int K)
5968{
5969
5970#define RHS_BLOCK_SIZE ((K0) * (N0))
5971
5972
5973#if defined(RHS_INTERLEAVE)
5974#define RHS_OFFSET_X (N0)
5975#define RHS_STEP_X ((N0) * (H0))
5976#define RHS_STEP_LOOP (1)
5977#else
5978#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
5979#define RHS_STEP_X (N0)
5980#define RHS_STEP_LOOP (H0)
5981#endif
5982
5983    uint x = get_global_id(0);
5984    uint y = get_global_id(1);
5985    uint z = get_global_id(2);
5986
5987    const bool cond_y = y == 0;
5988    const bool cond_x = ((x + 1) * N0 >= N);
5989
5990#if defined(DUMMY_WORK_ITEMS)
5991    if((x * N0 >= N) || (y * M0 >= M))
5992    {
5993        return;
5994    }
5995#endif
5996
5997
5998    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
5999
6000
6001    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
6002
6003#if defined(MATRIX_B_DEPTH)
6004
6005    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
6006#else
6007    rhs_offset += z * rhs_stride_z;
6008#endif
6009
6010    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
6011    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6012
6013#if defined(REINTERPRET_INPUT_AS_3D)
6014
6015
6016    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
6017
6018
6019
6020    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
6021
6022#else
6023
6024
6025    lhs_offset += z * lhs_stride_z;
6026
6027#endif
6028
6029
6030    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
6031
6032    int i = 0;
6033    for(; i <= (K - K0); i += K0)
6034    {
6035
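        // One K0-wide step: load an M0 x K0 LHS block, then stream K0 rows of N0 RHS values through b0
        // and accumulate them with VFMA_M0xN0.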
6045        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
6046
6047        VEC_DATA_TYPE(DATA_TYPE, N0)
6048        b0;
6049
6050        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
6051        VFMA_M0xN0(0, a, b0, c);
6052        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
6053        VFMA_M0xN0(1, a, b0, c);
6054#if K0 > 2
6055        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
6056        VFMA_M0xN0(2, a, b0, c);
6057#endif
6058#if K0 > 3
6059        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
6060        VFMA_M0xN0(3, a, b0, c);
6061#endif
6062#if K0 > 4
6063        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
6064        VFMA_M0xN0(4, a, b0, c);
6065        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
6066        VFMA_M0xN0(5, a, b0, c);
6067        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
6068        VFMA_M0xN0(6, a, b0, c);
6069        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
6070        VFMA_M0xN0(7, a, b0, c);
6071#endif
6072#if K0 > 8
6073        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
6074        VFMA_M0xN0(8, a, b0, c);
6075        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
6076        VFMA_M0xN0(9, a, b0, c);
6077        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
6078        VFMA_M0xN0(A, a, b0, c);
6079        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
6080        VFMA_M0xN0(B, a, b0, c);
6081        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
6082        VFMA_M0xN0(C, a, b0, c);
6083        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
6084        VFMA_M0xN0(D, a, b0, c);
6085        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
6086        VFMA_M0xN0(E, a, b0, c);
6087        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
6088        VFMA_M0xN0(F, a, b0, c);
6089#endif
6090
6091        lhs_offset += K0 * sizeof(DATA_TYPE);
6092        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
6093    }
6094
6095
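    // Leftover: process the remaining K % K0 columns one at a time (the scalar LHS load is implicitly
    // widened into the small helper vector).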
6096    for(; i < K; ++i)
6097    {
6098
6099        VEC_DATA_TYPE(DATA_TYPE, 2)
6100        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
6101#if M0 > 1
6102        VEC_DATA_TYPE(DATA_TYPE, 2)
6103        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
6104#endif
6105#if M0 > 2
6106        VEC_DATA_TYPE(DATA_TYPE, 2)
6107        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
6108#endif
6109#if M0 > 3
6110        VEC_DATA_TYPE(DATA_TYPE, 2)
6111        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
6112#endif
6113#if M0 > 4
6114        VEC_DATA_TYPE(DATA_TYPE, 2)
6115        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
6116#endif
6117#if M0 > 5
6118        VEC_DATA_TYPE(DATA_TYPE, 2)
6119        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
6120#endif
6121#if M0 > 6
6122        VEC_DATA_TYPE(DATA_TYPE, 2)
6123        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
6124#endif
6125#if M0 > 7
6126        VEC_DATA_TYPE(DATA_TYPE, 2)
6127        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
6128#endif
6129
6130        VEC_DATA_TYPE(DATA_TYPE, N0)
6131        b0;
6132
6133        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
6134        VFMA_M0xN0(0, a, b0, c);
6135
6136        lhs_offset += sizeof(DATA_TYPE);
6137        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
6138    }
6139
6140    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
6141
6142    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
6143
6144#if defined(REINTERPRET_OUTPUT_AS_3D)
6145
6146    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6147
6148
6149
6150    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
6151
6152#else
6153
6154
6155    dst_addr += z * dst_stride_z;
6156
6157#endif
6158
6159
6160#if defined(ALPHA)
6161    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
6162#endif
6163
6164
6165#if defined(BETA)
6166#if defined(BROADCAST_BIAS)
6167    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
6168
6169    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
6170
6171#ifndef UNIT_BETA
6172    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
6173#endif
6174
6175
6176    ADD_BLOCK_BROADCAST(M0, c, bias0);
6177
6178#else
6179    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
6180
6181    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6182
6183#ifndef UNIT_BETA
6184    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
6185#endif
6186
6187
6188    ADD_BLOCK(M0, c, bias);
6189
6190#endif
6191#endif
6192
6193#if defined(ACTIVATION_TYPE)
6194    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
6195#endif
6196
6197
6198    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6199
6200#undef RHS_BLOCK_SIZE
6201#undef RHS_OFFSET_X
6202#undef RHS_STEP_X
6203#undef RHS_STEP_LOOP
6204}
6205#endif
6206
6207#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
6208
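// Same as gemm_mm_reshaped_only_rhs_nt, but the reshaped (non-transposed) RHS is read through the
// OpenCL image (texture) path.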
6209__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
6210                                                   __read_only image2d_t rhs_img,
6211#if defined(BETA)
6212                                                   IMAGE_DECLARATION(bias),
6213#endif
6214                                                   IMAGE_DECLARATION(dst),
6215                                                   uint lhs_stride_z,
6216                                                   uint rhs_stride_z,
6217#if defined(BETA)
6218                                                   uint bias_stride_z,
6219#endif
6220                                                   uint dst_stride_z
6221#if defined(REINTERPRET_INPUT_AS_3D)
6222                                                   ,
6223                                                   uint lhs_cross_plane_pad
6224#endif
6225#if defined(REINTERPRET_OUTPUT_AS_3D)
6226                                                   ,
6227                                                   uint dst_cross_plane_pad
6228#endif
6229                                                   ,
6230                                                   const int M,
6231                                                   const int N,
6232                                                   const int K)
6233{
6234
6235#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
6236
6237
6238#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
6239
6240
6241#if defined(RHS_INTERLEAVE)
6242#define RHS_OFFSET_X (PIXEL_UNIT)
6243#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
6244#define RHS_STEP_LOOP (1)
6245#else
6246#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
6247#define RHS_STEP_X (PIXEL_UNIT)
6248#define RHS_STEP_LOOP (H0)
6249#endif
6250
6251    uint x = get_global_id(0);
6252    uint y = get_global_id(1);
6253    uint z = get_global_id(2);
6254
6255    const bool cond_y = y == 0;
6256    const bool cond_x = ((x + 1) * N0 >= N);
6257
6258#if defined(DUMMY_WORK_ITEMS)
6259    if((x * N0 >= N) || (y * M0 >= M))
6260    {
6261        return;
6262    }
6263#endif
6264
6265
6266    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
6267
6268#if defined(MATRIX_B_DEPTH)
6269
6270    const uint z_rhs = (z % MATRIX_B_DEPTH);
6271#else
6272    const uint z_rhs = z;
6273#endif
6274
6275
6276    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
6277    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
6278
6279    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
6280    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6281
6282#if defined(REINTERPRET_INPUT_AS_3D)
6283
6284
6285    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
6286
6287
6288
6289    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
6290
6291#else
6292
6293
6294    lhs_offset += z * lhs_stride_z;
6295
6296#endif
6297
6298
6299    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
6300
6301    int i = 0;
6302    for(; i <= (K - K0); i += K0)
6303    {
6304
6305        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
6306
6307        VEC_DATA_TYPE(DATA_TYPE, N0)
6308        b0;
6309
6310        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
6311        VFMA_M0xN0(0, a, b0, c);
6312        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
6313        VFMA_M0xN0(1, a, b0, c);
6314#if K0 > 2
6315        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
6316        VFMA_M0xN0(2, a, b0, c);
6317#endif
6318#if K0 > 3
6319        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
6320        VFMA_M0xN0(3, a, b0, c);
6321#endif
6322#if K0 > 4
6323        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
6324        VFMA_M0xN0(4, a, b0, c);
6325        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
6326        VFMA_M0xN0(5, a, b0, c);
6327        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
6328        VFMA_M0xN0(6, a, b0, c);
6329        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
6330        VFMA_M0xN0(7, a, b0, c);
6331#endif
6332#if K0 > 8
6333        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
6334        VFMA_M0xN0(8, a, b0, c);
6335        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
6336        VFMA_M0xN0(9, a, b0, c);
6337        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
6338        VFMA_M0xN0(A, a, b0, c);
6339        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
6340        VFMA_M0xN0(B, a, b0, c);
6341        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
6342        VFMA_M0xN0(C, a, b0, c);
6343        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
6344        VFMA_M0xN0(D, a, b0, c);
6345        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
6346        VFMA_M0xN0(E, a, b0, c);
6347        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
6348        VFMA_M0xN0(F, a, b0, c);
6349#endif
6350
6351        lhs_offset += K0 * sizeof(DATA_TYPE);
6352        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
6353    }
6354
6355
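    // Leftover: process the remaining K % K0 columns one at a time, reading each N0-wide RHS row from the texture.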
6356    for(; i < K; ++i)
6357    {
6358
6359        VEC_DATA_TYPE(DATA_TYPE, 2)
6360        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
6361#if M0 > 1
6362        VEC_DATA_TYPE(DATA_TYPE, 2)
6363        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
6364#endif
6365#if M0 > 2
6366        VEC_DATA_TYPE(DATA_TYPE, 2)
6367        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
6368#endif
6369#if M0 > 3
6370        VEC_DATA_TYPE(DATA_TYPE, 2)
6371        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
6372#endif
6373#if M0 > 4
6374        VEC_DATA_TYPE(DATA_TYPE, 2)
6375        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
6376#endif
6377#if M0 > 5
6378        VEC_DATA_TYPE(DATA_TYPE, 2)
6379        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
6380#endif
6381#if M0 > 6
6382        VEC_DATA_TYPE(DATA_TYPE, 2)
6383        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
6384#endif
6385#if M0 > 7
6386        VEC_DATA_TYPE(DATA_TYPE, 2)
6387        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
6388#endif
6389
6390        VEC_DATA_TYPE(DATA_TYPE, N0)
6391        b0;
6392        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
6393
6394        VFMA_M0xN0(0, a, b0, c);
6395
6396        lhs_offset += sizeof(DATA_TYPE);
6397        x_rhs += RHS_STEP_X;
6398    }
6399
6400    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
6401
6402    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
6403
6404#if defined(REINTERPRET_OUTPUT_AS_3D)
6405
6406    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6407
6408
6409
6410    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
6411
6412#else
6413
6414
6415    dst_addr += z * dst_stride_z;
6416
6417#endif
6418
6419
6420#if defined(ALPHA)
6421    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
6422#endif
6423
6424
6425#if defined(BETA)
6426#if defined(BROADCAST_BIAS)
6427    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
6428
6429    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
6430
6431#ifndef UNIT_BETA
6432    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
6433#endif
6434
6435
6436    ADD_BLOCK_BROADCAST(M0, c, bias0);
6437
6438#else
6439    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
6440
6441    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6442
6443#ifndef UNIT_BETA
6444    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
6445#endif
6446
6447
6448    ADD_BLOCK(M0, c, bias);
6449
6450#endif
6451#endif
6452
6453#if defined(ACTIVATION_TYPE)
6454    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
6455#endif
6456
6457
6458    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6459
6460#undef RHS_BLOCK_SIZE
6461#undef RHS_OFFSET_X
6462#undef RHS_STEP_X
6463#undef RHS_STEP_LOOP
6464}
6465#endif
6466#endif
6467
6468#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
6469
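// ARM_DOT_K0(a, b, c): accumulate the dot product of two K0-wide vectors into c. With MIXED_PRECISION the
// terms are summed into the (wider) DATA_TYPE_ACCUMULATOR; otherwise a chain of fma operations is used.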
6470#if defined(MIXED_PRECISION)
6471#if K0 == 2
6472#define ARM_DOT_K0(a, b, c) \
6473    ({                      \
6474        c += a.s0 * b.s0;   \
6475        c += a.s1 * b.s1;   \
6476    })
6477#elif K0 == 3
6478#define ARM_DOT_K0(a, b, c) \
6479    ({                      \
6480        c += a.s0 * b.s0;   \
6481        c += a.s1 * b.s1;   \
6482        c += a.s2 * b.s2;   \
6483    })
6484#elif K0 == 4
6485#define ARM_DOT_K0(a, b, c) \
6486    ({                      \
6487        c += a.s0 * b.s0;   \
6488        c += a.s1 * b.s1;   \
6489        c += a.s2 * b.s2;   \
6490        c += a.s3 * b.s3;   \
6491    })
6492#elif K0 == 8
6493#define ARM_DOT_K0(a, b, c) \
6494    ({                      \
6495        c += a.s0 * b.s0;   \
6496        c += a.s1 * b.s1;   \
6497        c += a.s2 * b.s2;   \
6498        c += a.s3 * b.s3;   \
6499        c += a.s4 * b.s4;   \
6500        c += a.s5 * b.s5;   \
6501        c += a.s6 * b.s6;   \
6502        c += a.s7 * b.s7;   \
6503    })
6504#elif K0 == 16
6505#define ARM_DOT_K0(a, b, c) \
6506    ({                      \
6507        c += a.s0 * b.s0;   \
6508        c += a.s1 * b.s1;   \
6509        c += a.s2 * b.s2;   \
6510        c += a.s3 * b.s3;   \
6511        c += a.s4 * b.s4;   \
6512        c += a.s5 * b.s5;   \
6513        c += a.s6 * b.s6;   \
6514        c += a.s7 * b.s7;   \
6515        c += a.s8 * b.s8;   \
6516        c += a.s9 * b.s9;   \
6517        c += a.sA * b.sA;   \
6518        c += a.sB * b.sB;   \
6519        c += a.sC * b.sC;   \
6520        c += a.sD * b.sD;   \
6521        c += a.sE * b.sE;   \
6522        c += a.sF * b.sF;   \
6523    })
6524#else
6525#error "K0 value not supported"
6526#endif
6527#else
6528#if K0 == 2
6529#define ARM_DOT_K0(a, b, c)     \
6530    ({                          \
6531        c = fma(a.s0, b.s0, c); \
6532        c = fma(a.s1, b.s1, c); \
6533    })
6534#elif K0 == 3
6535#define ARM_DOT_K0(a, b, c)     \
6536    ({                          \
6537        c = fma(a.s0, b.s0, c); \
6538        c = fma(a.s1, b.s1, c); \
6539        c = fma(a.s2, b.s2, c); \
6540    })
6541#elif K0 == 4
6542#define ARM_DOT_K0(a, b, c)     \
6543    ({                          \
6544        c = fma(a.s0, b.s0, c); \
6545        c = fma(a.s1, b.s1, c); \
6546        c = fma(a.s2, b.s2, c); \
6547        c = fma(a.s3, b.s3, c); \
6548    })
6549#elif K0 == 8
6550#define ARM_DOT_K0(a, b, c)     \
6551    ({                          \
6552        c = fma(a.s0, b.s0, c); \
6553        c = fma(a.s1, b.s1, c); \
6554        c = fma(a.s2, b.s2, c); \
6555        c = fma(a.s3, b.s3, c); \
6556        c = fma(a.s4, b.s4, c); \
6557        c = fma(a.s5, b.s5, c); \
6558        c = fma(a.s6, b.s6, c); \
6559        c = fma(a.s7, b.s7, c); \
6560    })
6561#elif K0 == 16
6562#define ARM_DOT_K0(a, b, c)     \
6563    ({                          \
6564        c = fma(a.s0, b.s0, c); \
6565        c = fma(a.s1, b.s1, c); \
6566        c = fma(a.s2, b.s2, c); \
6567        c = fma(a.s3, b.s3, c); \
6568        c = fma(a.s4, b.s4, c); \
6569        c = fma(a.s5, b.s5, c); \
6570        c = fma(a.s6, b.s6, c); \
6571        c = fma(a.s7, b.s7, c); \
6572        c = fma(a.s8, b.s8, c); \
6573        c = fma(a.s9, b.s9, c); \
6574        c = fma(a.sA, b.sA, c); \
6575        c = fma(a.sB, b.sB, c); \
6576        c = fma(a.sC, b.sC, c); \
6577        c = fma(a.sD, b.sD, c); \
6578        c = fma(a.sE, b.sE, c); \
6579        c = fma(a.sF, b.sF, c); \
6580    })
6581#else
6582#error "K0 value not supported"
6583#endif
6584#endif
6585
6586#if defined(ARM_DOT_K0XN0)
6587#undef ARM_DOT_K0XN0
6588#endif
6589
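// ARM_DOT_K0XN0(a, b, c): N0 dot products between the K0-wide LHS vector a and the RHS vectors
// b0..b<N0-1>, each result accumulated into one component of c.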
6590#if N0 == 2
6591#define ARM_DOT_K0XN0(a, b, c)           \
6592    ({                                   \
6593        ARM_DOT_K0((a), (b##0), (c.s0)); \
6594        ARM_DOT_K0((a), (b##1), (c.s1)); \
6595    })
6596#elif N0 == 3
6597#define ARM_DOT_K0XN0(a, b, c)           \
6598    ({                                   \
6599        ARM_DOT_K0((a), (b##0), (c.s0)); \
6600        ARM_DOT_K0((a), (b##1), (c.s1)); \
6601        ARM_DOT_K0((a), (b##2), (c.s2)); \
6602    })
6603#elif N0 == 4
6604#define ARM_DOT_K0XN0(a, b, c)           \
6605    ({                                   \
6606        ARM_DOT_K0((a), (b##0), (c.s0)); \
6607        ARM_DOT_K0((a), (b##1), (c.s1)); \
6608        ARM_DOT_K0((a), (b##2), (c.s2)); \
6609        ARM_DOT_K0((a), (b##3), (c.s3)); \
6610    })
6611#elif N0 == 8
6612#define ARM_DOT_K0XN0(a, b, c)           \
6613    ({                                   \
6614        ARM_DOT_K0((a), (b##0), (c.s0)); \
6615        ARM_DOT_K0((a), (b##1), (c.s1)); \
6616        ARM_DOT_K0((a), (b##2), (c.s2)); \
6617        ARM_DOT_K0((a), (b##3), (c.s3)); \
6618        ARM_DOT_K0((a), (b##4), (c.s4)); \
6619        ARM_DOT_K0((a), (b##5), (c.s5)); \
6620        ARM_DOT_K0((a), (b##6), (c.s6)); \
6621        ARM_DOT_K0((a), (b##7), (c.s7)); \
6622    })
6623#elif N0 == 16
6624#define ARM_DOT_K0XN0(a, b, c)           \
6625    ({                                   \
6626        ARM_DOT_K0((a), (b##0), (c.s0)); \
6627        ARM_DOT_K0((a), (b##1), (c.s1)); \
6628        ARM_DOT_K0((a), (b##2), (c.s2)); \
6629        ARM_DOT_K0((a), (b##3), (c.s3)); \
6630        ARM_DOT_K0((a), (b##4), (c.s4)); \
6631        ARM_DOT_K0((a), (b##5), (c.s5)); \
6632        ARM_DOT_K0((a), (b##6), (c.s6)); \
6633        ARM_DOT_K0((a), (b##7), (c.s7)); \
6634        ARM_DOT_K0((a), (b##8), (c.s8)); \
6635        ARM_DOT_K0((a), (b##9), (c.s9)); \
6636        ARM_DOT_K0((a), (b##A), (c.sA)); \
6637        ARM_DOT_K0((a), (b##B), (c.sB)); \
6638        ARM_DOT_K0((a), (b##C), (c.sC)); \
6639        ARM_DOT_K0((a), (b##D), (c.sD)); \
6640        ARM_DOT_K0((a), (b##E), (c.sE)); \
6641        ARM_DOT_K0((a), (b##F), (c.sF)); \
6642    })
6643#else
6644#error "N0 value not supported"
6645#endif
6646
6647#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
6648
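// Computes dst = ALPHA * (LHS * RHS) + BETA * bias with both operands reshaped: LHS blocks are not
// transposed, RHS blocks are transposed, and both are read from buffers.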
6649__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
6650                                            IMAGE_DECLARATION(rhs),
6651#if defined(BETA)
6652                                            IMAGE_DECLARATION(bias),
6653#endif
6654                                            IMAGE_DECLARATION(dst),
6655                                            uint lhs_stride_z,
6656                                            uint rhs_stride_z,
6657#if defined(BETA)
6658                                            uint bias_stride_z,
6659#endif
6660                                            uint dst_stride_z
6661#if defined(REINTERPRET_OUTPUT_AS_3D)
6662                                            ,
6663                                            uint dst_cross_plane_pad
6664#endif
6665                                            ,
6666                                            const int M,
6667                                            const int N,
6668                                            const int K)
6669{
6670
6671#define LHS_BLOCK_SIZE ((K0) * (M0))
6672
6673#if defined(LHS_INTERLEAVE)
6674#define LHS_OFFSET_X (K0)
6675#define LHS_STEP_X ((K0) * (V0))
6676#define LHS_STEP_LOOP (1)
6677#else
6678#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
6679#define LHS_STEP_X (K0)
6680#define LHS_STEP_LOOP (V0)
6681#endif
6682
6683
6684#define RHS_BLOCK_SIZE ((K0) * (N0))
6685
6686
6687#if defined(RHS_INTERLEAVE)
6688#define RHS_OFFSET_X (K0)
6689#define RHS_STEP_X ((K0) * (H0))
6690#define RHS_STEP_LOOP (1)
6691#else
6692#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
6693#define RHS_STEP_X (K0)
6694#define RHS_STEP_LOOP (H0)
6695#endif
6696
6697#if defined(DUMMY_WORK_ITEMS)
6698    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
6699    {
6700        return;
6701    }
6702#endif
6703
6704
6705    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
6706                               (get_global_id(2) * lhs_stride_z);
6707
6708
6709    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
6710
6711#if defined(MATRIX_B_DEPTH)
6712
6713    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
6714#else
6715    rhs_addr += get_global_id(2) * rhs_stride_z;
6716#endif
6717
6718
6719    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
6720
6721    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
6722    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6723
6724    for(int i = 0; i < K; i += K0)
6725    {
6726
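        // One K0-wide step: both operands are already reshaped, so an M0 x K0 LHS block and an N0 x K0
        // RHS block are loaded at a fixed stride and reduced with ARM_DOT_K0XN0.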
6736        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
6737
6738
6739        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
6740
6741
6742        ARM_DOT_K0XN0(a0, b, c0);
6743#if M0 > 1
6744        ARM_DOT_K0XN0(a1, b, c1);
6745#endif
6746#if M0 > 2
6747        ARM_DOT_K0XN0(a2, b, c2);
6748#endif
6749#if M0 > 3
6750        ARM_DOT_K0XN0(a3, b, c3);
6751#endif
6752#if M0 > 4
6753        ARM_DOT_K0XN0(a4, b, c4);
6754#endif
6755#if M0 > 5
6756        ARM_DOT_K0XN0(a5, b, c5);
6757#endif
6758#if M0 > 6
6759        ARM_DOT_K0XN0(a6, b, c6);
6760#endif
6761#if M0 > 7
6762        ARM_DOT_K0XN0(a7, b, c7);
6763#endif
6764
6765        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
6766        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
6767    }
6768
6769    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
6770
6771    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
6772
6773    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
6774    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
6775
6776#if defined(REINTERPRET_OUTPUT_AS_3D)
6777
6778
6779    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6780
6781
6782    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
6783
6784#else
6785
6786
6787    dst_addr += get_global_id(2) * dst_stride_z;
6788
6789#endif
6790
6791
6792#if defined(ALPHA)
6793    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
6794#endif
6795
6796
6797#if defined(BETA)
6798#if defined(BROADCAST_BIAS)
6799    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
6800
6801    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
6802
6803#ifndef UNIT_BETA
6804    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
6805#endif
6806
6807
6808#if defined(MIXED_PRECISION)
6809    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
6810    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
6811#else
6812    ADD_BLOCK_BROADCAST(M0, c, bias0);
6813#endif
6814
6815#else
6816    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
6817                                    2) * bias_stride_z;
6818
6819    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6820
6821#ifndef UNIT_BETA
6822    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
6823#endif
6824
6825
6826#if defined(MIXED_PRECISION)
6827    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
6828    ADD_BLOCK(M0, c, bias_hp);
6829#else
6830    ADD_BLOCK(M0, c, bias);
6831#endif
6832
6833#endif
6834#endif
6835
6836#if defined(ACTIVATION_TYPE)
6837#if defined(MIXED_PRECISION)
6838    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
6839#else
6840    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
6841#endif
6842#endif
6843
6844
6845#if defined(MIXED_PRECISION)
6846    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
6847    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6848#else
6849    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
6850#endif
6851
6852#undef LHS_BLOCK_SIZE
6853#undef LHS_OFFSET_X
6854#undef LHS_STEP_X
6855#undef RHS_BLOCK_SIZE
6856#undef RHS_OFFSET_X
6857#undef RHS_STEP_X
6858#undef LHS_STEP_LOOP
6859#undef RHS_STEP_LOOP
6860}
6861#endif
6862
6863#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
6864
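// Same as gemm_mm_reshaped_lhs_nt_rhs_t, but the reshaped RHS is read through the OpenCL image (texture) path.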
6865__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
6866                                                    __read_only image2d_t rhs_img,
6867#if defined(BETA)
6868                                                    IMAGE_DECLARATION(bias),
6869#endif
6870                                                    IMAGE_DECLARATION(dst),
6871                                                    uint lhs_stride_z,
6872                                                    uint rhs_stride_z,
6873#if defined(BETA)
6874                                                    uint bias_stride_z,
6875#endif
6876                                                    uint dst_stride_z
6877#if defined(REINTERPRET_OUTPUT_AS_3D)
6878                                                    ,
6879                                                    uint dst_cross_plane_pad
6880#endif
6881                                                    ,
6882                                                    const int M,
6883                                                    const int N,
6884                                                    const int K)
6885{
6886
6887#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
6888
6889
6890#define LHS_BLOCK_SIZE ((K0) * (M0))
6891
6892#if defined(LHS_INTERLEAVE)
6893#define LHS_OFFSET_X (K0)
6894#define LHS_STEP_X ((K0) * (V0))
6895#define LHS_STEP_LOOP (1)
6896#else
6897#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
6898#define LHS_STEP_X (K0)
6899#define LHS_STEP_LOOP (V0)
6900#endif
6901
6902
6903#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
6904
6905
6906#if defined(RHS_INTERLEAVE)
6907#define RHS_OFFSET_X (PIXEL_UNIT)
6908#define RHS_STEP_X (PIXEL_UNIT * (H0))
6909#define RHS_STEP_LOOP (1)
6910#else
6911#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
6912#define RHS_STEP_X (PIXEL_UNIT)
6913#define RHS_STEP_LOOP (H0)
6914#endif
6915
6916#if defined(DUMMY_WORK_ITEMS)
6917    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
6918    {
6919        return;
6920    }
6921#endif
6922
6923
6924    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
6925                               (get_global_id(2) * lhs_stride_z);
6926
6927#if defined(MATRIX_B_DEPTH)
6928
6929    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
6930#else
6931    const uint z_rhs = get_global_id(2);
6932#endif
6933
6934
6935    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
6936    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
6937
6938
6939    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
6940
6941    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
6942    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
6943
6944    for(int i = 0; i < K; i += K0)
6945    {
6946
6947        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
6948
6949
6950        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
6951        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
6952
6953
6954        ARM_DOT_K0XN0(a0, b, c0);
6955#if M0 > 1
6956        ARM_DOT_K0XN0(a1, b, c1);
6957#endif
6958#if M0 > 2
6959        ARM_DOT_K0XN0(a2, b, c2);
6960#endif
6961#if M0 > 3
6962        ARM_DOT_K0XN0(a3, b, c3);
6963#endif
6964#if M0 > 4
6965        ARM_DOT_K0XN0(a4, b, c4);
6966#endif
6967#if M0 > 5
6968        ARM_DOT_K0XN0(a5, b, c5);
6969#endif
6970#if M0 > 6
6971        ARM_DOT_K0XN0(a6, b, c6);
6972#endif
6973#if M0 > 7
6974        ARM_DOT_K0XN0(a7, b, c7);
6975#endif
6976
6977        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
6978
6979        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
6980    }
6981
6982    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
6983
6984    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
6985
6986    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
6987    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
6988
6989#if defined(REINTERPRET_OUTPUT_AS_3D)
6990
6991
6992    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
6993
6994
6995    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
6996
6997#else
6998
6999
7000    dst_addr += get_global_id(2) * dst_stride_z;
7001
7002#endif
7003
7004
7005#if defined(ALPHA)
7006    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7007#endif
7008
7009
7010#if defined(BETA)
7011#if defined(BROADCAST_BIAS)
7012    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
7013
7014    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7015
7016#ifndef UNIT_BETA
7017    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7018#endif
7019
7020
7021#if defined(MIXED_PRECISION)
7022    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7023    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7024#else
7025    ADD_BLOCK_BROADCAST(M0, c, bias0);
7026#endif
7027
7028#else
7029    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
7030                                    2) * bias_stride_z;
7031
7032    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7033
7034#ifndef UNIT_BETA
7035    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7036#endif
7037
7038
7039#if defined(MIXED_PRECISION)
7040    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7041    ADD_BLOCK(M0, c, bias_hp);
7042#else
7043    ADD_BLOCK(M0, c, bias);
7044#endif
7045
7046#endif
7047#endif
7048
7049#if defined(ACTIVATION_TYPE)
7050#if defined(MIXED_PRECISION)
7051    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7052#else
7053    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7054#endif
7055#endif
7056
7057
7058#if defined(MIXED_PRECISION)
7059    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7060    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7061#else
7062    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7063#endif
7064
7065#undef LHS_BLOCK_SIZE
7066#undef LHS_OFFSET_X
7067#undef LHS_STEP_X
7068#undef RHS_BLOCK_SIZE
7069#undef RHS_OFFSET_X
7070#undef RHS_STEP_X
7071#undef PIXEL_UNIT
7072#undef LHS_STEP_LOOP
7073#undef RHS_STEP_LOOP
7074}
7075#endif
7076
7077#if defined(LHS_TRANSPOSE)
7078
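// Helpers for the kernels that consume a transposed (T) LHS block: ARM_VFMA is a vector multiply-accumulate
// (plain mul/add on Midgard, fma elsewhere); with MIXED_PRECISION the operands are first converted to DATA_TYPE_ACCUMULATOR.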
7079#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
7080
7081#if defined(MIXED_PRECISION)
7082
7083#if(GPU_ARCH == GPU_ARCH_MIDGARD)
7084#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
7085#else
7086#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
7087#endif
7088
7089#else
7090
7091#if(GPU_ARCH == GPU_ARCH_MIDGARD)
7092#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
7093#else
7094#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
7095#endif
7096
7097#endif
7098
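// ARM_VVM_T_NT_<M>xN0x1: rank-1 update of an M x N0 accumulator block from an M-wide LHS column a and an
// N0-wide RHS row b. ARM_MM_T_NT_M0xN0x<K0> chains K0 such updates (A##k against B##k).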
7099#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
7100    ({                                                 \
7101        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
7102    })
7103#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
7104    ({                                                    \
7105        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
7106        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
7107    })
7108#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
7109    ({                                                    \
7110        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
7111        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
7112    })
7113#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
7114    ({                                                    \
7115        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
7116        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
7117    })
7118#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
7119    ({                                                    \
7120        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
7121        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
7122        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
7123        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
7124        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
7125    })
7126
7127
7128
7129
7130
7131
7132
7133#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
7134
7135#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
7136    ({                                                         \
7137        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
7138    })
7139#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
7140    ({                                                         \
7141        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
7142        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
7143    })
7144#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
7145    ({                                                         \
7146        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
7147        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
7148    })
7149#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
7150    ({                                                         \
7151        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
7152        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
7153    })
7154#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
7155    ({                                                         \
7156        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
7157        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
7158        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
7159        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
7160        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
7161    })
7162#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)           \
7163    ({                                                        \
7164        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);           \
7165        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
7166        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
7167        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
7168        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
7169        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
7170        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
7171        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
7172        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
7173    })
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
7184    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
7185    (M0, N0, TYPE, A, B, C)
7186
7187#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
7188
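// Computes dst = ALPHA * (LHS * RHS) + BETA * bias with both operands reshaped: LHS blocks are transposed,
// RHS blocks are not, and both are read from buffers.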
7189__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
7190                                            IMAGE_DECLARATION(rhs),
7191#if defined(BETA)
7192                                            IMAGE_DECLARATION(bias),
7193#endif
7194                                            IMAGE_DECLARATION(dst),
7195                                            uint lhs_stride_z,
7196                                            uint rhs_stride_z,
7197#if defined(BETA)
7198                                            uint bias_stride_z,
7199#endif
7200                                            uint dst_stride_z
7201#if defined(REINTERPRET_OUTPUT_AS_3D)
7202                                            ,
7203                                            uint dst_cross_plane_pad
7204#endif
7205                                            ,
7206                                            const int M,
7207                                            const int N,
7208                                            const int K)
7209{
7210
7211#define LHS_BLOCK_SIZE ((K0) * (M0))
7212
7213#if defined(LHS_INTERLEAVE)
7214#define LHS_OFFSET_X (M0)
7215#define LHS_STEP_X ((M0) * (V0))
7216#define LHS_STEP_LOOP (1)
7217#else
7218#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
7219#define LHS_STEP_X (M0)
7220#define LHS_STEP_LOOP (V0)
7221#endif
7222
7223
7224#define RHS_BLOCK_SIZE ((K0) * (N0))
7225
7226
7227#if defined(RHS_INTERLEAVE)
7228#define RHS_OFFSET_X (N0)
7229#define RHS_STEP_X ((N0) * (H0))
7230#else
7231#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
7232#define RHS_STEP_X (N0)
7233#endif
7234
7235    const uint x = get_global_id(0);
7236    const uint y = get_global_id(1);
7237    const uint z = get_global_id(2);
7238
7239    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
7240    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
7241
7242#if defined(DUMMY_WORK_ITEMS)
7243    if((x * N0 >= N) || (y * M0 >= M))
7244    {
7245        return;
7246    }
7247#endif
7248
7249
7250    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
7251
7252
7253    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
7254
7255#if defined(MATRIX_B_DEPTH)
7256
7257    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
7258#else
7259    rhs_addr += z * rhs_stride_z;
7260#endif
7261
7262
7263    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
7264
7265    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
7266
7267    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
7268    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
7269
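    // K0-times unrolled inner loop: each step loads one column of the transposed LHS block (M0 values) and
    // one row of the RHS block (N0 values), then performs a rank-1 update of the accumulators.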
7270    for(int i = 0; i < K; i += K0)
7271    {
7272        VEC_DATA_TYPE(DATA_TYPE, M0)
7273        a0;
7274        VEC_DATA_TYPE(DATA_TYPE, N0)
7275        b0;
7276
7277        a0 = VLOAD(M0)(0, lhs);
7278        b0 = VLOAD(N0)(0, rhs);
7279
7280        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7281
7282        lhs += LHS_STEP_X;
7283        rhs += RHS_STEP_X;
7284
7285#if K0 > 1
7286        a0 = VLOAD(M0)(0, lhs);
7287        b0 = VLOAD(N0)(0, rhs);
7288
7289        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7290
7291        lhs += LHS_STEP_X;
7292        rhs += RHS_STEP_X;
7293#endif
7294
7295#if K0 > 2
7296        a0 = VLOAD(M0)(0, lhs);
7297        b0 = VLOAD(N0)(0, rhs);
7298
7299        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7300
7301        lhs += LHS_STEP_X;
7302        rhs += RHS_STEP_X;
7303#endif
7304
7305#if K0 > 3
7306        a0 = VLOAD(M0)(0, lhs);
7307        b0 = VLOAD(N0)(0, rhs);
7308
7309        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7310
7311        lhs += LHS_STEP_X;
7312        rhs += RHS_STEP_X;
7313#endif
7314
7315#if K0 > 4
7316        a0 = VLOAD(M0)(0, lhs);
7317        b0 = VLOAD(N0)(0, rhs);
7318
7319        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7320
7321        lhs += LHS_STEP_X;
7322        rhs += RHS_STEP_X;
7323
7324        a0 = VLOAD(M0)(0, lhs);
7325        b0 = VLOAD(N0)(0, rhs);
7326
7327        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7328
7329        lhs += LHS_STEP_X;
7330        rhs += RHS_STEP_X;
7331
7332        a0 = VLOAD(M0)(0, lhs);
7333        b0 = VLOAD(N0)(0, rhs);
7334
7335        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7336
7337        lhs += LHS_STEP_X;
7338        rhs += RHS_STEP_X;
7339
7340        a0 = VLOAD(M0)(0, lhs);
7341        b0 = VLOAD(N0)(0, rhs);
7342
7343        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7344
7345        lhs += LHS_STEP_X;
7346        rhs += RHS_STEP_X;
7347#endif
7348
7349#if K0 > 8
7350        a0 = VLOAD(M0)(0, lhs);
7351        b0 = VLOAD(N0)(0, rhs);
7352
7353        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7354
7355        lhs += LHS_STEP_X;
7356        rhs += RHS_STEP_X;
7357
7358        a0 = VLOAD(M0)(0, lhs);
7359        b0 = VLOAD(N0)(0, rhs);
7360
7361        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7362
7363        lhs += LHS_STEP_X;
7364        rhs += RHS_STEP_X;
7365
7366        a0 = VLOAD(M0)(0, lhs);
7367        b0 = VLOAD(N0)(0, rhs);
7368
7369        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7370
7371        lhs += LHS_STEP_X;
7372        rhs += RHS_STEP_X;
7373
7374        a0 = VLOAD(M0)(0, lhs);
7375        b0 = VLOAD(N0)(0, rhs);
7376
7377        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7378
7379        lhs += LHS_STEP_X;
7380        rhs += RHS_STEP_X;
7381
7382        a0 = VLOAD(M0)(0, lhs);
7383        b0 = VLOAD(N0)(0, rhs);
7384
7385        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7386
7387        lhs += LHS_STEP_X;
7388        rhs += RHS_STEP_X;
7389
7390        a0 = VLOAD(M0)(0, lhs);
7391        b0 = VLOAD(N0)(0, rhs);
7392
7393        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7394
7395        lhs += LHS_STEP_X;
7396        rhs += RHS_STEP_X;
7397
7398        a0 = VLOAD(M0)(0, lhs);
7399        b0 = VLOAD(N0)(0, rhs);
7400
7401        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7402
7403        lhs += LHS_STEP_X;
7404        rhs += RHS_STEP_X;
7405
7406        a0 = VLOAD(M0)(0, lhs);
7407        b0 = VLOAD(N0)(0, rhs);
7408
7409        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7410
7411        lhs += LHS_STEP_X;
7412        rhs += RHS_STEP_X;
7413#endif
7414
7415#ifndef LHS_INTERLEAVE
7416        lhs += (M0 * K0 * (V0 - 1));
7417#endif
7418
7419#ifndef RHS_INTERLEAVE
7420        rhs += (N0 * K0 * (H0 - 1));
7421#endif
7422    }
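    // The loop above walks the K dimension K0 elements per iteration; its body is unrolled
    // by hand (K0 up to 16, matching the ARM_MM_T_NT_M0xN0x* helpers), with one rank-1
    // accumulation per K element.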
7423
7424    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
7425
7426    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
7427
7428#if defined(REINTERPRET_OUTPUT_AS_3D)
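    // The 2D destination is re-interpreted as a 3D tensor of HEIGHT_GEMM3D x DEPTH_GEMM3D
    // planes; CALCULATE_Z_OFFSET folds the cross-plane padding into each row offset.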
7429
7430
7431    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
7432
7433
7434    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
7435
7436#else
7437
7438
7439    dst_addr += z * dst_stride_z;
7440
7441#endif
7442
7443
7444#if defined(ALPHA)
7445    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7446#endif
7447
7448
7449#if defined(BETA)
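    // Beta path: load the bias block, scale it by BETA (unless UNIT_BETA is defined) and add
    // it to the accumulators, either broadcasting one row (BROADCAST_BIAS) or element-wise.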
7450#if defined(BROADCAST_BIAS)
7451    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
7452
7453    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7454
7455#ifndef UNIT_BETA
7456    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7457#endif
7458
7459
7460#if defined(MIXED_PRECISION)
7461    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7462    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7463#else
7464    ADD_BLOCK_BROADCAST(M0, c, bias0);
7465#endif
7466
7467#else
7468    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
7470
7471    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7472
7473#ifndef UNIT_BETA
7474    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7475#endif
7476
7477#if defined(MIXED_PRECISION)
7478    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7479    ADD_BLOCK(M0, c, bias_hp);
7480#else
7481    ADD_BLOCK(M0, c, bias);
7482#endif
7483
7484#endif
7485#endif
7486
7487#if defined(ACTIVATION_TYPE)
7488#if defined(MIXED_PRECISION)
7489    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7490#else
7491    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7492#endif
7493#endif
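    // Final store: with MIXED_PRECISION the accumulators are converted back to DATA_TYPE
    // first; STORE_BLOCK_BOUNDARY_AWARE writes only the valid rows/columns at the edges.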
7494
7495
7496#if defined(MIXED_PRECISION)
7497    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7498    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7499#else
7500    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7501#endif
7502
7503#undef LHS_BLOCK_SIZE
7504#undef LHS_OFFSET_X
7505#undef LHS_STEP_X
#undef LHS_STEP_LOOP
7506#undef RHS_BLOCK_SIZE
7507#undef RHS_OFFSET_X
7508#undef RHS_STEP_X
7509}
7510#endif
7511
7512#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
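// Variant of the kernel above that samples the reshaped RHS through the OpenCL image
// (texture) path: READ_IMAGE2D fetches PIXEL_UNIT elements per sample, so the RHS is
// addressed in pixel units rather than in bytes.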
7513
7514__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
7515                                                    __read_only image2d_t rhs_img,
7516#if defined(BETA)
7517                                                    IMAGE_DECLARATION(bias),
7518#endif
7519                                                    IMAGE_DECLARATION(dst),
7520                                                    uint lhs_stride_z,
7521                                                    uint rhs_stride_z,
7522#if defined(BETA)
7523                                                    uint bias_stride_z,
7524#endif
7525                                                    uint dst_stride_z
7526#if defined(REINTERPRET_OUTPUT_AS_3D)
7527                                                    ,
7528                                                    uint dst_cross_plane_pad
7529#endif
7530                                                    ,
7531                                                    const int M,
7532                                                    const int N,
7533                                                    const int K)
7534{
7535
7536#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
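// PIXEL_UNIT: number of image pixels needed to hold N0 elements of DATA_TYPE
// (typically N0 / 4, assuming the usual 4-channel image layout).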
7537
7538
7539#define LHS_BLOCK_SIZE ((K0) * (M0))
7540
7541#if defined(LHS_INTERLEAVE)
7542#define LHS_OFFSET_X (M0)
7543#define LHS_STEP_X ((M0) * (V0))
7544#define LHS_STEP_LOOP (1)
7545#else
7546#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
7547#define LHS_STEP_X (M0)
7548#define LHS_STEP_LOOP (V0)
7549#endif
7550
7551
7552#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
7553
7554
7555#if defined(RHS_INTERLEAVE)
7556#define RHS_OFFSET_X (PIXEL_UNIT)
7557#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
7558#else
7559#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
7560#define RHS_STEP_X (PIXEL_UNIT)
7561#endif
7562
7563    const uint x = get_global_id(0);
7564    const uint y = get_global_id(1);
7565    const uint z = get_global_id(2);
7566
7567#if defined(DUMMY_WORK_ITEMS)
7568    if((x * N0 >= N) || (y * M0 >= M))
7569    {
7570        return;
7571    }
7572#endif
7573
7574
7575    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
7576
7577#if defined(MATRIX_B_DEPTH)
7578
7579    const uint z_rhs = (z % MATRIX_B_DEPTH);
7580#else
7581    const uint z_rhs = z;
7582#endif
7583
7584
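    // (x_rhs, y_rhs): texture coordinates of the first RHS block for this work-item;
    // RHS_HEIGHT is assumed to be the per-batch height of the RHS image.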
7585    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
7586    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
7587
7588
7589    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
7590
7591    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
7592
7593    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
7594
7595    for(int i = 0; i < K; i += K0)
7596    {
7597        VEC_DATA_TYPE(DATA_TYPE, M0)
7598        a0;
7599        VEC_DATA_TYPE(DATA_TYPE, N0)
7600        b0;
7601
7602        a0 = VLOAD(M0)(0, lhs);
7603        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
7604
7605        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7606
7607        lhs += LHS_STEP_X;
7608
7609#if K0 > 1
7610        a0 = VLOAD(M0)(0, lhs);
7611        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
7612
7613        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7614
7615        lhs += LHS_STEP_X;
7616#endif
7617
7618#if K0 > 2
7619        a0 = VLOAD(M0)(0, lhs);
7620        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
7621
7622        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7623
7624        lhs += LHS_STEP_X;
7625#endif
7626
7627#if K0 > 3
7628        a0 = VLOAD(M0)(0, lhs);
7629        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
7630
7631        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7632
7633        lhs += LHS_STEP_X;
7634#endif
7635
7636#if K0 > 4
7637        a0 = VLOAD(M0)(0, lhs);
7638        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
7639
7640        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7641
7642        lhs += LHS_STEP_X;
7643
7644        a0 = VLOAD(M0)(0, lhs);
7645        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
7646
7647        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7648
7649        lhs += LHS_STEP_X;
7650
7651        a0 = VLOAD(M0)(0, lhs);
7652        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
7653
7654        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7655
7656        lhs += LHS_STEP_X;
7657
7658        a0 = VLOAD(M0)(0, lhs);
7659        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
7660
7661        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7662
7663        lhs += LHS_STEP_X;
7664#endif
7665
7666#if K0 > 8
7667        a0 = VLOAD(M0)(0, lhs);
7668        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
7669
7670        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7671
7672        lhs += LHS_STEP_X;
7673
7674        a0 = VLOAD(M0)(0, lhs);
7675        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
7676
7677        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7678
7679        lhs += LHS_STEP_X;
7680
7681        a0 = VLOAD(M0)(0, lhs);
7682        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
7683
7684        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7685
7686        lhs += LHS_STEP_X;
7687
7688        a0 = VLOAD(M0)(0, lhs);
7689        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
7690
7691        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7692
7693        lhs += LHS_STEP_X;
7694
7695        a0 = VLOAD(M0)(0, lhs);
7696        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
7697
7698        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7699
7700        lhs += LHS_STEP_X;
7701
7702        a0 = VLOAD(M0)(0, lhs);
7703        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
7704
7705        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7706
7707        lhs += LHS_STEP_X;
7708
7709        a0 = VLOAD(M0)(0, lhs);
7710        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
7711
7712        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7713
7714        lhs += LHS_STEP_X;
7715
7716        a0 = VLOAD(M0)(0, lhs);
7717        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
7718
7719        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
7720
7721        lhs += LHS_STEP_X;
7722#endif
7723
7724#ifndef LHS_INTERLEAVE
7725        lhs += (M0 * K0 * (V0 - 1));
7726#endif
7727
7728        x_rhs += K0 * RHS_STEP_X;
7729#ifndef RHS_INTERLEAVE
7730        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
7731#endif
7732    }
7733
7734    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
7735
7736    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
7737
7738    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
7739    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
7740
7741#if defined(REINTERPRET_OUTPUT_AS_3D)
7742
7743
7744    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
7745
7746
7747    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
7748
7749#else
7750
7751
7752    dst_addr += z * dst_stride_z;
7753
7754#endif
7755
7756
7757#if defined(ALPHA)
7758    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
7759#endif
7760
7761
7762#if defined(BETA)
7763#if defined(BROADCAST_BIAS)
7764    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
7765
7766    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
7767
7768#ifndef UNIT_BETA
7769    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
7770#endif
7771
7772
7773#if defined(MIXED_PRECISION)
7774    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7775    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
7776#else
7777    ADD_BLOCK_BROADCAST(M0, c, bias0);
7778#endif
7779
7780#else
7781    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
7782
7783    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7784
7785#ifndef UNIT_BETA
7786    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
7787#endif
7788
7789#if defined(MIXED_PRECISION)
7790    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
7791    ADD_BLOCK(M0, c, bias_hp);
7792#else
7793    ADD_BLOCK(M0, c, bias);
7794#endif
7795
7796#endif
7797#endif
7798
7799#if defined(ACTIVATION_TYPE)
7800#if defined(MIXED_PRECISION)
7801    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
7802#else
7803    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
7804#endif
7805#endif
7806
7807
7808#if defined(MIXED_PRECISION)
7809    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
7810    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7811#else
7812    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
7813#endif
7814
7815#undef LHS_BLOCK_SIZE
7816#undef LHS_OFFSET_X
7817#undef LHS_STEP_X
7818#undef RHS_BLOCK_SIZE
7819#undef RHS_OFFSET_X
7820#undef RHS_STEP_X
7821#undef PIXEL_UNIT
7822#undef LHS_STEP_LOOP
7823#undef RHS_STEP_LOOP
7824}
7825#endif
7826
7827#endif
7828
7829#endif
7830
7831#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
7832
7833#define VFMA(a, b, c)     \
7834    ({                    \
7835        c = fma(a, b, c); \
7836    })
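// VFMA accumulates c = fma(a, b, c). RHS_VFMA_M0xN0(i, a, b, c) broadcasts element i of each
// LHS row vector a0..a<M0-1> across an N0-wide vector and accumulates it against the RHS row b.
// Illustrative behaviour for M0 == 2:
//   c0 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a0.s<i>), b, c0);
//   c1 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a1.s<i>), b, c1);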
7837
7838#if M0 == 1
7839#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7840    ({                                                                \
7841        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7842    })
7843#elif M0 == 2
7844#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7845    ({                                                                \
7846        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7847        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7848    })
7849#elif M0 == 3
7850#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7851    ({                                                                \
7852        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7853        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7854        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7855    })
7856#elif M0 == 4
7857#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7858    ({                                                                \
7859        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7860        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7861        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7862        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
7863    })
7864#elif M0 == 5
7865#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7866    ({                                                                \
7867        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7868        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7869        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7870        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
7871        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
7872    })
7873#elif M0 == 6
7874#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7875    ({                                                                \
7876        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7877        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7878        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7879        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
7880        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
7881        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
7882    })
7883#elif M0 == 7
7884#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7885    ({                                                                \
7886        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7887        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7888        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7889        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
7890        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
7891        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
7892        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
7893    })
7894#elif M0 == 8
7895#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
7896    ({                                                                \
7897        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
7898        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
7899        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
7900        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
7901        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
7902        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
7903        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
7904        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
7905    })
7906#else
7907#error "M0 not supported"
7908#endif
7909
7910#if defined(GEMM_MM_NATIVE)
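// "Native" GEMM: both LHS and RHS are read in their original, non-reshaped layouts.
// The main loop consumes K0 elements of K per iteration; a scalar tail loop handles
// K values that are not a multiple of K0.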
7911
7912__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
7913                             IMAGE_DECLARATION(rhs),
7914#if defined(BETA)
7915                             IMAGE_DECLARATION(bias),
7916#endif
7917                             IMAGE_DECLARATION(dst),
7918                             uint lhs_stride_z,
7919                             uint rhs_stride_z,
7920#if defined(BETA)
7921                             uint bias_stride_z,
7922#endif
7923                             uint      dst_stride_z,
7924                             const int M,
7925                             const int N,
7926                             const int K
7927#if defined(REINTERPRET_INPUT_AS_3D)
7928                             ,
7929                             uint lhs_cross_plane_pad
7930#endif
7931#if defined(REINTERPRET_OUTPUT_AS_3D)
7932                             ,
7933                             uint dst_cross_plane_pad
7934#endif
7935                            )
7936{
7937
7938#define RHS_BLOCK_SIZE ((K0) * (N0))
7939
7940
7941#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
7942
7943    uint x = get_global_id(0);
7944    uint y = get_global_id(1);
7945    uint z = get_global_id(2);
7946
7947#if defined(DUMMY_WORK_ITEMS)
7948    if((x * N0 >= N) || (y * M0 >= M))
7949    {
7950        return;
7951    }
7952#endif
7953
7954
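    // COMPUTE_M0_START_ROW shifts the row-blocks when M is not a multiple of M0 so that the
    // last block still ends inside the tensor; the partial rows are then handled by the first
    // block, which is why cond_y below is simply (y == 0).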
7955    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
7956
7957
7958    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
7959
7960#if defined(MATRIX_B_DEPTH)
7961
7962    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
7963#else
7964    rhs_offset += z * rhs_stride_z;
7965#endif
7966
7967    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
7968    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
7969
7970#if defined(REINTERPRET_INPUT_AS_3D)
7971
7972    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
7973
7974
7975
7976    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
7977
7978#else
7979
7980
7981    lhs_offset += z * lhs_stride_z;
7982
7983#endif
7984
7985
7986    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
7987
7988    int i = 0;
7989#if K0 > 1
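    // Blocked part of the K loop: each iteration loads an M0 x K0 block of the LHS and a
    // K0 x N0 block of the RHS, then issues one RHS_VFMA_M0xN0 per K element.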
7990    for(; i <= (K - K0); i += K0)
7991    {
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
8003
8004
8005        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
8006
8007        RHS_VFMA_M0xN0(0, a, b0, c);
8008        RHS_VFMA_M0xN0(1, a, b1, c);
8009#if K0 > 2
8010        RHS_VFMA_M0xN0(2, a, b2, c);
8011#endif
8012#if K0 > 3
8013        RHS_VFMA_M0xN0(3, a, b3, c);
8014#endif
8015#if K0 > 4
8016        RHS_VFMA_M0xN0(4, a, b4, c);
8017        RHS_VFMA_M0xN0(5, a, b5, c);
8018        RHS_VFMA_M0xN0(6, a, b6, c);
8019        RHS_VFMA_M0xN0(7, a, b7, c);
8020#endif
8021#if K0 > 8
8022        RHS_VFMA_M0xN0(8, a, b8, c);
8023        RHS_VFMA_M0xN0(9, a, b9, c);
8024        RHS_VFMA_M0xN0(A, a, bA, c);
8025        RHS_VFMA_M0xN0(B, a, bB, c);
8026        RHS_VFMA_M0xN0(C, a, bC, c);
8027        RHS_VFMA_M0xN0(D, a, bD, c);
8028        RHS_VFMA_M0xN0(E, a, bE, c);
8029        RHS_VFMA_M0xN0(F, a, bF, c);
8030#endif
8031
8032        lhs_offset += K0 * sizeof(DATA_TYPE);
8033        rhs_offset += K0 * rhs_stride_y;
8034    }
8035#endif
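    // Left-over loop: processes the remaining K % K0 elements one at a time, broadcasting a
    // single LHS element per row against one RHS row.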
8036
8037    for(; i < K; ++i)
8038    {
8039
8040        VEC_DATA_TYPE(DATA_TYPE, 2)
8041        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
8042#if M0 > 1
8043        VEC_DATA_TYPE(DATA_TYPE, 2)
8044        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
8045#endif
8046#if M0 > 2
8047        VEC_DATA_TYPE(DATA_TYPE, 2)
8048        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
8049#endif
8050#if M0 > 3
8051        VEC_DATA_TYPE(DATA_TYPE, 2)
8052        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
8053#endif
8054#if M0 > 4
8055        VEC_DATA_TYPE(DATA_TYPE, 2)
8056        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
8057#endif
8058#if M0 > 5
8059        VEC_DATA_TYPE(DATA_TYPE, 2)
8060        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
8061#endif
8062#if M0 > 6
8063        VEC_DATA_TYPE(DATA_TYPE, 2)
8064        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
8065#endif
8066#if M0 > 7
8067        VEC_DATA_TYPE(DATA_TYPE, 2)
8068        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
8069#endif
8070
8071        VEC_DATA_TYPE(DATA_TYPE, N0)
8072        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
8073        RHS_VFMA_M0xN0(0, a, b, c);
8074
8075        lhs_offset += sizeof(DATA_TYPE);
8076        rhs_offset += rhs_stride_y;
8077    }
8078
8079    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
8080
8081    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
8082
8083#if defined(REINTERPRET_OUTPUT_AS_3D)
8084
8085    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
8086
8087
8088
8089    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
8090
8091#else
8092
8093
8094    dst_addr += z * dst_stride_z;
8095
8096#endif
8097
8098
8099#if defined(ALPHA)
8100    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
8101#endif
8102
8103
8104#if defined(BETA)
8105#if defined(BROADCAST_BIAS)
8106    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
8107
8108    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
8109
8110#ifndef UNIT_BETA
8111    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
8112#endif
8113
8114
8115    ADD_BLOCK_BROADCAST(M0, c, bias0);
8116
8117#else
8118    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
8119
8120    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
8121
8122#ifndef UNIT_BETA
8123    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
8124#endif
8125
8126
8127    ADD_BLOCK(M0, c, bias);
8128
8129#endif
8130#endif
8131
8132#if defined(ACTIVATION_TYPE)
8133    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
8134#endif
8135
8136    const bool cond_y = y == 0;
8137    const bool cond_x = ((x + 1) * N0 >= N);
8138
8139
8140    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
8141}
8142#endif
8143#endif
8144
8145#if defined(BETA)
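// Stand-alone matrix-accumulate kernels: dst = dst + BETA * src, 4 floats (f32) or 8 halves
// (f16) per work-item, used to add the beta-scaled C term to an already computed
// alpha * (A x B) product held in dst.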
8146
8147__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
8148                          TENSOR3D_DECLARATION(dst))
8149{
8150
8151    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
8152    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
8153
8154
8155    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
8156
8157
8158    float4 c = vload4(0, (__global float *)src.ptr);
8159
8160
8161    float4 out = alpha_ab + (float4)BETA * c;
8162
8163
8164    vstore4(out, 0, (__global float *)dst.ptr);
8165}
8166
8167#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
8168
8169__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
8170                          TENSOR3D_DECLARATION(dst))
8171{
8172
8173    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
8174    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
8175
8176
8177    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
8178
8179
8180    half8 c = vload8(0, (__global half *)src.ptr);
8181
8182
8183    half8 out = alpha_ab + (half8)BETA * c;
8184
8185
8186    vstore8(out, 0, (__global half *)dst.ptr);
8187}
8188#endif
8189#endif  )"