xref: /aosp_15_r20/external/ComputeLibrary/cl_kernels/common/gemmlowp.clembed (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1*c217d954SCole FaustR"(
2*c217d954SCole Faust
3*c217d954SCole Faust
4*c217d954SCole Faust
5*c217d954SCole Faust
6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
8*c217d954SCole Faust
9*c217d954SCole Faust
10*c217d954SCole Faust
11*c217d954SCole Faust
/** Full-width row stores: STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16.
 *
 * STORE_ROW_n first expands STORE_ROW_(n-1) and then emits one more VSTORE(N0) call, so the
 * expansion is n store statements, one per row i = 0..n-1. Row i writes the variable
 * BASENAME##i at offset 0 of the address (PTR + i * STRIDE_Y + Z##i), cast to
 * __global DATA_TYPE *. The pasted suffix is a single hex digit: 0-9 for rows 0..9 and
 * A-F for rows 10..15.
 *
 * Every expansion already ends with a semicolon. PTR and STRIDE_Y are substituted without
 * parentheses, so pass simple identifiers or literals. VSTORE is not defined in this part
 * of the file.
 *
 * N0        Vector width forwarded to VSTORE.
 * DATA_TYPE Element type used for the destination pointer cast.
 * BASENAME  Prefix of the per-row source variables (BASENAME##0, BASENAME##1, ...).
 * PTR       Base address; presumably a byte pointer, since STRIDE_Y is added before the cast - TODO confirm.
 * STRIDE_Y  Distance between consecutive rows, in units of the pointee type of PTR.
 * Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...).
 */
12*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
13*c217d954SCole Faust    VSTORE(N0)                                                 \
14*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
15*c217d954SCole Faust
16*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
17*c217d954SCole Faust    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
18*c217d954SCole Faust    VSTORE(N0)                                                 \
19*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
20*c217d954SCole Faust
21*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
22*c217d954SCole Faust    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
23*c217d954SCole Faust    VSTORE(N0)                                                 \
24*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
25*c217d954SCole Faust
26*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
27*c217d954SCole Faust    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
28*c217d954SCole Faust    VSTORE(N0)                                                 \
29*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
30*c217d954SCole Faust
31*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
32*c217d954SCole Faust    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
33*c217d954SCole Faust    VSTORE(N0)                                                 \
34*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
35*c217d954SCole Faust
36*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
37*c217d954SCole Faust    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
38*c217d954SCole Faust    VSTORE(N0)                                                 \
39*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
40*c217d954SCole Faust
41*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
42*c217d954SCole Faust    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
43*c217d954SCole Faust    VSTORE(N0)                                                 \
44*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
45*c217d954SCole Faust
46*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
47*c217d954SCole Faust    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
48*c217d954SCole Faust    VSTORE(N0)                                                 \
49*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
50*c217d954SCole Faust
51*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
52*c217d954SCole Faust    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
53*c217d954SCole Faust    VSTORE(N0)                                                 \
54*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
55*c217d954SCole Faust
56*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
57*c217d954SCole Faust    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
58*c217d954SCole Faust    VSTORE(N0)                                                  \
59*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
60*c217d954SCole Faust
61*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
62*c217d954SCole Faust    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
63*c217d954SCole Faust    VSTORE(N0)                                                  \
64*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
65*c217d954SCole Faust
66*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
67*c217d954SCole Faust    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
68*c217d954SCole Faust    VSTORE(N0)                                                  \
69*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
70*c217d954SCole Faust
71*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
72*c217d954SCole Faust    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
73*c217d954SCole Faust    VSTORE(N0)                                                  \
74*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
75*c217d954SCole Faust
76*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
77*c217d954SCole Faust    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
78*c217d954SCole Faust    VSTORE(N0)                                                  \
79*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
80*c217d954SCole Faust
81*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
82*c217d954SCole Faust    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
83*c217d954SCole Faust    VSTORE(N0)                                                  \
84*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
85*c217d954SCole Faust
86*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
87*c217d954SCole Faust    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
88*c217d954SCole Faust    VSTORE(N0)                                                  \
89*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
90*c217d954SCole Faust
91*c217d954SCole Faust
92*c217d954SCole Faust
/** Converting row stores: CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), rows 0..8.
 *
 * Same chaining scheme as a plain row store: CONVERT_STORE_ROW_n expands
 * CONVERT_STORE_ROW_(n-1) and then stores row n-1. The difference is that the stored value is
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)) rather than BASENAME##i itself, so
 * DATA_TYPE selects both the conversion target and the destination pointer type.
 *
 * CONVERT_SAT, VEC_DATA_TYPE and VSTORE are not defined in this part of the file; presumably
 * CONVERT_SAT is a saturating conversion - TODO confirm against its definition.
 * Every expansion already ends with a semicolon.
 */
93*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
94*c217d954SCole Faust    VSTORE(N0)                                                         \
95*c217d954SCole Faust    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
96*c217d954SCole Faust
97*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
98*c217d954SCole Faust    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
99*c217d954SCole Faust    VSTORE(N0)                                                         \
100*c217d954SCole Faust    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
101*c217d954SCole Faust
102*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
103*c217d954SCole Faust    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
104*c217d954SCole Faust    VSTORE(N0)                                                         \
105*c217d954SCole Faust    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
106*c217d954SCole Faust
107*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
108*c217d954SCole Faust    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
109*c217d954SCole Faust    VSTORE(N0)                                                         \
110*c217d954SCole Faust    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
111*c217d954SCole Faust
112*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
113*c217d954SCole Faust    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
114*c217d954SCole Faust    VSTORE(N0)                                                         \
115*c217d954SCole Faust    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
116*c217d954SCole Faust
117*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
118*c217d954SCole Faust    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
119*c217d954SCole Faust    VSTORE(N0)                                                         \
120*c217d954SCole Faust    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
121*c217d954SCole Faust
122*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
123*c217d954SCole Faust    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
124*c217d954SCole Faust    VSTORE(N0)                                                         \
125*c217d954SCole Faust    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
126*c217d954SCole Faust
127*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
128*c217d954SCole Faust    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
129*c217d954SCole Faust    VSTORE(N0)                                                         \
130*c217d954SCole Faust    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
131*c217d954SCole Faust
132*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
133*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
134*c217d954SCole Faust    VSTORE(N0)                                                         \
135*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
136*c217d954SCole Faust
/** Convert and store rows 0..9 of a block.
 *
 * Expands CONVERT_STORE_ROW_9 for rows 0..8 and then stores row 9: the value
 * CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)) is written through VSTORE(N0) at
 * offset 0 of (PTR + 9 * STRIDE_Y + Z##9), cast to __global DATA_TYPE *.
 *
 * The second parameter must be spelled DATA_TYPE because the replacement list refers to it
 * by that name. With any other spelling the caller's type argument is dropped and the
 * expansion silently picks up whatever global DATA_TYPE macro happens to be defined.
 *
 * N0        Vector width forwarded to VSTORE and VEC_DATA_TYPE.
 * DATA_TYPE Element type of the conversion target and of the destination pointer.
 * BASENAME  Prefix of the per-row source variables.
 * PTR       Base address of the destination.
 * STRIDE_Y  Distance between consecutive rows, in units of the pointee type of PTR.
 * Z         Prefix of the per-row extra offsets.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
141*c217d954SCole Faust
/** Converting row stores for rows 10..15: CONVERT_STORE_ROW_11 .. CONVERT_STORE_ROW_16.
 *
 * Each macro expands its predecessor and then stores one more row. The row variable and
 * the per-row offset use the hex-digit suffixes A..F (BASENAME##A is row 10, BASENAME##F is
 * row 15), while the stride multiplier is the decimal row index 10..15. The stored value is
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)); the destination is
 * (PTR + i * STRIDE_Y + Z##i) cast to __global DATA_TYPE *.
 * Every expansion already ends with a semicolon.
 */
142*c217d954SCole Faust#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
143*c217d954SCole Faust    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
144*c217d954SCole Faust    VSTORE(N0)                                                          \
145*c217d954SCole Faust    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
146*c217d954SCole Faust
147*c217d954SCole Faust#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
148*c217d954SCole Faust    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
149*c217d954SCole Faust    VSTORE(N0)                                                          \
150*c217d954SCole Faust    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
151*c217d954SCole Faust
152*c217d954SCole Faust#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
153*c217d954SCole Faust    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
154*c217d954SCole Faust    VSTORE(N0)                                                          \
155*c217d954SCole Faust    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156*c217d954SCole Faust
157*c217d954SCole Faust#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
158*c217d954SCole Faust    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
159*c217d954SCole Faust    VSTORE(N0)                                                          \
160*c217d954SCole Faust    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
161*c217d954SCole Faust
162*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
163*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
164*c217d954SCole Faust    VSTORE(N0)                                                          \
165*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
166*c217d954SCole Faust
167*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
168*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
169*c217d954SCole Faust    VSTORE(N0)                                                          \
170*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
171*c217d954SCole Faust
172*c217d954SCole Faust
173*c217d954SCole Faust
174*c217d954SCole Faust
/** Store a block of M0 rows, each N0 elements wide.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR, which pastes M0 onto STORE_ROW_ to select the
 * row-store macro. The extra level lets M0 be macro-expanded before the paste, so M0 may
 * itself be a macro; after expansion it must be a literal for which a STORE_ROW_ macro
 * exists. The remaining arguments are passed through unchanged.
 */
175*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
176*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
177*c217d954SCole Faust
178*c217d954SCole Faust
179*c217d954SCole Faust
/** Convert and store a block of M0 rows, each N0 elements wide.
 *
 * CONVERT_STORE_BLOCK forwards to CONVERT_STORE_BLOCK_STR, which pastes M0 onto
 * CONVERT_STORE_ROW_ to select the converting row-store macro. The extra level lets M0 be
 * macro-expanded before the paste. The remaining arguments are passed through unchanged.
 */
180*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
181*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
182*c217d954SCole Faust
183*c217d954SCole Faust
184*c217d954SCole Faust
/** Partial-width row stores: STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16.
 *
 * Same chaining scheme as the full-width row stores: STORE_ROW_PARTIAL_n expands
 * STORE_ROW_PARTIAL_(n-1) and then stores row n-1, so the expansion is n store statements.
 * Each row is written with VSTORE_PARTIAL(N0, STORE_N0) instead of VSTORE(N0). Row i passes
 * BASENAME##i, offset 0 and the address (PTR + i * STRIDE_Y + Z##i) cast to
 * __global DATA_TYPE *. Suffixes are hex digits: 0-9 for rows 0..9, A-F for rows 10..15.
 *
 * VSTORE_PARTIAL is not defined in this part of the file; presumably it writes only the
 * first STORE_N0 of the N0 elements of the row vector - TODO confirm against its definition.
 * Every expansion already ends with a semicolon.
 *
 * N0        Width of the row vectors.
 * STORE_N0  Second argument of VSTORE_PARTIAL (see note above).
 * DATA_TYPE Element type used for the destination pointer cast.
 * BASENAME  Prefix of the per-row source variables.
 * PTR       Base address of the destination.
 * STRIDE_Y  Distance between consecutive rows, in units of the pointee type of PTR.
 * Z         Prefix of the per-row extra offsets.
 */
185*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
186*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
187*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
188*c217d954SCole Faust
189*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
190*c217d954SCole Faust    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
191*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
192*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
193*c217d954SCole Faust
194*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
195*c217d954SCole Faust    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
196*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
197*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
198*c217d954SCole Faust
199*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
200*c217d954SCole Faust    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
201*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
202*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
203*c217d954SCole Faust
204*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
205*c217d954SCole Faust    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
206*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
207*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
208*c217d954SCole Faust
209*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
210*c217d954SCole Faust    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
211*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
212*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
213*c217d954SCole Faust
214*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
215*c217d954SCole Faust    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
216*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
217*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
218*c217d954SCole Faust
219*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
220*c217d954SCole Faust    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
221*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
222*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
223*c217d954SCole Faust
224*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
225*c217d954SCole Faust    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
226*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
227*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
228*c217d954SCole Faust
229*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
230*c217d954SCole Faust    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
231*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
232*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
233*c217d954SCole Faust
234*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
235*c217d954SCole Faust    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
236*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
237*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
238*c217d954SCole Faust
239*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
240*c217d954SCole Faust    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
241*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
242*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
243*c217d954SCole Faust
244*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
245*c217d954SCole Faust    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
246*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
247*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
248*c217d954SCole Faust
249*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
250*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
251*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
252*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
253*c217d954SCole Faust
254*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
255*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
256*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
257*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
258*c217d954SCole Faust
259*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
260*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
261*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
262*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
263*c217d954SCole Faust
264*c217d954SCole Faust
265*c217d954SCole Faust
/** Store STORE_M0 rows of a block through the partial-width row stores.
 *
 * STORE_BLOCK_PARTIAL forwards to STORE_BLOCK_PARTIAL_STR, which pastes STORE_M0 onto
 * STORE_ROW_PARTIAL_ to select the row-store macro; the extra level lets STORE_M0 be
 * macro-expanded before the paste.
 *
 * Note the argument order: this macro takes (STORE_M0, STORE_N0, N0, ...) while the selected
 * STORE_ROW_PARTIAL_ macro is invoked with (N0, STORE_N0, ...).
 */
266*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
267*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
268*c217d954SCole Faust
/** Store a block that may be partial in both dimensions, chosen at run time.
 *
 * Expands to an if / else-if chain over the two conditions:
 *   - neither condition true:       M0 rows, N0 elements per row
 *   - only PARTIAL_COND_Y true:     PARTIAL_STORE_M0 rows, N0 elements per row
 *   - only PARTIAL_COND_X true:     M0 rows, PARTIAL_STORE_N0 elements per row
 *   - both true:                    PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements per row
 * All four branches go through STORE_BLOCK_PARTIAL with N0 as the vector width.
 *
 * The conditions are substituted textually and can be evaluated more than once, so they
 * should be free of side effects. The expansion is a bare if/else statement that is not
 * wrapped in do-while; take care when using it as the body of an unbraced if.
 */
269*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
270*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
271*c217d954SCole Faust    {                                                                                                                                                     \
272*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
273*c217d954SCole Faust    }                                                                                                                                                     \
274*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
275*c217d954SCole Faust    {                                                                                                                                                     \
276*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
277*c217d954SCole Faust    }                                                                                                                                                     \
278*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
279*c217d954SCole Faust    {                                                                                                                                                     \
280*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
281*c217d954SCole Faust    }                                                                                                                                                     \
282*c217d954SCole Faust    else                                                                                                                                                  \
283*c217d954SCole Faust    {                                                                                                                                                     \
284*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
285*c217d954SCole Faust    }
286*c217d954SCole Faust
/** Store a block that may be partial in the x dimension only, chosen at run time.
 *
 * When PARTIAL_COND_X is false, M0 rows are stored with N0 elements per row; otherwise M0
 * rows are stored with PARTIAL_STORE_N0 elements per row. Both branches go through
 * STORE_BLOCK_PARTIAL with N0 as the vector width.
 *
 * The expansion is a bare if/else statement that is not wrapped in do-while.
 */
287*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
288*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
289*c217d954SCole Faust    {                                                                                                             \
290*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
291*c217d954SCole Faust    }                                                                                                             \
292*c217d954SCole Faust    else                                                                                                          \
293*c217d954SCole Faust    {                                                                                                             \
294*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
295*c217d954SCole Faust    }
296*c217d954SCole Faust
/** Store a block that may be partial in the y dimension only, chosen at run time.
 *
 * When PARTIAL_COND_Y is false, M0 rows are stored; otherwise PARTIAL_STORE_M0 rows are
 * stored. Every row is written at full width (N0 elements) through STORE_BLOCK_PARTIAL.
 *
 * The expansion is a bare if/else statement that is not wrapped in do-while.
 */
297*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
298*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
299*c217d954SCole Faust    {                                                                                                             \
300*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
301*c217d954SCole Faust    }                                                                                                             \
302*c217d954SCole Faust    else                                                                                                          \
303*c217d954SCole Faust    {                                                                                                             \
304*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
305*c217d954SCole Faust    }
306*c217d954SCole Faust
307*c217d954SCole Faust
/** Boundary-aware block store, specialised at compile time.
 *
 * STORE_BLOCK_BOUNDARY_AWARE is only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0
 * are defined as build-time macros. Their values pick one of four implementations, so that
 * run-time checks are emitted only for the dimensions that can actually be partial.
 * All four variants share the same parameter list; parameters a variant does not need are
 * simply ignored.
 */
308*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
309*c217d954SCole Faust
310*c217d954SCole Faust
311*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
312*c217d954SCole Faust
/* Both are 0: plain full block store, no run-time condition. */
313*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
314*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
315*c217d954SCole Faust
316*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
317*c217d954SCole Faust
/* Only PARTIAL_STORE_M0 is positive: check PARTIAL_COND_Y only. */
318*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
319*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
320*c217d954SCole Faust
321*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
322*c217d954SCole Faust
/* Only PARTIAL_STORE_N0 is positive: check PARTIAL_COND_X only. */
323*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
324*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
325*c217d954SCole Faust
326*c217d954SCole Faust#else
327*c217d954SCole Faust
/* Every remaining combination: check both conditions. */
328*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
329*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
330*c217d954SCole Faust
331*c217d954SCole Faust#endif
332*c217d954SCole Faust
333*c217d954SCole Faust#endif
334*c217d954SCole Faust
335*c217d954SCole Faust
/** Compute the first row handled by the work-item with row-block index y.
 *
 * When PARTIAL_STORE_M0 is defined at build time the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * converted to uint; the max() keeps the result from going below zero for the first block.
 * When it is not defined the result is simply y * M0 and the third argument is ignored.
 *
 * Every argument is parenthesised in the expansion so that expression arguments such as
 * (a + b) keep their intended precedence.
 *
 * y                Row-block index.
 * M0               Number of rows per block.
 * PARTIAL_STORE_M0 Row count of the partial block; only used in the first variant.
 */
#if defined(PARTIAL_STORE_M0)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
344*c217d954SCole Faust
345*c217d954SCole Faust
346*c217d954SCole Faust
/** Store a single vector, either at full width or at a leftover width chosen at run time.
 *
 * Forwards to STORE_BLOCK_PARTIAL_IN_X with one row, a row stride of 0 and Z = 0. The value
 * stored is therefore the variable basename##0, and the per-row offset token becomes the
 * literal 00 (zero). When cond is false vec_size elements are stored, otherwise leftover
 * elements.
 *
 * basename  Prefix of the source variable; the variable actually read is basename##0.
 * data_type Element type used for the destination pointer cast.
 * ptr       Destination address.
 * vec_size  Width of the source vector.
 * leftover  Width to store when cond is true.
 * cond      Run-time condition selecting the leftover store.
 */
347*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
348*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
349*c217d954SCole Faust
350*c217d954SCole Faust
/* Optional OpenCL extensions. Each pragma is emitted only when the corresponding
 * ARM_COMPUTE_* build macro is defined AND the extension macro of the same name as the
 * extension is defined (presumably by the OpenCL compiler when it supports the extension).
 */
/* cl_khr_fp16 */
351*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
352*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
353*c217d954SCole Faust#endif
354*c217d954SCole Faust
/* cl_arm_integer_dot_product_int8 */
355*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
356*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
357*c217d954SCole Faust#endif
358*c217d954SCole Faust
/* cl_arm_integer_dot_product_accumulate_int8 */
359*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
360*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
361*c217d954SCole Faust#endif
362*c217d954SCole Faust
/* cl_arm_printf, only in debug builds */
363*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
364*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
365*c217d954SCole Faust#endif
366*c217d954SCole Faust
/* Numeric identifiers for GPU architectures (presumably the Arm Mali Midgard, Bifrost and
 * Valhall families - inferred from the names). Nothing in this part of the file uses them.
 */
367*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
368*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
369*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
370*c217d954SCole Faust
371*c217d954SCole Faust
/** Paste two tokens together. The arguments are pasted as written: because they are
 *  operands of ##, macro arguments are NOT expanded first. */
372*c217d954SCole Faust#define CONCAT(a, b) a##b
373*c217d954SCole Faust
374*c217d954SCole Faust
/** Identity macro: yields its argument after macro expansion. */
375*c217d954SCole Faust#define EXPAND(x) x
376*c217d954SCole Faust
377*c217d954SCole Faust
/** Limit x to the range [min_val, max_val], computed as min(max(x, min_val), max_val).
 *  min and max are not defined in this part of the file (presumably the OpenCL built-ins). */
378*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
379*c217d954SCole Faust
380*c217d954SCole Faust
/** REVn(x): the n components of vector x in reverse order, expressed as a swizzle.
 *  Defined for n = 1, 2, 3, 4, 8 and 16; REV1 returns x unchanged. */
381*c217d954SCole Faust#define REV1(x) ((x))
382*c217d954SCole Faust#define REV2(x) ((x).s10)
383*c217d954SCole Faust#define REV3(x) ((x).s210)
384*c217d954SCole Faust#define REV4(x) ((x).s3210)
385*c217d954SCole Faust#define REV8(x) ((x).s76543210)
386*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
387*c217d954SCole Faust
388*c217d954SCole Faust
389*c217d954SCole Faust
/** Reverse the components of vector x, which has s components.
 *
 * REVERSE forwards to REVERSE_STR, which pastes s onto REV to select the size-specific
 * macro; the extra level lets s be macro-expanded before the paste. After expansion s must
 * be one of the sizes for which a REV macro exists.
 */
390*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
391*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
392*c217d954SCole Faust
393*c217d954SCole Faust
394*c217d954SCole Faust
/* Element rotation tables.
 *
 * ROT<s>_<n>(x) yields the s-wide vector x rotated by n positions towards the
 * higher indices: input element i ends up at index (i + n) modulo s.
 * n equal to 0 or to s leaves the vector unchanged.
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n): rotate the s-wide vector x by n positions.
 *
 * s and n may be macros; ROTATE expands them before ROTATE_STR pastes them
 * into the ROT<s>_<n> name. s must be 1, 2, 3, 4, 8 or 16 and n must lie in
 * the range 0..s, otherwise the pasted name does not exist.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
445*c217d954SCole Faust
446*c217d954SCole Faust
447*c217d954SCole Faust
/* Vector literals holding the ascending element indices 0 .. s-1.
 * dt is the scalar type name; the vector type dt<s> is formed by pasting. */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): index vector of type dt<s>.
 *
 * dt and s may be macros; they are expanded by VEC_OFFS before VEC_OFFS_STR
 * pastes s onto the V_OFFS prefix. s must be 1, 2, 3, 4, 8 or 16.
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
459*c217d954SCole Faust
460*c217d954SCole Faust
/* VLOAD(size): name of the vload built-in for the given vector width.
 * size may be a macro; it is expanded before being pasted. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

/* VLOAD_PARTIAL(size, load_size): name of the macro that loads only the
 * lowest load_size elements of a size-wide vector. Invoke the result as
 * VLOAD_PARTIAL(size, load_size)(DATA, OFFSET, PTR). Both arguments may be
 * macros. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder for size/load_size pairs that load nothing: expands to an
 * empty compound statement and ignores its three arguments. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }
471*c217d954SCole Faust
472*c217d954SCole Faust
/* Dispatch table for VLOAD_PARTIAL: vload_partial_<size>_<load_size>.
 *
 * Every entry names a macro invoked as (DATA, OFFSET, PTR). Pairs with
 * load_size equal to 0 or larger than size map to NO_LOAD.
 */

/* Scalar case. DATA is a plain scalar when size is 1, so it is assigned
 * directly instead of through a .s0 component. The entry previously named
 * vload1, which takes only (OFFSET, PTR) and therefore could not be invoked
 * with the three arguments every other entry receives. */
#define vload_partial_scalar(DATA, OFFSET, PTR) \
    DATA = vload1(OFFSET, PTR);

#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload_partial_scalar
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
580*c217d954SCole Faust
581*c217d954SCole Faust
/* vload_partial_<n>(DATA, OFFSET, PTR): load n elements starting at PTR into
 * the n lowest components of the vector DATA; the remaining components are
 * left untouched.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 are a single load. The other sizes are built
 * from a 4- or 8-wide head plus a tail load at PTR + 4 or PTR + 8.
 *
 * Each expansion already ends in a semicolon and the composite sizes expand
 * to several statements, so do not use these as the unbraced body of an
 * if/else/for.
 *
 * NOTE(review): the head uses vloadN(OFFSET, PTR), which per the OpenCL
 * built-in addresses PTR + OFFSET * N, while the tail adds its element offset
 * to PTR and reuses OFFSET unchanged. The pieces therefore only line up when
 * OFFSET is 0 - confirm that every caller passes 0.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

/* 5..7 elements: 4-wide head, then the remainder from PTR + 4. */
#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

/* 9..15 elements: 8-wide head, then the remainder from PTR + 8. */
#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the whole upper half is handed to the 5/6/7 helper, which then
 * writes only its own lowest components. */
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);
639*c217d954SCole Faust
640*c217d954SCole Faust
641*c217d954SCole Faust
/* Number of 4-element pixels needed to hold a vector of 4, 8 or 16 elements. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): pixel count for vec_size.
 * vec_size may be a macro and must expand to 4, 8 or 16; any other value
 * pastes into a PIXEL_UNIT name that is not defined. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
649*c217d954SCole Faust
650*c217d954SCole Faust
/* Image helpers: read or write 1, 2 or 4 horizontally adjacent pixels.
 *
 * read_image2d_<type>x<n> returns a vector of 4 * n elements gathered from the
 * pixels at (x_coord .. x_coord + n - 1, y_coord). write_image2d_<type>x<n>
 * scatters such a vector back, four components per pixel.
 *
 * The expansions keep their historical trailing semicolon, so they can only
 * be used where a full statement (or the end of one) is acceptable.
 * x_coord and values are parenthesised wherever they are combined with an
 * operator, so expressions may be passed for them safely.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)             \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)),        \
             read_imagef(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)             \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),       \
              read_imagef(img, (int2)((x_coord) + 1, y_coord)), \
              read_imagef(img, (int2)((x_coord) + 2, y_coord)), \
              read_imagef(img, (int2)((x_coord) + 3, y_coord)));

/* Half-precision readers exist only when fp16 was requested and is available. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)              \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)),         \
            read_imageh(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)              \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),        \
             read_imageh(img, (int2)((x_coord) + 1, y_coord)),  \
             read_imageh(img, (int2)((x_coord) + 2, y_coord)),  \
             read_imageh(img, (int2)((x_coord) + 3, y_coord)));
#endif

#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)                \
    (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123),           \
     write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)                \
    (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123),           \
     write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567),     \
     write_imagef(img, (int2)((x_coord) + 2, y_coord), (values).s89AB),     \
     write_imagef(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));

/* Half-precision writers, guarded like the readers. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)                 \
    (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123),           \
     write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)                 \
    (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123),           \
     write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567),     \
     write_imageh(img, (int2)((x_coord) + 2, y_coord), (values).s89AB),     \
     write_imageh(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));
#endif

/* READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatch to
 * read_image2d_<data_type>x<n0>. data_type and n0 may be macros; n0 is the
 * pixel count (1, 2 or 4), not the element count. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatch to
 * write_image2d_<data_type>x<n0>, with the same argument rules as READ_IMAGE2D. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
678*c217d954SCole Faust
/* VSTORE(size): name of the vstore built-in for the given vector width.
 * size may be a macro; it is expanded before being pasted. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Width-1 aliases, so that type names formed by pasting a size of 1 onto a
 * scalar type resolve to that scalar type. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Scalar counterparts of vloadN / vstoreN: access the single element at
 * PTR + OFFSET. Arguments and the whole expansion are parenthesised so that
 * operator-bearing arguments (for example a conditional expression as PTR)
 * are evaluated as one operand. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
696*c217d954SCole Faust
697*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): name of the macro that stores only the
 * lowest store_size elements of a size-wide vector. Invoke the result as
 * VSTORE_PARTIAL(size, store_size)(DATA, OFFSET, PTR). Both arguments may be
 * macros. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Placeholder for size/store_size pairs that store nothing: expands to an
 * empty compound statement and ignores its three arguments. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }
704*c217d954SCole Faust
705*c217d954SCole Faust
/* Dispatch table for VSTORE_PARTIAL: vstore_partial_<size>_<store_size>.
 *
 * Every entry names a macro invoked as (DATA, OFFSET, PTR). Pairs with
 * store_size equal to 0 or larger than size map to NO_STORE. The size-1
 * entry goes straight to vstore1 because DATA is then a scalar and has no
 * .s0 component.
 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16
813*c217d954SCole Faust
814*c217d954SCole Faust
/* vstore_partial_<n>(DATA, OFFSET, PTR): store the n lowest components of the
 * vector DATA to memory starting at PTR.
 *
 * Sizes 1, 2, 3, 4, 8 and 16 are a single store. The other sizes are built
 * from a 4- or 8-wide head plus a tail store at PTR + 4 or PTR + 8.
 *
 * Each expansion already ends in a semicolon and the composite sizes expand
 * to several statements, so do not use these as the unbraced body of an
 * if/else/for.
 *
 * NOTE(review): the head uses vstoreN(DATA, OFFSET, PTR), which per the
 * OpenCL built-in addresses PTR + OFFSET * N, while the tail adds its element
 * offset to PTR and reuses OFFSET unchanged. The pieces therefore only line
 * up when OFFSET is 0 - confirm that every caller passes 0.
 *
 * Component selectors use uppercase hex digits throughout, matching the
 * other helper macros in this file.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

/* 5..7 elements: 4-wide head, then the remainder at PTR + 4. */
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

/* 9..15 elements: 8-wide head, then the remainder at PTR + 8. */
#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* For 13..15 the whole upper half is handed to the 5/6/7 helper, which then
 * stores only its own lowest components. */
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);
872*c217d954SCole Faust
873*c217d954SCole Faust
874*c217d954SCole Faust
875*c217d954SCole Faust
876*c217d954SCole Faust
/* Saturating conversions to floating-point targets.
 *
 * The _sat spellings for float and half are mapped onto the plain
 * conversions so that CONVERT_SAT can be used uniformly for every data type.
 *
 * convert_half_sat now maps to convert_half. It previously mapped to
 * convert_float, which returned a float where a half was requested and was
 * inconsistent with convert_half1_sat directly below.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Width-1 conversions resolve to the scalar conversion functions. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Width-1 saturating conversions resolve to the scalar _sat functions.
 * The convert_uchar<2..16>_sat entries expand to themselves and are therefore
 * no-ops; they are kept so the set of defined names does not change. */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat
918*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size): the vector type name type<size>.
 * type and size may be macros; they are expanded before being pasted. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type): convert_<type>(x). type may be a macro. */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type): convert_<type>_sat(x). type may be a macro. */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round): convert_<type>_sat_<round>(x), where
 * round is a rounding-mode suffix. type and round may be macros. */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
930*c217d954SCole Faust
/* Per-type lookup used by SELECT_VEC_DATA_TYPE: every integer type maps to
 * itself, half maps to short and float maps to int. */
#define select_vec_dt_uchar(width)  uchar##width
#define select_vec_dt_char(width)   char##width
#define select_vec_dt_ushort(width) ushort##width
#define select_vec_dt_short(width)  short##width
#define select_vec_dt_half(width)   short##width
#define select_vec_dt_uint(width)   uint##width
#define select_vec_dt_int(width)    int##width
#define select_vec_dt_float(width)  int##width
#define select_vec_dt_ulong(width)  ulong##width
#define select_vec_dt_long(width)   long##width

/* Dispatch on the type name; the extra level expands macro arguments before
 * they are pasted. SELECT_DATA_TYPE is the width-1 form. */
#define SELECT_VEC_DATA_TYPE_STR(elem_t, width) select_vec_dt_##elem_t(width)
#define SELECT_VEC_DATA_TYPE(elem_t, width)     SELECT_VEC_DATA_TYPE_STR(elem_t, width)
#define SELECT_DATA_TYPE(elem_t)                SELECT_VEC_DATA_TYPE_STR(elem_t, 1)
945*c217d954SCole Faust
/* Per-type lookup used by SIGNED_INT_VEC_DATA_TYPE: each entry names the
 * signed integer vector type listed for that element type. */
#define signed_int_vec_dt_uchar(width)  char##width
#define signed_int_vec_dt_char(width)   char##width
#define signed_int_vec_dt_ushort(width) short##width
#define signed_int_vec_dt_short(width)  short##width
#define signed_int_vec_dt_half(width)   short##width
#define signed_int_vec_dt_uint(width)   int##width
#define signed_int_vec_dt_int(width)    int##width
#define signed_int_vec_dt_float(width)  int##width
#define signed_int_vec_dt_ulong(width)  long##width
#define signed_int_vec_dt_long(width)   long##width

/* Dispatch on the type name; the extra level expands macro arguments before
 * they are pasted. SIGNED_INT_DATA_TYPE is the width-1 form. */
#define SIGNED_INT_VEC_DATA_TYPE_STR(elem_t, width) signed_int_vec_dt_##elem_t(width)
#define SIGNED_INT_VEC_DATA_TYPE(elem_t, width)     SIGNED_INT_VEC_DATA_TYPE_STR(elem_t, width)
#define SIGNED_INT_DATA_TYPE(elem_t)                SIGNED_INT_VEC_DATA_TYPE_STR(elem_t, 1)
960*c217d954SCole Faust
/* Sum of all components of a vector of the given width. Wider vectors are
 * split into halves through the .sXY component selectors.
 * Every expansion is wrapped in parentheses so the result behaves as one
 * operand when the macro is used inside a larger expression. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_2((x).s01) + sum_reduce_2((x).s23))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF))

/* SUM_REDUCE(x, size): size may itself be a macro; it is expanded before
 * being pasted onto sum_reduce_. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
970*c217d954SCole Faust
/* Product of all components of a vector of the given width. Wider vectors
 * are split into halves through the .sXY component selectors.
 * Every expansion is wrapped in parentheses so the result behaves as one
 * operand, e.g. when it appears as a divisor. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_2((x).s01) * prod_reduce_2((x).s23))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF))

/* PROD_REDUCE(x, size): size may itself be a macro; it is expanded before
 * being pasted onto prod_reduce_. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
980*c217d954SCole Faust
/* Largest component of a vector of the given width, built from pairwise
 * max() calls over halves selected with the .sXY component selectors. */
#define max_reduce_1(v)  (v)
#define max_reduce_2(v)  max(((v).s0), ((v).s1))
#define max_reduce_3(v)  max(max_reduce_2((v).s01), ((v).s2))
#define max_reduce_4(v)  max(max_reduce_2((v).s01), max_reduce_2((v).s23))
#define max_reduce_8(v)  max(max_reduce_4((v).s0123), max_reduce_4((v).s4567))
#define max_reduce_16(v) max(max_reduce_8((v).s01234567), max_reduce_8((v).s89ABCDEF))

/* MAX_REDUCE(v, width): width may itself be a macro; it is expanded before
 * being pasted onto max_reduce_. */
#define MAX_REDUCE_STR(v, width) max_reduce_##width(v)
#define MAX_REDUCE(v, width)     MAX_REDUCE_STR(v, width)
990*c217d954SCole Faust
/* Kernel parameter lists for buffers of one to five dimensions.
 * Each macro expands to: a __global uchar pointer <name>_ptr, one
 * (<name>_stride_<d>, <name>_step_<d>) pair of uint per dimension
 * (x, y, z, w, v in that order), and finally
 * <name>_offset_first_element_in_bytes. */

#define VECTOR_DECLARATION(name)              \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)               \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_stride_w, uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)            \
    __global uchar *name##_ptr,               \
    uint name##_stride_x, uint name##_step_x, \
    uint name##_stride_y, uint name##_step_y, \
    uint name##_stride_z, uint name##_step_z, \
    uint name##_stride_w, uint name##_step_w, \
    uint name##_stride_v, uint name##_step_v, \
    uint name##_offset_first_element_in_bytes
1040*c217d954SCole Faust
/* Build a Vector / Image / Tensor3D / Tensor4D value from the kernel
 * parameters declared by the matching *_DECLARATION(name) macro.
 * The plain variants forward every <name>_step_<d> argument; the _NO_STEP
 * variants pass 0 in place of the steps listed in each macro below. */

/* Vector handle, forwarding the x step. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

/* Vector handle with the x step replaced by 0. */
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

/* Image handle, forwarding the x and y steps. */
#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

/* Image handle with the x and y steps replaced by 0. */
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Image handle built from 3D tensor parameters, forwarding the x, y and z steps. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same, with the x and y steps replaced by 0; the z step is still forwarded. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

/* Tensor3D handle, forwarding the x, y and z steps. */
#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

/* Tensor3D handle with all three steps replaced by 0. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

/* Tensor4D handle; mod_size is passed through as the last argument. */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

/* Tensor4D handle with all four steps replaced by 0. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

/* Tensor3D handle created through tensor3D_ptr_no_update. */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)
1079*c217d954SCole Faust
1080*c217d954SCole Faust
1081*c217d954SCole Fausttypedef struct Vector
1082*c217d954SCole Faust{
1083*c217d954SCole Faust    __global uchar *ptr;
1084*c217d954SCole Faust    int             offset_first_element_in_bytes;
1085*c217d954SCole Faust    int             stride_x;
1086*c217d954SCole Faust} Vector;
1087*c217d954SCole Faust
1088*c217d954SCole Faust
1089*c217d954SCole Fausttypedef struct Image
1090*c217d954SCole Faust{
1091*c217d954SCole Faust    __global uchar *ptr;
1092*c217d954SCole Faust    int             offset_first_element_in_bytes;
1093*c217d954SCole Faust    int             stride_x;
1094*c217d954SCole Faust    int             stride_y;
1095*c217d954SCole Faust} Image;
1096*c217d954SCole Faust
1097*c217d954SCole Faust
1098*c217d954SCole Fausttypedef struct Tensor3D
1099*c217d954SCole Faust{
1100*c217d954SCole Faust    __global uchar *ptr;
1101*c217d954SCole Faust    int             offset_first_element_in_bytes;
1102*c217d954SCole Faust    int             stride_x;
1103*c217d954SCole Faust    int             stride_y;
1104*c217d954SCole Faust    int             stride_z;
1105*c217d954SCole Faust} Tensor3D;
1106*c217d954SCole Faust
1107*c217d954SCole Faust
1108*c217d954SCole Fausttypedef struct Tensor4D
1109*c217d954SCole Faust{
1110*c217d954SCole Faust    __global uchar *ptr;
1111*c217d954SCole Faust    int             offset_first_element_in_bytes;
1112*c217d954SCole Faust    int             stride_x;
1113*c217d954SCole Faust    int             stride_y;
1114*c217d954SCole Faust    int             stride_z;
1115*c217d954SCole Faust    int             stride_w;
1116*c217d954SCole Faust} Tensor4D;
1117*c217d954SCole Faust
1118*c217d954SCole Faust
1119*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
1120*c217d954SCole Faust{
1121*c217d954SCole Faust    Vector vector =
1122*c217d954SCole Faust    {
1123*c217d954SCole Faust        .ptr                           = ptr,
1124*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1125*c217d954SCole Faust        .stride_x                      = stride_x,
1126*c217d954SCole Faust    };
1127*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
1128*c217d954SCole Faust    return vector;
1129*c217d954SCole Faust}
1130*c217d954SCole Faust
1131*c217d954SCole Faust
1132*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
1133*c217d954SCole Faust{
1134*c217d954SCole Faust    Image img =
1135*c217d954SCole Faust    {
1136*c217d954SCole Faust        .ptr                           = ptr,
1137*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1138*c217d954SCole Faust        .stride_x                      = stride_x,
1139*c217d954SCole Faust        .stride_y                      = stride_y
1140*c217d954SCole Faust    };
1141*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
1142*c217d954SCole Faust    return img;
1143*c217d954SCole Faust}
1144*c217d954SCole Faust
1145*c217d954SCole Faust
1146*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1147*c217d954SCole Faust{
1148*c217d954SCole Faust    Image img =
1149*c217d954SCole Faust    {
1150*c217d954SCole Faust        .ptr                           = ptr,
1151*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1152*c217d954SCole Faust        .stride_x                      = stride_x,
1153*c217d954SCole Faust        .stride_y                      = stride_y
1154*c217d954SCole Faust    };
1155*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1156*c217d954SCole Faust    return img;
1157*c217d954SCole Faust}
1158*c217d954SCole Faust
1159*c217d954SCole Faust
1160*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1161*c217d954SCole Faust{
1162*c217d954SCole Faust    Tensor3D tensor =
1163*c217d954SCole Faust    {
1164*c217d954SCole Faust        .ptr                           = ptr,
1165*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1166*c217d954SCole Faust        .stride_x                      = stride_x,
1167*c217d954SCole Faust        .stride_y                      = stride_y,
1168*c217d954SCole Faust        .stride_z                      = stride_z
1169*c217d954SCole Faust    };
1170*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1171*c217d954SCole Faust    return tensor;
1172*c217d954SCole Faust}
1173*c217d954SCole Faust
1174*c217d954SCole Faust
1175*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1176*c217d954SCole Faust{
1177*c217d954SCole Faust    Tensor3D tensor =
1178*c217d954SCole Faust    {
1179*c217d954SCole Faust        .ptr                           = ptr,
1180*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1181*c217d954SCole Faust        .stride_x                      = stride_x,
1182*c217d954SCole Faust        .stride_y                      = stride_y,
1183*c217d954SCole Faust        .stride_z                      = stride_z
1184*c217d954SCole Faust    };
1185*c217d954SCole Faust    return tensor;
1186*c217d954SCole Faust}
1187*c217d954SCole Faust
1188*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
1189*c217d954SCole Faust                                             uint step_w,
1190*c217d954SCole Faust                                             uint mod_size)
1191*c217d954SCole Faust{
1192*c217d954SCole Faust    Tensor4D tensor =
1193*c217d954SCole Faust    {
1194*c217d954SCole Faust        .ptr                           = ptr,
1195*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
1196*c217d954SCole Faust        .stride_x                      = stride_x,
1197*c217d954SCole Faust        .stride_y                      = stride_y,
1198*c217d954SCole Faust        .stride_z                      = stride_z,
1199*c217d954SCole Faust        .stride_w                      = stride_w
1200*c217d954SCole Faust    };
1201*c217d954SCole Faust
1202*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1203*c217d954SCole Faust    return tensor;
1204*c217d954SCole Faust}
1205*c217d954SCole Faust
1206*c217d954SCole Faust
1207*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
1208*c217d954SCole Faust{
1209*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
1210*c217d954SCole Faust}
1211*c217d954SCole Faust
1212*c217d954SCole Faust
1213*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
1214*c217d954SCole Faust{
1215*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
1216*c217d954SCole Faust}
1217*c217d954SCole Faust
1218*c217d954SCole Faust
1219*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
1220*c217d954SCole Faust{
1221*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1222*c217d954SCole Faust}
1223*c217d954SCole Faust
1224*c217d954SCole Faust
1225*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
1226*c217d954SCole Faust{
1227*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1228*c217d954SCole Faust}
1229*c217d954SCole Faust
1230*c217d954SCole Faust
1231*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1232*c217d954SCole Faust{
1233*c217d954SCole Faust    uint num_elements = width * height;
1234*c217d954SCole Faust
1235*c217d954SCole Faust    const uint z = index / num_elements;
1236*c217d954SCole Faust
1237*c217d954SCole Faust    index %= num_elements;
1238*c217d954SCole Faust
1239*c217d954SCole Faust    const uint y = index / width;
1240*c217d954SCole Faust
1241*c217d954SCole Faust    index %= width;
1242*c217d954SCole Faust
1243*c217d954SCole Faust    const uint x = index;
1244*c217d954SCole Faust
1245*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1246*c217d954SCole Faust}
1247*c217d954SCole Faust
1248*c217d954SCole Faust#endif
1249*c217d954SCole Faust
/* MLA(a, b, c) evaluates a + b * c. When GPU_ARCH equals GPU_ARCH_BIFROST the
 * fma() built-in is used; otherwise a plain multiply followed by an add. */
#if GPU_ARCH != GPU_ARCH_BIFROST
#define MLA(a, b, c) ((b) * (c) + (a))
#else
#define MLA(a, b, c) (fma(c, b, a))
#endif
1255*c217d954SCole Faust
1256*c217d954SCole Faust
/* Activation functions. Every <name>_op macro takes the same five arguments
 * (DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) so that ACTIVATION can select one by
 * name; arguments an operation does not need are ignored.
 * x, A_VAL and B_VAL are parenthesized wherever they sit next to an operator
 * or a cast, so expressions such as (a - b) can be passed safely. */

/* x * min(max(x + 3, 0), 6) * 0.166666667 */
#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * ((min(max(((x) + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))

/* 1 / (1 + exp(-x)) */
#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-(x))))

/* A_VAL * tanh(B_VAL * x) */
#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)(A_VAL) * tanh((DATA_TYPE)(B_VAL) * (x)))

/* max(0, x) */
#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, (x)))

/* min(A_VAL, max(0, x)) */
#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)0.0, (x))))

/* min(max(x, B_VAL), A_VAL) */
#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/* min(x, 0) * A_VAL + max(x, 0) */
#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min((x), (DATA_TYPE)0.0) * (DATA_TYPE)(A_VAL)) + max((x), (DATA_TYPE)0.0))

/* log(1 + exp(x)) */
#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp((x))))

/* x where x >= 0, otherwise A_VAL * (exp(x) - 1) */
#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)(A_VAL) * (exp((x)) - (DATA_TYPE)1.0)), (x), (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal((x), (DATA_TYPE)0.0)))

/* fabs(x) */
#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs((x)))

/* x * x */
#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (x))

/* sqrt(x) */
#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt((x)))

/* B_VAL + A_VAL * x, computed through MLA */
#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)(B_VAL), (DATA_TYPE)(A_VAL), (x)))

/* x * 0.5 * (1 + erf(x / 1.41421356237)) */
#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((x) * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf((x) / (DATA_TYPE)1.41421356237)))

/* x unchanged */
#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)

/* Pastes the operation name onto _op. ACTIVATION adds one expansion level so
 * that op may itself be a macro. */
#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)

#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
1304*c217d954SCole Faust
1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
1307*c217d954SCole Faust
1308*c217d954SCole Faust
1309*c217d954SCole Faust
1310*c217d954SCole Faust
/* STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) stores n rows.
 * Row i is the variable BASENAME<i>, written with VSTORE(N0) at offset 0 to
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>). Each macro first expands
 * its predecessor and then stores its own last row. Rows 10 to 15 use the
 * suffixes A to F. Every store already ends with a semicolon. */

#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1389*c217d954SCole Faust
1390*c217d954SCole Faust
1391*c217d954SCole Faust
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) stores n rows
 * like STORE_ROW_n, but each row BASENAME<i> is first passed through
 * CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0). The destination of row i is
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z<i>). Each macro first expands
 * its predecessor and then stores its own last row. */

#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
1430*c217d954SCole Faust
1431*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
1432*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
1433*c217d954SCole Faust    VSTORE(N0)                                                         \
1434*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
1435*c217d954SCole Faust
/* CONVERT_STORE_ROW_10: expand CONVERT_STORE_ROW_9, then saturate-convert row
 * BASENAME##9 to VEC_DATA_TYPE(DATA_TYPE, N0) and write it with VSTORE(N0) at
 * offset 0 of (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9).
 *
 * The second parameter must be spelled DATA_TYPE: the body refers to it by that
 * name, so any other spelling drops the caller's type argument and silently
 * picks up whatever DATA_TYPE happens to mean at the point of use.
 *
 *   N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 *   DATA_TYPE - element type of the destination
 *   BASENAME  - prefix of the row variables
 *   PTR       - base address of the destination
 *   STRIDE_Y  - amount added to PTR per row index
 *   Z         - prefix of the per-row extra offsets
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
1440*c217d954SCole Faust
/* CONVERT_STORE_ROW_n (n = 11..16): same scheme as the lower rows. Each macro
 * expands its predecessor and then saturate-converts one more row to
 * VEC_DATA_TYPE(DATA_TYPE, N0) and writes it with VSTORE(N0).
 *
 * Rows 10..15 use the hexadecimal suffixes A..F, i.e. row 10 is BASENAME##A
 * with extra offset Z##A, stored at PTR + 10 * STRIDE_Y + Z##A, and so on.
 *
 *   N0        - vector width forwarded to VSTORE and VEC_DATA_TYPE
 *   DATA_TYPE - element type of the destination
 *   BASENAME  - prefix of the row variables
 *   PTR       - base address of the destination
 *   STRIDE_Y  - amount added to PTR per row index
 *   Z         - prefix of the per-row extra offsets
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1470*c217d954SCole Faust
1471*c217d954SCole Faust
1472*c217d954SCole Faust
1473*c217d954SCole Faust
/* STORE_BLOCK: store an M0-row block by dispatching to STORE_ROW_<M0>.
 *
 * The extra _STR level exists so that M0 is macro-expanded before it is pasted
 * onto STORE_ROW_; calling STORE_BLOCK_STR directly pastes M0 verbatim.
 * All other arguments are forwarded unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1476*c217d954SCole Faust
1477*c217d954SCole Faust
1478*c217d954SCole Faust
/* CONVERT_STORE_BLOCK: store an M0-row block with saturating conversion by
 * dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * The extra _STR level exists so that M0 is macro-expanded before it is pasted
 * onto CONVERT_STORE_ROW_. All other arguments are forwarded unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1481*c217d954SCole Faust
1482*c217d954SCole Faust
1483*c217d954SCole Faust
/* STORE_ROW_PARTIAL_n (n = 1..16): store the first n rows of a block through
 * VSTORE_PARTIAL(N0, STORE_N0).
 *
 * Row k (0 <= k < n) is the variable BASENAME##k, written at offset 0 of
 * (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k). Rows 10..15 use the
 * hexadecimal suffixes A..F. Every macro expands its predecessor first, so
 * rows are emitted in ascending order and each expansion ends with a semicolon.
 *
 *   N0        - first argument of VSTORE_PARTIAL (vector width of the rows)
 *   STORE_N0  - second argument of VSTORE_PARTIAL (how much of each row to store)
 *   DATA_TYPE - element type of the destination
 *   BASENAME  - prefix of the row variables
 *   PTR       - base address of the destination
 *   STRIDE_Y  - amount added to PTR per row index
 *   Z         - prefix of the per-row extra offsets
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
1562*c217d954SCole Faust
1563*c217d954SCole Faust
1564*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL: store STORE_M0 rows of a block by dispatching to
 * STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * Note the argument reordering: this macro takes (STORE_M0, STORE_N0, N0, ...)
 * while the row macros take (N0, STORE_N0, ...). The extra _STR level lets
 * STORE_M0 be macro-expanded before it is pasted onto STORE_ROW_PARTIAL_.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
1567*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X_AND_Y: store a block that may be partial in both
 * dimensions, choosing the row count and the per-row store size at run time.
 *
 *   PARTIAL_COND_Y  PARTIAL_COND_X   rows stored        store size per row
 *   false           false            M0                 N0
 *   true            false            PARTIAL_STORE_M0   N0
 *   false           true             M0                 PARTIAL_STORE_N0
 *   true            true             PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * N0 is always passed as the vector width. The expansion is an if/else chain
 * of braced statements and is not itself terminated by a semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
    }                                                                                                                                                     \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
    }
1585*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X: store M0 rows, choosing the per-row store size at
 * run time.
 *
 * When PARTIAL_COND_X is false every row is stored with size N0; otherwise
 * every row is stored with size PARTIAL_STORE_N0. N0 is always passed as the
 * vector width. The expansion is an if/else of braced statements and is not
 * itself terminated by a semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
1595*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_Y: store full-width rows, choosing the row count at
 * run time.
 *
 * When PARTIAL_COND_Y is false M0 rows are stored; otherwise PARTIAL_STORE_M0
 * rows are stored. Both the store size and the vector width are N0. The
 * expansion is an if/else of braced statements and is not itself terminated by
 * a semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y))                                                                                         \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
    }                                                                                                             \
    else                                                                                                          \
    {                                                                                                             \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
    }
1605*c217d954SCole Faust
1606*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE: pick the cheapest store variant at preprocessing
 * time from the values of the PARTIAL_STORE_M0 / PARTIAL_STORE_N0 macros.
 *
 * The macro is only defined when both of those names are defined. All four
 * variants share one parameter list so call sites do not change; parameters a
 * variant does not need are simply not forwarded.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

/* Neither dimension has a partial size: unconditional full-block store. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

/* Only the row count can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

/* Only the per-row store size can be partial. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

/* Every remaining combination: both dimensions are checked at run time. */
#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
1633*c217d954SCole Faust
1634*c217d954SCole Faust
/* COMPUTE_M0_START_ROW: first row handled by the work-item with index y when
 * rows are processed in blocks of M0.
 *
 * When the PARTIAL_STORE_M0 macro is defined, the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * converted to uint, i.e. the start row is moved back by the shortfall of the
 * partial block and clamped at zero. Otherwise it is simply y * M0.
 *
 * Every argument use is parenthesised so that expression arguments such as
 * `a + b` keep their meaning after substitution.
 *
 *   y                - block index in the row dimension
 *   M0               - rows per block
 *   PARTIAL_STORE_M0 - rows in the partial block (unused in the fallback form)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
1643*c217d954SCole Faust
1644*c217d954SCole Faust
1645*c217d954SCole Faust
/* STORE_VECTOR_SELECT: store a single vector, fully or partially.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with M0 = 1, STRIDE_Y = 0 and
 * Z = 0: when cond is false vec_size is used as the store size, otherwise
 * leftover is. Because M0 is 1 and Z is the token 0, the row variable is
 * basename##0 and the pasted offset is the literal 00.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
1648*c217d954SCole Faust
1649*c217d954SCole Faust
/* Optional OpenCL extensions. Each one is enabled only when the matching
 * ARM_COMPUTE_* switch is defined AND the extension's own feature macro is
 * defined; otherwise the pragma is skipped.
 */

/* cl_khr_fp16 */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

/* cl_arm_integer_dot_product_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

/* cl_arm_printf, only requested in debug builds */
#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
1665*c217d954SCole Faust
/* Numeric identifiers for GPU architecture families. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300
1669*c217d954SCole Faust
1670*c217d954SCole Faust
/* CONCAT: paste a and b into one token. The arguments are pasted verbatim;
 * they are not macro-expanded first.
 */
#define CONCAT(a, b) a##b

/* EXPAND: yield x unchanged after one round of macro expansion. */
#define EXPAND(x) x

/* CLAMP: limit x to the range [min_val, max_val] using min() and max(). */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
1678*c217d954SCole Faust
1679*c217d954SCole Faust
/* REVn: reverse the component order of an n-component vector with a swizzle
 * (for example REV4 selects components 3, 2, 1, 0). REV1 yields x unchanged.
 */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/* REVERSE(x, s): reverse vector x of size s by dispatching to REV<s>.
 * The _STR level lets s be macro-expanded before it is pasted onto REV.
 * Supported sizes are the ones with a REVn definition: 1, 2, 3, 4, 8, 16.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
1691*c217d954SCole Faust
1692*c217d954SCole Faust
1693*c217d954SCole Faust
/* ROT<s>_<n>: rotate an s-component vector by n positions with a swizzle.
 *
 * Component i of the result is component (i - n) mod s of x, e.g. ROT4_1
 * selects components 3, 0, 1, 2. Rotating by 0 or by s yields x unchanged.
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n): rotate vector x of size s by n positions by dispatching to
 * ROT<s>_<n>. The _STR level lets s and n be macro-expanded before pasting.
 * Supported sizes are 1, 2, 3, 4, 8, 16 with 0 <= n <= s.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
1744*c217d954SCole Faust
1745*c217d954SCole Faust
1746*c217d954SCole Faust
/* V_OFFSn(dt): value of type dt##n built from the constants 0, 1, ..., n-1
 * (for example V_OFFS4(int) is (int4)(0, 1, 2, 3)).
 */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s): dispatch to V_OFFS<s>(dt). The _STR level lets s be
 * macro-expanded before it is pasted. Supported sizes: 1, 2, 3, 4, 8, 16.
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
1758*c217d954SCole Faust
1759*c217d954SCole Faust
/* VLOAD(size): name of the vector load for the given size, i.e. vload<size>.
 * The _STR level lets size be macro-expanded before it is pasted.
 */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

/* VLOAD_PARTIAL(size, load_size): name of the partial-load entry
 * vload_partial_<size>_<load_size>. The _STR level lets both arguments be
 * macro-expanded before they are pasted.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
1766*c217d954SCole Faust
/* NO_LOAD: placeholder with the (data, offs, ptr) argument shape that expands
 * to an empty block; all three arguments are discarded.
 */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }
1770*c217d954SCole Faust
1771*c217d954SCole Faust
/* Dispatch tables vload_partial_<size>_<load_size> for vector sizes 1, 2, 3, 4
 * and 8.
 *
 * For 1 <= load_size <= size the entry maps to a load helper; a load_size of 0
 * or one larger than size maps to NO_LOAD, which expands to nothing useful.
 *
 * NOTE(review): vload_partial_1_1 is the only entry that maps to vload1 rather
 * than to a vload_partial_<n> helper. Neither is defined in this part of the
 * file, so confirm that both accept the same argument list.
 */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD
1861*c217d954SCole Faust
1862*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
1863*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
1864*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
1865*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
1866*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
1867*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
1868*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
1869*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
1870*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
1871*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
1872*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
1873*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
1874*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
1875*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
1876*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
1877*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
1878*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
1879*c217d954SCole Faust
1880*c217d954SCole Faust
// vload_partial_<M>(DATA, OFFSET, PTR): assign the M lowest components of DATA
// from memory at PTR; components above M are not written (M = 16 assigns all
// of DATA).
// M = 1, 2, 3, 4, 8 and 16 expand to a single vload<M>. The other sizes are
// composed of a 4- or 8-wide piece followed by a remainder piece that reads
// from PTR + 4 or PTR + 8. For M = 13..15 the remainder is written through the
// DATA.s89ABCDEF swizzle by the 5/6/7-element variant.
// NOTE(review): the remainder piece shifts PTR by a fixed element count but
// forwards OFFSET unchanged. The OpenCL vloadN built-ins scale their offset by
// N, so the pieces appear contiguous only when OFFSET is 0 - confirm callers.
// NOTE(review): each expansion is one or more ';'-terminated statements, so
// these macros are not safe as the body of an unbraced if/else.
1881*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
1882*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
1883*c217d954SCole Faust
1884*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
1885*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
1886*c217d954SCole Faust
1887*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
1888*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
1889*c217d954SCole Faust
1890*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
1891*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
1892*c217d954SCole Faust
// 5..7 elements: 4-wide piece plus a 1/2/3-element remainder at PTR + 4.
1893*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
1894*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1895*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
1896*c217d954SCole Faust
1897*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
1898*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1899*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
1900*c217d954SCole Faust
1901*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
1902*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
1903*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
1904*c217d954SCole Faust
1905*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
1906*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
1907*c217d954SCole Faust
// 9..15 elements: 8-wide piece plus a 1..7-element remainder at PTR + 8.
1908*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
1909*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1910*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
1911*c217d954SCole Faust
1912*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
1913*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1914*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
1915*c217d954SCole Faust
1916*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
1917*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1918*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
1919*c217d954SCole Faust
1920*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
1921*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1922*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
1923*c217d954SCole Faust
1924*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
1925*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1926*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
1927*c217d954SCole Faust
1928*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
1929*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1930*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
1931*c217d954SCole Faust
1932*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
1933*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
1934*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
1935*c217d954SCole Faust
1936*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
1937*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
1938*c217d954SCole Faust
1939*c217d954SCole Faust
1940*c217d954SCole Faust
// PIXEL_UNIT<vec_size> is vec_size / 4 for vector sizes 4, 8 and 16. The
// values 1, 2 and 4 match the xN suffix of the read_image2d_* and
// write_image2d_* macros, which move N four-component texels per call.
1941*c217d954SCole Faust#define PIXEL_UNIT4 1
1942*c217d954SCole Faust#define PIXEL_UNIT8 2
1943*c217d954SCole Faust#define PIXEL_UNIT16 4
1944*c217d954SCole Faust
1945*c217d954SCole Faust
// Two-level form so that vec_size is macro-expanded before being pasted onto
// PIXEL_UNIT. Only 4, 8 and 16 have a PIXEL_UNIT definition here.
1946*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
1947*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
1948*c217d954SCole Faust
1949*c217d954SCole Faust
// read_image2d_<type>x<N>(img, x_coord, y_coord): read N texels at
// (x_coord .. x_coord + N - 1, y_coord) with read_imagef / read_imageh and
// concatenate them into a 4-, 8- or 16-component vector.
// NOTE(review): every expansion already ends with ';', so a call site that
// adds its own ';' produces an empty statement and the macro cannot be nested
// inside a larger expression.
1950*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
1951*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
1952*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
1953*c217d954SCole Faust
// Half-precision readers exist only when both ARM_COMPUTE_OPENCL_FP16_ENABLED
// and cl_khr_fp16 are defined.
1954*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1955*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
1956*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
1957*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
1958*c217d954SCole Faust#endif
1959*c217d954SCole Faust
// write_image2d_<type>x<N>(img, x_coord, y_coord, values): write N texels
// starting at (x_coord, y_coord). For N > 1 the vector is split into
// consecutive 4-component swizzles (s0123, s4567, s89AB, sCDEF) and the writes
// are chained with the comma operator inside one parenthesised expression.
// These expansions also end with ';'.
1960*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
1961*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
1962*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1963*c217d954SCole Faust
// Half-precision writers, under the same FP16 guard as the readers.
1964*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
1965*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
1966*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
1967*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
1968*c217d954SCole Faust#endif
1969*c217d954SCole Faust
1970*c217d954SCole Faust
// READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) selects
// read_image2d_<data_type>x<n0>. The _STR level does the token pasting; the
// outer macro exists so data_type and n0 are macro-expanded first.
1971*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
1972*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
1973*c217d954SCole Faust
1974*c217d954SCole Faust
// WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) selects
// write_image2d_<data_type>x<n0> in the same two-level fashion.
1975*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
1976*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
1977*c217d954SCole Faust
// VSTORE(size) yields the identifier vstore<size>; the call arguments follow
// the macro at the use site. The two-level form lets size itself be a macro.
1978*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
1979*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
1980*c217d954SCole Faust
// <type>1 aliases: make the result of pasting a type name with size 1 (as done
// by VEC_DATA_TYPE_STR and the select/signed_int tables) name the plain scalar
// type.
1981*c217d954SCole Faust#define float1 float
1982*c217d954SCole Faust#define half1 half
1983*c217d954SCole Faust#define char1 char
1984*c217d954SCole Faust#define uchar1 uchar
1985*c217d954SCole Faust#define short1 short
1986*c217d954SCole Faust#define ushort1 ushort
1987*c217d954SCole Faust#define int1 int
1988*c217d954SCole Faust#define uint1 uint
1989*c217d954SCole Faust#define long1 long
1990*c217d954SCole Faust#define ulong1 ulong
1991*c217d954SCole Faust#define double1 double
1992*c217d954SCole Faust
// Scalar counterparts of the vloadN / vstoreN built-ins, so that VLOAD(1) and
// VSTORE(1) resolve to something callable.
//   vload1(OFFSET, PTR)        -> value of the element at PTR + OFFSET
//   vstore1(DATA, OFFSET, PTR) -> stores DATA to the element at PTR + OFFSET
// Arguments are parenthesised so that an argument containing a lower-precedence
// operator (for example a ?: expression as PTR) is still added as a whole, and
// vload1 is wrapped so a postfix operator at the call site applies to the
// loaded value rather than to the address expression.
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) *((OFFSET) + (PTR)) = (DATA)
1995*c217d954SCole Faust
1996*c217d954SCole Faust
// VSTORE_PARTIAL(size, store_size) yields the identifier
// vstore_partial_<size>_<store_size>, which the tables that follow map to a
// concrete store macro or to NO_STORE. The call arguments (data, offset,
// pointer) follow the macro at the use site.
1997*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
1998*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
1999*c217d954SCole Faust
// NO_STORE accepts the same three arguments as a store macro and expands to an
// empty block, i.e. it stores nothing.
2000*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
2001*c217d954SCole Faust    {                             \
2002*c217d954SCole Faust    }
2003*c217d954SCole Faust
2004*c217d954SCole Faust
// Dispatch table: vstore_partial_<N>_<M> names the store used for M elements
// of an N-wide vector. Each entry resolves to vstore_partial_<M> when
// 1 <= M <= N and to NO_STORE when M is 0 or larger than N.
// N = 1: the single valid entry maps straight to vstore1 rather than to
// vstore_partial_1, which accesses DATA.s0.
2005*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
2006*c217d954SCole Faust#define vstore_partial_1_1 vstore1
2007*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
2008*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
2009*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
2010*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
2011*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
2012*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
2013*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
2014*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
2015*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
2016*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
2017*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
2018*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
2019*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
2020*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
2021*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
2022*c217d954SCole Faust
// N = 2
2023*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
2024*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
2025*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
2026*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
2027*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
2028*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
2029*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
2030*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
2031*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
2032*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
2033*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
2034*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
2035*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
2036*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
2037*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
2038*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
2039*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
2040*c217d954SCole Faust
// N = 3
2041*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
2042*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
2043*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
2044*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
2045*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
2046*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
2047*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
2048*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
2049*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
2050*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
2051*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
2052*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
2053*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
2054*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
2055*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
2056*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
2057*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
2058*c217d954SCole Faust
// N = 4
2059*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
2060*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
2061*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
2062*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
2063*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
2064*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
2065*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
2066*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
2067*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
2068*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
2069*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
2070*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
2071*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
2072*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
2073*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
2074*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
2075*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
2076*c217d954SCole Faust
// N = 8
2077*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
2078*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
2079*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
2080*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
2081*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
2082*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
2083*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
2084*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
2085*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
2086*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
2087*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
2088*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
2089*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
2090*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
2091*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
2092*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
2093*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
2094*c217d954SCole Faust
// N = 16 (every M from 1 to 16 has a store)
2095*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
2096*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
2097*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
2098*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
2099*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
2100*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
2101*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
2102*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
2103*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
2104*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
2105*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
2106*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
2107*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
2108*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
2109*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
2110*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
2111*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
2112*c217d954SCole Faust
2113*c217d954SCole Faust
// vstore_partial_<M>(DATA, OFFSET, PTR): store the M lowest components of DATA
// to memory at PTR.
// M = 1, 2, 3, 4, 8 and 16 expand to a single vstore<M>. The other sizes are
// composed of a 4- or 8-wide piece followed by a remainder piece written at
// PTR + 4 or PTR + 8. For M = 13..15 the remainder is taken from the
// DATA.s89abcdef swizzle by the 5/6/7-element variant.
// The upper-half swizzles here are spelled with lowercase hex digits, while
// the vload_partial_* macros use uppercase.
// NOTE(review): the remainder piece shifts PTR by a fixed element count but
// forwards OFFSET unchanged. The OpenCL vstoreN built-ins scale their offset
// by N, so the pieces appear contiguous only when OFFSET is 0 - confirm
// callers.
// NOTE(review): each expansion is one or more ';'-terminated statements, so
// these macros are not safe as the body of an unbraced if/else.
2114*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
2115*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
2116*c217d954SCole Faust
2117*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
2118*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
2119*c217d954SCole Faust
2120*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
2121*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
2122*c217d954SCole Faust
2123*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
2124*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
2125*c217d954SCole Faust
// 5..7 elements: 4-wide piece plus a 1/2/3-element remainder at PTR + 4.
2126*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
2127*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2128*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
2129*c217d954SCole Faust
2130*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
2131*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2132*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
2133*c217d954SCole Faust
2134*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
2135*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
2136*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
2137*c217d954SCole Faust
2138*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
2139*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
2140*c217d954SCole Faust
// 9..15 elements: 8-wide piece plus a 1..7-element remainder at PTR + 8.
2141*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
2142*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2143*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
2144*c217d954SCole Faust
2145*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
2146*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2147*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
2148*c217d954SCole Faust
2149*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
2150*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2151*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
2152*c217d954SCole Faust
2153*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
2154*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2155*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
2156*c217d954SCole Faust
2157*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
2158*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2159*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
2160*c217d954SCole Faust
2161*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
2162*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2163*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
2164*c217d954SCole Faust
2165*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
2166*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
2167*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
2168*c217d954SCole Faust
2169*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
2170*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
2171*c217d954SCole Faust
2172*c217d954SCole Faust
2173*c217d954SCole Faust
2174*c217d954SCole Faust
2175*c217d954SCole Faust
// Saturating-conversion aliases for floating-point destinations. The names
// convert_float<N>_sat / convert_half<N>_sat are what CONVERT_SAT pastes
// together for a float or half destination; each one is mapped here to the
// plain convert_<type><N> of the same type and width.
// convert_half_sat maps to convert_half so that it agrees with
// convert_half1_sat and with the float aliases (it previously named
// convert_float, which yields a float rather than a half).
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
2190*c217d954SCole Faust
// convert_<type>1 aliases: a conversion name pasted with vector size 1 maps to
// the scalar convert_<type>.
2191*c217d954SCole Faust#define convert_float1 convert_float
2192*c217d954SCole Faust#define convert_half1 convert_half
2193*c217d954SCole Faust#define convert_char1 convert_char
2194*c217d954SCole Faust#define convert_uchar1 convert_uchar
2195*c217d954SCole Faust#define convert_short1 convert_short
2196*c217d954SCole Faust#define convert_ushort1 convert_ushort
2197*c217d954SCole Faust#define convert_int1 convert_int
2198*c217d954SCole Faust#define convert_uint1 convert_uint
2199*c217d954SCole Faust#define convert_long1 convert_long
2200*c217d954SCole Faust#define convert_ulong1 convert_ulong
2201*c217d954SCole Faust#define convert_double1 convert_double
2202*c217d954SCole Faust
// convert_<type>1_sat aliases map to the scalar convert_<type>_sat.
// The convert_uchar2_sat .. convert_uchar16_sat entries define each name as
// itself; a self-referential macro is not re-expanded, so they leave the
// identifier unchanged.
2203*c217d954SCole Faust#define convert_char1_sat convert_char_sat
2204*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
2205*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
2206*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
2207*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
2208*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
2209*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
2210*c217d954SCole Faust#define convert_short1_sat convert_short_sat
2211*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
2212*c217d954SCole Faust#define convert_int1_sat convert_int_sat
2213*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
2214*c217d954SCole Faust#define convert_long1_sat convert_long_sat
2215*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
2216*c217d954SCole Faust#define convert_double1_sat convert_double_sat
2217*c217d954SCole Faust
// VEC_DATA_TYPE(type, size) pastes type and size into one type name, e.g.
// (float, 4) gives float4. Here and in the macros that follow, the _STR level
// does the pasting and the outer macro lets the arguments be macro-expanded
// first.
2218*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
2219*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
2220*c217d954SCole Faust
// CONVERT(x, type) expands to (convert_<type>((x))).
2221*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
2222*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
2223*c217d954SCole Faust
// CONVERT_SAT(x, type) expands to (convert_<type>_sat((x))).
2224*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
2225*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
2226*c217d954SCole Faust
// CONVERT_SAT_ROUND(x, type, round) expands to
// (convert_<type>_sat_<round>((x))); round is pasted as given.
2227*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
2228*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
2229*c217d954SCole Faust
// select_vec_dt_<type>(size): integer vector type paired with <type>. Integer
// types map to themselves, half maps to short<size> and float maps to
// int<size>.
// NOTE(review): presumably the condition/mask type expected by select() for
// operands of <type> - confirm against the call sites.
2230*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
2231*c217d954SCole Faust#define select_vec_dt_char(size) char##size
2232*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
2233*c217d954SCole Faust#define select_vec_dt_short(size) short##size
2234*c217d954SCole Faust#define select_vec_dt_half(size) short##size
2235*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
2236*c217d954SCole Faust#define select_vec_dt_int(size) int##size
2237*c217d954SCole Faust#define select_vec_dt_float(size) int##size
2238*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
2239*c217d954SCole Faust#define select_vec_dt_long(size) long##size
2240*c217d954SCole Faust
// SELECT_VEC_DATA_TYPE(type, size) dispatches to select_vec_dt_<type>(size);
// SELECT_DATA_TYPE(type) is the size-1 form, which resolves to a scalar type
// through the <type>1 aliases.
2241*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
2242*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
2243*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
2244*c217d954SCole Faust
// signed_int_vec_dt_<type>(size): signed integer vector type paired with
// <type>. Unsigned types map to their signed counterpart, signed types map to
// themselves, half maps to short<size> and float maps to int<size>.
2245*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
2246*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
2247*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
2248*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
2249*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
2250*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
2251*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
2252*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
2253*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
2254*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
2255*c217d954SCole Faust
// SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches to
// signed_int_vec_dt_<type>(size); SIGNED_INT_DATA_TYPE(type) is the size-1
// form.
2256*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
2257*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
2258*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
2259*c217d954SCole Faust
// sum_reduce_<N>(x): sum of the N components of x, built recursively from the
// two halves of the vector (size 3 is s01 plus s2). These helpers expand to a
// bare chain of '+' terms that is evaluated left to right; they are kept
// unparenthesised on purpose so that this evaluation order, and therefore
// floating-point rounding, stays exactly as before.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

// SUM_REDUCE(x, size): public entry point. The whole chain is wrapped in
// parentheses here so the result behaves as a single operand, e.g.
// k - SUM_REDUCE(v, 2) subtracts the full sum instead of only the first term.
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
2269*c217d954SCole Faust
// prod_reduce_<N>(x): product of the N components of x, built recursively from
// the two halves of the vector (size 3 is s01 times s2). These helpers expand
// to a bare chain of '*' terms that is evaluated left to right; they are kept
// unparenthesised on purpose so that this evaluation order, and therefore
// floating-point rounding, stays exactly as before.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

// PROD_REDUCE(x, size): public entry point. The whole chain is wrapped in
// parentheses here so the result behaves as a single operand, e.g.
// k / PROD_REDUCE(v, 2) divides by the full product instead of dividing by the
// first term and multiplying by the rest.
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
2279*c217d954SCole Faust
// max_reduce_<N>(x): maximum of the N components of x, computed as nested
// max() calls over the two halves of the vector (size 3 is s01 against s2).
// Every expansion for N >= 2 is a single max(...) call, so it can be used as
// an operand without extra parentheses.
2280*c217d954SCole Faust#define max_reduce_1(x) (x)
2281*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
2282*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
2283*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
2284*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
2285*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
2286*c217d954SCole Faust
// MAX_REDUCE(x, size) dispatches to max_reduce_<size>(x); the two-level form
// lets size be a macro.
2287*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
2288*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
2289*c217d954SCole Faust
// <KIND>_DECLARATION(name): expands to a comma-separated parameter list for
// one buffer argument called name. The list is a __global uchar pointer
// name_ptr, then a uint stride and a uint step for each dimension, then
// name_offset_first_element_in_bytes. The expansion has no trailing comma.
// VECTOR: dimension x only.
2290*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
2291*c217d954SCole Faust    __global uchar *name##_ptr,      \
2292*c217d954SCole Faust    uint        name##_stride_x, \
2293*c217d954SCole Faust    uint        name##_step_x,   \
2294*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2295*c217d954SCole Faust
// IMAGE: dimensions x and y.
2296*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
2297*c217d954SCole Faust    __global uchar *name##_ptr,      \
2298*c217d954SCole Faust    uint        name##_stride_x, \
2299*c217d954SCole Faust    uint        name##_step_x,   \
2300*c217d954SCole Faust    uint        name##_stride_y, \
2301*c217d954SCole Faust    uint        name##_step_y,   \
2302*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2303*c217d954SCole Faust
// TENSOR3D: dimensions x, y and z.
2304*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
2305*c217d954SCole Faust    __global uchar *name##_ptr,      \
2306*c217d954SCole Faust    uint        name##_stride_x, \
2307*c217d954SCole Faust    uint        name##_step_x,   \
2308*c217d954SCole Faust    uint        name##_stride_y, \
2309*c217d954SCole Faust    uint        name##_step_y,   \
2310*c217d954SCole Faust    uint        name##_stride_z, \
2311*c217d954SCole Faust    uint        name##_step_z,   \
2312*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2313*c217d954SCole Faust
// TENSOR4D: dimensions x, y, z and w.
2314*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
2315*c217d954SCole Faust    __global uchar *name##_ptr,      \
2316*c217d954SCole Faust    uint        name##_stride_x, \
2317*c217d954SCole Faust    uint        name##_step_x,   \
2318*c217d954SCole Faust    uint        name##_stride_y, \
2319*c217d954SCole Faust    uint        name##_step_y,   \
2320*c217d954SCole Faust    uint        name##_stride_z, \
2321*c217d954SCole Faust    uint        name##_step_z,   \
2322*c217d954SCole Faust    uint        name##_stride_w, \
2323*c217d954SCole Faust    uint        name##_step_w,   \
2324*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2325*c217d954SCole Faust
// TENSOR5D: dimensions x, y, z, w and v.
2326*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
2327*c217d954SCole Faust    __global uchar *name##_ptr,      \
2328*c217d954SCole Faust    uint        name##_stride_x, \
2329*c217d954SCole Faust    uint        name##_step_x,   \
2330*c217d954SCole Faust    uint        name##_stride_y, \
2331*c217d954SCole Faust    uint        name##_step_y,   \
2332*c217d954SCole Faust    uint        name##_stride_z, \
2333*c217d954SCole Faust    uint        name##_step_z,   \
2334*c217d954SCole Faust    uint        name##_stride_w, \
2335*c217d954SCole Faust    uint        name##_step_w,   \
2336*c217d954SCole Faust    uint        name##_stride_v, \
2337*c217d954SCole Faust    uint        name##_step_v,   \
2338*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
2339*c217d954SCole Faust
// CONVERT_TO_*_STRUCT(name): expand to a call of the matching
// update_*_workitem_ptr helper (defined outside this excerpt), passing the
// name_ptr, name_offset_first_element_in_bytes and per-dimension stride/step
// identifiers that the *_DECLARATION macros introduce.
// The _NO_STEP variants pass a literal 0 in place of the x (and y) step.
2340*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
2341*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
2342*c217d954SCole Faust
2343*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
2344*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
2345*c217d954SCole Faust
2346*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
2347*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
2348*c217d954SCole Faust
2349*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
2350*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
2351*c217d954SCole Faust
// Image view of a 3D tensor: x/y/z strides and steps are all forwarded.
2352*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2353*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2354*c217d954SCole Faust
// This NO_STEP variant zeroes the x and y steps but still forwards
// name##_step_z.
2355*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
2356*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
2357*c217d954SCole Faust
// NOTE(review): this repeats the CONVERT_TENSOR3D_TO_IMAGE_STRUCT definition
// given two macros earlier with the same parameter and replacement list. An
// identical redefinition is accepted by the preprocessor, so it is redundant
// rather than harmful.
2358*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
2359*c217d954SCole Faust    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
2360*c217d954SCole Faust
2361*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
2362*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2363*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
2364*c217d954SCole Faust
2365*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
2366*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
2367*c217d954SCole Faust
2368*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
2369*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2370*c217d954SCole Faust                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
2371*c217d954SCole Faust
2372*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
2373*c217d954SCole Faust    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
2374*c217d954SCole Faust
2375*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
2376*c217d954SCole Faust    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
2377*c217d954SCole Faust                           name##_stride_z, name##_step_z)
2378*c217d954SCole Faust
2379*c217d954SCole Faust
2380*c217d954SCole Fausttypedef struct Vector
2381*c217d954SCole Faust{
2382*c217d954SCole Faust    __global uchar *ptr;
2383*c217d954SCole Faust    int             offset_first_element_in_bytes;
2384*c217d954SCole Faust    int             stride_x;
2385*c217d954SCole Faust} Vector;
2386*c217d954SCole Faust
2387*c217d954SCole Faust
2388*c217d954SCole Fausttypedef struct Image
2389*c217d954SCole Faust{
2390*c217d954SCole Faust    __global uchar *ptr;
2391*c217d954SCole Faust    int             offset_first_element_in_bytes;
2392*c217d954SCole Faust    int             stride_x;
2393*c217d954SCole Faust    int             stride_y;
2394*c217d954SCole Faust} Image;
2395*c217d954SCole Faust
2396*c217d954SCole Faust
2397*c217d954SCole Fausttypedef struct Tensor3D
2398*c217d954SCole Faust{
2399*c217d954SCole Faust    __global uchar *ptr;
2400*c217d954SCole Faust    int             offset_first_element_in_bytes;
2401*c217d954SCole Faust    int             stride_x;
2402*c217d954SCole Faust    int             stride_y;
2403*c217d954SCole Faust    int             stride_z;
2404*c217d954SCole Faust} Tensor3D;
2405*c217d954SCole Faust
2406*c217d954SCole Faust
2407*c217d954SCole Fausttypedef struct Tensor4D
2408*c217d954SCole Faust{
2409*c217d954SCole Faust    __global uchar *ptr;
2410*c217d954SCole Faust    int             offset_first_element_in_bytes;
2411*c217d954SCole Faust    int             stride_x;
2412*c217d954SCole Faust    int             stride_y;
2413*c217d954SCole Faust    int             stride_z;
2414*c217d954SCole Faust    int             stride_w;
2415*c217d954SCole Faust} Tensor4D;
2416*c217d954SCole Faust
2417*c217d954SCole Faust
2418*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
2419*c217d954SCole Faust{
2420*c217d954SCole Faust    Vector vector =
2421*c217d954SCole Faust    {
2422*c217d954SCole Faust        .ptr                           = ptr,
2423*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2424*c217d954SCole Faust        .stride_x                      = stride_x,
2425*c217d954SCole Faust    };
2426*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
2427*c217d954SCole Faust    return vector;
2428*c217d954SCole Faust}
2429*c217d954SCole Faust
2430*c217d954SCole Faust
2431*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
2432*c217d954SCole Faust{
2433*c217d954SCole Faust    Image img =
2434*c217d954SCole Faust    {
2435*c217d954SCole Faust        .ptr                           = ptr,
2436*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2437*c217d954SCole Faust        .stride_x                      = stride_x,
2438*c217d954SCole Faust        .stride_y                      = stride_y
2439*c217d954SCole Faust    };
2440*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
2441*c217d954SCole Faust    return img;
2442*c217d954SCole Faust}
2443*c217d954SCole Faust
2444*c217d954SCole Faust
2445*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2446*c217d954SCole Faust{
2447*c217d954SCole Faust    Image img =
2448*c217d954SCole Faust    {
2449*c217d954SCole Faust        .ptr                           = ptr,
2450*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2451*c217d954SCole Faust        .stride_x                      = stride_x,
2452*c217d954SCole Faust        .stride_y                      = stride_y
2453*c217d954SCole Faust    };
2454*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2455*c217d954SCole Faust    return img;
2456*c217d954SCole Faust}
2457*c217d954SCole Faust
2458*c217d954SCole Faust
2459*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2460*c217d954SCole Faust{
2461*c217d954SCole Faust    Tensor3D tensor =
2462*c217d954SCole Faust    {
2463*c217d954SCole Faust        .ptr                           = ptr,
2464*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2465*c217d954SCole Faust        .stride_x                      = stride_x,
2466*c217d954SCole Faust        .stride_y                      = stride_y,
2467*c217d954SCole Faust        .stride_z                      = stride_z
2468*c217d954SCole Faust    };
2469*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
2470*c217d954SCole Faust    return tensor;
2471*c217d954SCole Faust}
2472*c217d954SCole Faust
2473*c217d954SCole Faust
2474*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
2475*c217d954SCole Faust{
2476*c217d954SCole Faust    Tensor3D tensor =
2477*c217d954SCole Faust    {
2478*c217d954SCole Faust        .ptr                           = ptr,
2479*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2480*c217d954SCole Faust        .stride_x                      = stride_x,
2481*c217d954SCole Faust        .stride_y                      = stride_y,
2482*c217d954SCole Faust        .stride_z                      = stride_z
2483*c217d954SCole Faust    };
2484*c217d954SCole Faust    return tensor;
2485*c217d954SCole Faust}
2486*c217d954SCole Faust
2487*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
2488*c217d954SCole Faust                                             uint step_w,
2489*c217d954SCole Faust                                             uint mod_size)
2490*c217d954SCole Faust{
2491*c217d954SCole Faust    Tensor4D tensor =
2492*c217d954SCole Faust    {
2493*c217d954SCole Faust        .ptr                           = ptr,
2494*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
2495*c217d954SCole Faust        .stride_x                      = stride_x,
2496*c217d954SCole Faust        .stride_y                      = stride_y,
2497*c217d954SCole Faust        .stride_z                      = stride_z,
2498*c217d954SCole Faust        .stride_w                      = stride_w
2499*c217d954SCole Faust    };
2500*c217d954SCole Faust
2501*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
2502*c217d954SCole Faust    return tensor;
2503*c217d954SCole Faust}
2504*c217d954SCole Faust
2505*c217d954SCole Faust
2506*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
2507*c217d954SCole Faust{
2508*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
2509*c217d954SCole Faust}
2510*c217d954SCole Faust
2511*c217d954SCole Faust
2512*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
2513*c217d954SCole Faust{
2514*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
2515*c217d954SCole Faust}
2516*c217d954SCole Faust
2517*c217d954SCole Faust
2518*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
2519*c217d954SCole Faust{
2520*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
2521*c217d954SCole Faust}
2522*c217d954SCole Faust
2523*c217d954SCole Faust
2524*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
2525*c217d954SCole Faust{
2526*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
2527*c217d954SCole Faust}
2528*c217d954SCole Faust
2529*c217d954SCole Faust
2530*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
2531*c217d954SCole Faust{
2532*c217d954SCole Faust    uint num_elements = width * height;
2533*c217d954SCole Faust
2534*c217d954SCole Faust    const uint z = index / num_elements;
2535*c217d954SCole Faust
2536*c217d954SCole Faust    index %= num_elements;
2537*c217d954SCole Faust
2538*c217d954SCole Faust    const uint y = index / width;
2539*c217d954SCole Faust
2540*c217d954SCole Faust    index %= width;
2541*c217d954SCole Faust
2542*c217d954SCole Faust    const uint x = index;
2543*c217d954SCole Faust
2544*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
2545*c217d954SCole Faust}
2546*c217d954SCole Faust
2547*c217d954SCole Faust#endif
2548*c217d954SCole Faust
2549*c217d954SCole Faust
/* SCALAR_ACCESS(col, n0, x) selects the component group of `x` named by the
 * matching scalar_access_<col>_<n0> macro below, i.e. n0 consecutive `.s`
 * components starting at component `col`. The two-level definition lets the
 * arguments be macro-expanded before they are pasted into the macro name. */
#define SCALAR_ACCESS_STR(col, n0, x) scalar_access_##col##_##n0(x)
#define SCALAR_ACCESS(col, n0, x) SCALAR_ACCESS_STR(col, n0, x)

/* Starting at component 0 */
#define scalar_access_0_1(x)  ((x).s0)
#define scalar_access_0_2(x)  ((x).s01)
#define scalar_access_0_3(x)  ((x).s012)
#define scalar_access_0_4(x)  ((x).s0123)
#define scalar_access_0_8(x)  ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

/* Starting at component 1 */
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)

/* Starting at component 2 */
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)

/* Starting at component 3 */
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)

/* Starting at component 4 */
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)

/* Starting at component 8 */
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)

/* Starting at component 12 */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

/* Column value 16: maps to component F */
#define scalar_access_16_1(x) ((x).sF)
2605*c217d954SCole Faust
/* LOAD_TENSOR_ROW_<M>: loads M rows, one statement per row. Row i is read with
 * VLOAD(N0) from PTR + i * STRIDE_Y + Z##i and assigned to the components of
 * BASENAME##i selected by SCALAR_ACCESS(COL_OFFSET, N0, ...). Each macro first
 * expands the macro for M - 1 rows and then adds its own row. Rows 10 to 15
 * use the suffixes A to F on BASENAME and Z. LOAD_TENSOR_ROW_0 expands to an
 * empty statement expression. */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = \
        VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* LOAD_TENSOR(M0, ...) dispatches to LOAD_TENSOR_ROW_<M0>. The extra
 * LOAD_TENSOR_STR level lets M0 be macro-expanded before it is pasted. */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
2676*c217d954SCole Faust
2677*c217d954SCole Faust
2678*c217d954SCole Faust
/* LOAD_TENSOR_M0X<N0>: loads an M0 x N0 block by issuing one or more
 * LOAD_TENSOR calls, each covering a column group:
 *   - N0 = 1..4, 8, 16: a single LOAD_TENSOR of width N0 at column 0.
 *   - N0 = 5..7:        width 4 at column 0, then the remainder at column 4.
 *   - N0 = 9..12:       width 8 at column 0, then the remainder at column 8.
 *   - N0 = 13..15:      width 8 at column 0, width 4 at column 8, then the
 *                       remainder at column 12.
 * For every group after the first, input_ptr is advanced by
 * <column> * sizeof(DATA_TYPE) and the same column is passed as COL_OFFSET.
 * LOAD_TENSOR_M0X0 expands to an empty statement expression.
 *
 * Fix: the X9 and X14 variants previously passed `input_ptr 0` with the comma
 * missing, which gave LOAD_TENSOR seven arguments instead of eight. */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* LOAD_TENSOR_M0XN0(M0, N0, ...) dispatches to LOAD_TENSOR_M0X<N0>. The extra
 * _STR level lets N0 be macro-expanded before it is pasted into the name. */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
2747*c217d954SCole Faust
2748*c217d954SCole Faust
2749*c217d954SCole Faust#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2750*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2751*c217d954SCole Faust    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2752*c217d954SCole Faust
2753*c217d954SCole Faust#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2754*c217d954SCole Faust    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2755*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2756*c217d954SCole Faust    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2757*c217d954SCole Faust
2758*c217d954SCole Faust#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2759*c217d954SCole Faust    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2760*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2761*c217d954SCole Faust    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2762*c217d954SCole Faust
2763*c217d954SCole Faust#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2764*c217d954SCole Faust    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2765*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2766*c217d954SCole Faust    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2767*c217d954SCole Faust
2768*c217d954SCole Faust#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2769*c217d954SCole Faust    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2770*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2771*c217d954SCole Faust    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2772*c217d954SCole Faust
2773*c217d954SCole Faust#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2774*c217d954SCole Faust    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2775*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2776*c217d954SCole Faust    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2777*c217d954SCole Faust
2778*c217d954SCole Faust#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2779*c217d954SCole Faust    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2780*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2781*c217d954SCole Faust    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2782*c217d954SCole Faust
/** LOAD_ROW_8: expands LOAD_ROW_7 first, then declares BASENAME7 with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 7 * STRIDE_Y + Z7 cast to a pointer to __global DATA_TYPE.
 */
2783*c217d954SCole Faust#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2784*c217d954SCole Faust    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2785*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2786*c217d954SCole Faust    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2787*c217d954SCole Faust
/** LOAD_ROW_9: expands LOAD_ROW_8 first, then declares BASENAME8 with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 8 * STRIDE_Y + Z8 cast to a pointer to __global DATA_TYPE.
 */
2788*c217d954SCole Faust#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2789*c217d954SCole Faust    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2790*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
2791*c217d954SCole Faust    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2792*c217d954SCole Faust
/** LOAD_ROW_10: expands LOAD_ROW_9 first, then declares BASENAME9 with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 9 * STRIDE_Y + Z9 cast to a pointer to __global DATA_TYPE.
 */
2793*c217d954SCole Faust#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2794*c217d954SCole Faust    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2795*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2796*c217d954SCole Faust    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2797*c217d954SCole Faust
/** LOAD_ROW_11: expands LOAD_ROW_10 first, then declares BASENAMEA with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 10 * STRIDE_Y + ZA cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 10 uses the pasted suffix A (hexadecimal digit).
 */
2798*c217d954SCole Faust#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2799*c217d954SCole Faust    LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2800*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2801*c217d954SCole Faust    BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2802*c217d954SCole Faust
/** LOAD_ROW_12: expands LOAD_ROW_11 first, then declares BASENAMEB with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 11 * STRIDE_Y + ZB cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 11 uses the pasted suffix B (hexadecimal digit).
 */
2803*c217d954SCole Faust#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2804*c217d954SCole Faust    LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2805*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2806*c217d954SCole Faust    BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2807*c217d954SCole Faust
/** LOAD_ROW_13: expands LOAD_ROW_12 first, then declares BASENAMEC with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 12 * STRIDE_Y + ZC cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 12 uses the pasted suffix C (hexadecimal digit).
 */
2808*c217d954SCole Faust#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2809*c217d954SCole Faust    LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2810*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2811*c217d954SCole Faust    BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2812*c217d954SCole Faust
/** LOAD_ROW_14: expands LOAD_ROW_13 first, then declares BASENAMED with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 13 * STRIDE_Y + ZD cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 13 uses the pasted suffix D (hexadecimal digit).
 */
2813*c217d954SCole Faust#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2814*c217d954SCole Faust    LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2815*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2816*c217d954SCole Faust    BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2817*c217d954SCole Faust
/** LOAD_ROW_15: expands LOAD_ROW_14 first, then declares BASENAMEE with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 14 * STRIDE_Y + ZE cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 14 uses the pasted suffix E (hexadecimal digit).
 */
2818*c217d954SCole Faust#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2819*c217d954SCole Faust    LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2820*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2821*c217d954SCole Faust    BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2822*c217d954SCole Faust
/** LOAD_ROW_16: expands LOAD_ROW_15 first, then declares BASENAMEF with type
 * VEC_DATA_TYPE(DATA_TYPE, N0) and initializes it with VLOAD(N0)(0, p), where
 * p is PTR + OFFSET + 15 * STRIDE_Y + ZF cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 15 uses the pasted suffix F (hexadecimal digit).
 */
2823*c217d954SCole Faust#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2824*c217d954SCole Faust    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2825*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
2826*c217d954SCole Faust    BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2827*c217d954SCole Faust
2828*c217d954SCole Faust
2829*c217d954SCole Faust
2830*c217d954SCole Faust
/** LOAD_BLOCK / LOAD_BLOCK_STR: select the row-loading macro for M0 rows.
 * LOAD_BLOCK_STR pastes M0 onto LOAD_ROW_ and forwards the remaining arguments
 * unchanged. LOAD_BLOCK only forwards to LOAD_BLOCK_STR; this extra level lets
 * an M0 argument that is itself a macro be expanded before the ## paste.
 */
2831*c217d954SCole Faust#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2832*c217d954SCole Faust#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2833*c217d954SCole Faust
2834*c217d954SCole Faust
2835*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_1: calls VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME0, 0, p), where
 * p is PTR + OFFSET + 0 * STRIDE_Y + Z0 cast to a pointer to __global DATA_TYPE.
 * BASENAME0 is passed as an argument and is not declared by this macro, so it
 * must already exist at the point of use.
 */
2836*c217d954SCole Faust#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2837*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2838*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
2839*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_2: expands LOAD_ROW_PARTIAL_1, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME1, 0, p) with p = PTR + OFFSET +
 * 1 * STRIDE_Y + Z1 cast to a pointer to __global DATA_TYPE.
 * BASENAME1 is not declared here and must already exist.
 */
2840*c217d954SCole Faust#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2841*c217d954SCole Faust    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2842*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2843*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
2844*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_3: expands LOAD_ROW_PARTIAL_2, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME2, 0, p) with p = PTR + OFFSET +
 * 2 * STRIDE_Y + Z2 cast to a pointer to __global DATA_TYPE.
 * BASENAME2 is not declared here and must already exist.
 */
2845*c217d954SCole Faust#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2846*c217d954SCole Faust    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2847*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2848*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
2849*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_4: expands LOAD_ROW_PARTIAL_3, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME3, 0, p) with p = PTR + OFFSET +
 * 3 * STRIDE_Y + Z3 cast to a pointer to __global DATA_TYPE.
 * BASENAME3 is not declared here and must already exist.
 */
2850*c217d954SCole Faust#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2851*c217d954SCole Faust    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2852*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2853*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
2854*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_5: expands LOAD_ROW_PARTIAL_4, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME4, 0, p) with p = PTR + OFFSET +
 * 4 * STRIDE_Y + Z4 cast to a pointer to __global DATA_TYPE.
 * BASENAME4 is not declared here and must already exist.
 */
2855*c217d954SCole Faust#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2856*c217d954SCole Faust    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2857*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2858*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
2859*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_6: expands LOAD_ROW_PARTIAL_5, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME5, 0, p) with p = PTR + OFFSET +
 * 5 * STRIDE_Y + Z5 cast to a pointer to __global DATA_TYPE.
 * BASENAME5 is not declared here and must already exist.
 */
2860*c217d954SCole Faust#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2861*c217d954SCole Faust    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2862*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2863*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
2864*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_7: expands LOAD_ROW_PARTIAL_6, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME6, 0, p) with p = PTR + OFFSET +
 * 6 * STRIDE_Y + Z6 cast to a pointer to __global DATA_TYPE.
 * BASENAME6 is not declared here and must already exist.
 */
2865*c217d954SCole Faust#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2866*c217d954SCole Faust    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2867*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2868*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
2869*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_8: expands LOAD_ROW_PARTIAL_7, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME7, 0, p) with p = PTR + OFFSET +
 * 7 * STRIDE_Y + Z7 cast to a pointer to __global DATA_TYPE.
 * BASENAME7 is not declared here and must already exist.
 */
2870*c217d954SCole Faust#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2871*c217d954SCole Faust    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2872*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2873*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
2874*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_9: expands LOAD_ROW_PARTIAL_8, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME8, 0, p) with p = PTR + OFFSET +
 * 8 * STRIDE_Y + Z8 cast to a pointer to __global DATA_TYPE.
 * BASENAME8 is not declared here and must already exist.
 */
2875*c217d954SCole Faust#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2876*c217d954SCole Faust    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2877*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                         \
2878*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
2879*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_10: expands LOAD_ROW_PARTIAL_9, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME9, 0, p) with p = PTR + OFFSET +
 * 9 * STRIDE_Y + Z9 cast to a pointer to __global DATA_TYPE.
 * BASENAME9 is not declared here and must already exist.
 */
2880*c217d954SCole Faust#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2881*c217d954SCole Faust    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
2882*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2883*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
2884*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_11: expands LOAD_ROW_PARTIAL_10, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMEA, 0, p) with p = PTR + OFFSET +
 * 10 * STRIDE_Y + ZA cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 10 uses the pasted suffix A. BASENAMEA is not
 * declared here and must already exist.
 */
2885*c217d954SCole Faust#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2886*c217d954SCole Faust    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2887*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2888*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
2889*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_12: expands LOAD_ROW_PARTIAL_11, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMEB, 0, p) with p = PTR + OFFSET +
 * 11 * STRIDE_Y + ZB cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 11 uses the pasted suffix B. BASENAMEB is not
 * declared here and must already exist.
 */
2890*c217d954SCole Faust#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2891*c217d954SCole Faust    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2892*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2893*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
2894*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_13: expands LOAD_ROW_PARTIAL_12, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMEC, 0, p) with p = PTR + OFFSET +
 * 12 * STRIDE_Y + ZC cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 12 uses the pasted suffix C. BASENAMEC is not
 * declared here and must already exist.
 */
2895*c217d954SCole Faust#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2896*c217d954SCole Faust    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2897*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2898*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
2899*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_14: expands LOAD_ROW_PARTIAL_13, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMED, 0, p) with p = PTR + OFFSET +
 * 13 * STRIDE_Y + ZD cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 13 uses the pasted suffix D. BASENAMED is not
 * declared here and must already exist.
 */
2900*c217d954SCole Faust#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2901*c217d954SCole Faust    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2902*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2903*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
2904*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_15: expands LOAD_ROW_PARTIAL_14, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMEE, 0, p) with p = PTR + OFFSET +
 * 14 * STRIDE_Y + ZE cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 14 uses the pasted suffix E. BASENAMEE is not
 * declared here and must already exist.
 */
2905*c217d954SCole Faust#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2906*c217d954SCole Faust    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2907*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2908*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
2909*c217d954SCole Faust
/** LOAD_ROW_PARTIAL_16: expands LOAD_ROW_PARTIAL_15, then calls
 * VLOAD_PARTIAL(N0, LOAD_N0)(BASENAMEF, 0, p) with p = PTR + OFFSET +
 * 15 * STRIDE_Y + ZF cast to a pointer to __global DATA_TYPE.
 * The row with multiplier 15 uses the pasted suffix F. BASENAMEF is not
 * declared here and must already exist.
 */
2910*c217d954SCole Faust#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
2911*c217d954SCole Faust    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
2912*c217d954SCole Faust    VLOAD_PARTIAL(N0, LOAD_N0)                                                          \
2913*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
2914*c217d954SCole Faust
2915*c217d954SCole Faust
2916*c217d954SCole Faust
/** LOAD_BLOCK_PARTIAL / LOAD_BLOCK_PARTIAL_STR: select the partial row loader
 * for LOAD_M0 rows. LOAD_BLOCK_PARTIAL_STR pastes LOAD_M0 onto LOAD_ROW_PARTIAL_
 * and passes the arguments on with N0 and LOAD_N0 swapped into the order the
 * row macros expect (N0 first, then LOAD_N0). LOAD_BLOCK_PARTIAL only forwards,
 * so a LOAD_M0 argument that is itself a macro is expanded before the ## paste.
 */
2917*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2918*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2919*c217d954SCole Faust
/** LOAD_BLOCK_PARTIAL_IN_X_AND_Y: chooses at run time how much of the block to
 * load, based on the two conditions PARTIAL_COND_Y and PARTIAL_COND_X:
 *   - neither true : LOAD_BLOCK_PARTIAL with M0 rows and N0 elements;
 *   - only Y true  : PARTIAL_STORE_M0 rows and N0 elements;
 *   - only X true  : M0 rows and PARTIAL_STORE_N0 elements;
 *   - both true    : PARTIAL_STORE_M0 rows and PARTIAL_STORE_N0 elements.
 * N0 is always passed as the third LOAD_BLOCK_PARTIAL argument.
 * The expansion is a bare if / else-if / else chain (no enclosing braces or
 * do-while), so take care when using it as the body of an unbraced if.
 */
2920*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2921*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
2922*c217d954SCole Faust    {                                                                                                                                                            \
2923*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
2924*c217d954SCole Faust    }                                                                                                                                                            \
2925*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
2926*c217d954SCole Faust    {                                                                                                                                                            \
2927*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2928*c217d954SCole Faust    }                                                                                                                                                            \
2929*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
2930*c217d954SCole Faust    {                                                                                                                                                            \
2931*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
2932*c217d954SCole Faust    }                                                                                                                                                            \
2933*c217d954SCole Faust    else                                                                                                                                                         \
2934*c217d954SCole Faust    {                                                                                                                                                            \
2935*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
2936*c217d954SCole Faust    }
2937*c217d954SCole Faust
/** LOAD_BLOCK_PARTIAL_IN_X: always loads M0 rows. When PARTIAL_COND_X is false
 * LOAD_BLOCK_PARTIAL is invoked with N0 elements per row; when it is true it is
 * invoked with PARTIAL_STORE_N0 elements per row.
 * The expansion is a bare if / else statement (no enclosing braces or do-while).
 */
2938*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
2939*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                                \
2940*c217d954SCole Faust    {                                                                                                                    \
2941*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2942*c217d954SCole Faust    }                                                                                                                    \
2943*c217d954SCole Faust    else                                                                                                                 \
2944*c217d954SCole Faust    {                                                                                                                    \
2945*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2946*c217d954SCole Faust    }
2947*c217d954SCole Faust
/** LOAD_BLOCK_PARTIAL_IN_Y: always loads N0 elements per row. When
 * PARTIAL_COND_Y is false LOAD_BLOCK_PARTIAL is invoked with M0 rows; when it
 * is true it is invoked with PARTIAL_STORE_M0 rows.
 * The expansion is a bare if / else statement (no enclosing braces or do-while).
 */
2948*c217d954SCole Faust#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
2949*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                                \
2950*c217d954SCole Faust    {                                                                                                                    \
2951*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
2952*c217d954SCole Faust    }                                                                                                                    \
2953*c217d954SCole Faust    else                                                                                                                 \
2954*c217d954SCole Faust    {                                                                                                                    \
2955*c217d954SCole Faust        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
2956*c217d954SCole Faust    }
2957*c217d954SCole Faust
2958*c217d954SCole Faust
/** LOAD_BLOCK_BOUNDARY_AWARE: exactly one of the four definitions below is kept,
 * chosen by the preprocessor from the values the symbols PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 have at this point (an identifier that is not a defined
 * macro evaluates as 0 inside #if). The macro also takes parameters with those
 * names; they influence only the expansion, not which definition is selected.
 * All four definitions share one parameter list so call sites do not change.
 */
/* Both symbols are 0: plain LOAD_BLOCK; the four partial arguments are unused. */
2959*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
2960*c217d954SCole Faust
2961*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2962*c217d954SCole Faust    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
2963*c217d954SCole Faust
/* Only PARTIAL_STORE_M0 is positive: REPEAT_VAR_INIT_TO_CONST is invoked with
 * M0, the vector type, BASENAME and 0 (NOTE(review): presumably this declares
 * the M0 BASENAME variables set to 0 - confirm against its definition), then
 * the row count is chosen by LOAD_BLOCK_PARTIAL_IN_Y. */
2964*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
2965*c217d954SCole Faust
2966*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2967*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2968*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
2969*c217d954SCole Faust
/* Only PARTIAL_STORE_N0 is positive: same REPEAT_VAR_INIT_TO_CONST call, then
 * the per-row element count is chosen by LOAD_BLOCK_PARTIAL_IN_X. */
2970*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
2971*c217d954SCole Faust
2972*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2973*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2974*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
2975*c217d954SCole Faust
/* Every remaining combination: same REPEAT_VAR_INIT_TO_CONST call, then both
 * dimensions are handled by LOAD_BLOCK_PARTIAL_IN_X_AND_Y. */
2976*c217d954SCole Faust#else
2977*c217d954SCole Faust
2978*c217d954SCole Faust#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
2979*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
2980*c217d954SCole Faust    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
2981*c217d954SCole Faust
2982*c217d954SCole Faust#endif
2983*c217d954SCole Faust
2984*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_1: assigns BASENAME0 the result of
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 0 * X_STEP_ROW, Y_COORD + 0 * Y_STEP_ROW).
 * BASENAME0 is not declared here and must already exist. No ';' is written
 * after the assignment (NOTE(review): presumably READ_IMAGE2D's expansion
 * supplies it - confirm against its definition).
 */
2985*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2986*c217d954SCole Faust    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
2987*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_2: expands LOAD_TEXTURE2D_ROW_1, then assigns BASENAME1 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 1 * X_STEP_ROW, Y_COORD + 1 * Y_STEP_ROW).
 * BASENAME1 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
2988*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2989*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2990*c217d954SCole Faust    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
2991*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_3: expands LOAD_TEXTURE2D_ROW_2, then assigns BASENAME2 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 2 * X_STEP_ROW, Y_COORD + 2 * Y_STEP_ROW).
 * BASENAME2 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
2992*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2993*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2994*c217d954SCole Faust    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
2995*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_4: expands LOAD_TEXTURE2D_ROW_3, then assigns BASENAME3 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 3 * X_STEP_ROW, Y_COORD + 3 * Y_STEP_ROW).
 * BASENAME3 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
2996*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
2997*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
2998*c217d954SCole Faust    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
2999*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_5: expands LOAD_TEXTURE2D_ROW_4, then assigns BASENAME4 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 4 * X_STEP_ROW, Y_COORD + 4 * Y_STEP_ROW).
 * BASENAME4 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3000*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3001*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3002*c217d954SCole Faust    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
3003*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_6: expands LOAD_TEXTURE2D_ROW_5, then assigns BASENAME5 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 5 * X_STEP_ROW, Y_COORD + 5 * Y_STEP_ROW).
 * BASENAME5 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3004*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3005*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3006*c217d954SCole Faust    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
3007*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_7: expands LOAD_TEXTURE2D_ROW_6, then assigns BASENAME6 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 6 * X_STEP_ROW, Y_COORD + 6 * Y_STEP_ROW).
 * BASENAME6 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3008*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3009*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3010*c217d954SCole Faust    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
3011*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_8: expands LOAD_TEXTURE2D_ROW_7, then assigns BASENAME7 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 7 * X_STEP_ROW, Y_COORD + 7 * Y_STEP_ROW).
 * BASENAME7 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3012*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3013*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3014*c217d954SCole Faust    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
3015*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_9: expands LOAD_TEXTURE2D_ROW_8, then assigns BASENAME8 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 8 * X_STEP_ROW, Y_COORD + 8 * Y_STEP_ROW).
 * BASENAME8 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3016*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3017*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3018*c217d954SCole Faust    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
3019*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_10: expands LOAD_TEXTURE2D_ROW_9, then assigns BASENAME9 =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 9 * X_STEP_ROW, Y_COORD + 9 * Y_STEP_ROW).
 * BASENAME9 is not declared here. No ';' is written between or after the
 * assignments (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3020*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3021*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
3022*c217d954SCole Faust    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
3023*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_11: expands LOAD_TEXTURE2D_ROW_10, then assigns BASENAMEA =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 10 * X_STEP_ROW, Y_COORD + 10 * Y_STEP_ROW).
 * The row with multiplier 10 uses the pasted suffix A. BASENAMEA is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3024*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3025*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3026*c217d954SCole Faust    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
3027*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_12: expands LOAD_TEXTURE2D_ROW_11, then assigns BASENAMEB =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 11 * X_STEP_ROW, Y_COORD + 11 * Y_STEP_ROW).
 * The row with multiplier 11 uses the pasted suffix B. BASENAMEB is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3028*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3029*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3030*c217d954SCole Faust    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
3031*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_13: expands LOAD_TEXTURE2D_ROW_12, then assigns BASENAMEC =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 12 * X_STEP_ROW, Y_COORD + 12 * Y_STEP_ROW).
 * The row with multiplier 12 uses the pasted suffix C. BASENAMEC is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3032*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3033*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3034*c217d954SCole Faust    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
3035*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_14: expands LOAD_TEXTURE2D_ROW_13, then assigns BASENAMED =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 13 * X_STEP_ROW, Y_COORD + 13 * Y_STEP_ROW).
 * The row with multiplier 13 uses the pasted suffix D. BASENAMED is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3036*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3037*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3038*c217d954SCole Faust    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
3039*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_15: expands LOAD_TEXTURE2D_ROW_14, then assigns BASENAMEE =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 14 * X_STEP_ROW, Y_COORD + 14 * Y_STEP_ROW).
 * The row with multiplier 14 uses the pasted suffix E. BASENAMEE is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3040*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3041*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3042*c217d954SCole Faust    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
3043*c217d954SCole Faust
/** LOAD_TEXTURE2D_ROW_16: expands LOAD_TEXTURE2D_ROW_15, then assigns BASENAMEF =
 * READ_IMAGE2D(DATA_TYPE, N0, IMG, X_COORD + 15 * X_STEP_ROW, Y_COORD + 15 * Y_STEP_ROW).
 * The row with multiplier 15 uses the pasted suffix F. BASENAMEF is not
 * declared here. No ';' is written between or after the assignments
 * (NOTE(review): presumably supplied by READ_IMAGE2D - confirm).
 */
3044*c217d954SCole Faust#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
3045*c217d954SCole Faust    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
3046*c217d954SCole Faust    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
3047*c217d954SCole Faust
3048*c217d954SCole Faust
3049*c217d954SCole Faust
/** LOAD_TEXTURE2D / LOAD_TEXTURE2D_STR: select the image row loader for M0 rows.
 * LOAD_TEXTURE2D_STR pastes M0 onto LOAD_TEXTURE2D_ROW_ and forwards the
 * remaining arguments unchanged. LOAD_TEXTURE2D only forwards to it, so an M0
 * argument that is itself a macro is expanded before the ## paste.
 */
3050*c217d954SCole Faust#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3051*c217d954SCole Faust#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
3052*c217d954SCole Faust
3053*c217d954SCole Faust
3054*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_1: declares BASENAME0 with type VEC_DATA_TYPE(DATA_TYPE, N0).
 * When Y_MASK0 != 0 it is assigned VLOAD(N0)(0, p) with p = PTR + OFFSET +
 * Y0 * STRIDE_Y cast to a pointer to __global DATA_TYPE; otherwise it is
 * assigned 0, and the VLOAD call is skipped.
 * The row position comes from the pasted variable Y0 rather than a constant.
 * The expansion ends in an unbraced if / else statement.
 */
3055*c217d954SCole Faust#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3056*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3057*c217d954SCole Faust    BASENAME##0;                                                                            \
3058*c217d954SCole Faust    if(Y_MASK##0 != 0)                                                                      \
3059*c217d954SCole Faust        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
3060*c217d954SCole Faust    else                                                                                    \
3061*c217d954SCole Faust        BASENAME##0 = 0;
3062*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_2: expands LOAD_ROW_INDIRECT_1, then declares BASENAME1
 * with type VEC_DATA_TYPE(DATA_TYPE, N0). When Y_MASK1 != 0 it is assigned
 * VLOAD(N0)(0, p) with p = PTR + OFFSET + Y1 * STRIDE_Y cast to a pointer to
 * __global DATA_TYPE; otherwise it is assigned 0.
 * The expansion ends in an unbraced if / else statement.
 */
3063*c217d954SCole Faust#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3064*c217d954SCole Faust    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3065*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3066*c217d954SCole Faust    BASENAME##1;                                                                            \
3067*c217d954SCole Faust    if(Y_MASK##1 != 0)                                                                      \
3068*c217d954SCole Faust        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
3069*c217d954SCole Faust    else                                                                                    \
3070*c217d954SCole Faust        BASENAME##1 = 0;
3071*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_3: expands LOAD_ROW_INDIRECT_2, then declares BASENAME2
 * with type VEC_DATA_TYPE(DATA_TYPE, N0). When Y_MASK2 != 0 it is assigned
 * VLOAD(N0)(0, p) with p = PTR + OFFSET + Y2 * STRIDE_Y cast to a pointer to
 * __global DATA_TYPE; otherwise it is assigned 0.
 * The expansion ends in an unbraced if / else statement.
 */
3072*c217d954SCole Faust#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3073*c217d954SCole Faust    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3074*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3075*c217d954SCole Faust    BASENAME##2;                                                                            \
3076*c217d954SCole Faust    if(Y_MASK##2 != 0)                                                                      \
3077*c217d954SCole Faust        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
3078*c217d954SCole Faust    else                                                                                    \
3079*c217d954SCole Faust        BASENAME##2 = 0;
3080*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_4: expands LOAD_ROW_INDIRECT_3, then declares BASENAME3
 * with type VEC_DATA_TYPE(DATA_TYPE, N0). When Y_MASK3 != 0 it is assigned
 * VLOAD(N0)(0, p) with p = PTR + OFFSET + Y3 * STRIDE_Y cast to a pointer to
 * __global DATA_TYPE; otherwise it is assigned 0.
 * The expansion ends in an unbraced if / else statement.
 */
3081*c217d954SCole Faust#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3082*c217d954SCole Faust    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3083*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3084*c217d954SCole Faust    BASENAME##3;                                                                            \
3085*c217d954SCole Faust    if(Y_MASK##3 != 0)                                                                      \
3086*c217d954SCole Faust        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
3087*c217d954SCole Faust    else                                                                                    \
3088*c217d954SCole Faust        BASENAME##3 = 0;
3089*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_5: expands LOAD_ROW_INDIRECT_4, then declares BASENAME4
 * with type VEC_DATA_TYPE(DATA_TYPE, N0). When Y_MASK4 != 0 it is assigned
 * VLOAD(N0)(0, p) with p = PTR + OFFSET + Y4 * STRIDE_Y cast to a pointer to
 * __global DATA_TYPE; otherwise it is assigned 0.
 * The expansion ends in an unbraced if / else statement.
 */
3090*c217d954SCole Faust#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3091*c217d954SCole Faust    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3092*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3093*c217d954SCole Faust    BASENAME##4;                                                                            \
3094*c217d954SCole Faust    if(Y_MASK##4 != 0)                                                                      \
3095*c217d954SCole Faust        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
3096*c217d954SCole Faust    else                                                                                    \
3097*c217d954SCole Faust        BASENAME##4 = 0;
3098*c217d954SCole Faust
/** LOAD_ROW_INDIRECT_6: expands LOAD_ROW_INDIRECT_5, then declares BASENAME5
 * with type VEC_DATA_TYPE(DATA_TYPE, N0). When Y_MASK5 != 0 it is assigned
 * VLOAD(N0)(0, p) with p = PTR + OFFSET + Y5 * STRIDE_Y cast to a pointer to
 * __global DATA_TYPE; otherwise it is assigned 0.
 * The expansion ends in an unbraced if / else statement.
 */
3099*c217d954SCole Faust#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3100*c217d954SCole Faust    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3101*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3102*c217d954SCole Faust    BASENAME##5;                                                                            \
3103*c217d954SCole Faust    if(Y_MASK##5 != 0)                                                                      \
3104*c217d954SCole Faust        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
3105*c217d954SCole Faust    else                                                                                    \
3106*c217d954SCole Faust        BASENAME##5 = 0;
3107*c217d954SCole Faust
3108*c217d954SCole Faust#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3109*c217d954SCole Faust    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3110*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3111*c217d954SCole Faust    BASENAME##6;                                                                            \
3112*c217d954SCole Faust    if(Y_MASK##6 != 0)                                                                      \
3113*c217d954SCole Faust        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
3114*c217d954SCole Faust    else                                                                                    \
3115*c217d954SCole Faust        BASENAME##6 = 0;
3116*c217d954SCole Faust
3117*c217d954SCole Faust#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3118*c217d954SCole Faust    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3119*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3120*c217d954SCole Faust    BASENAME##7;                                                                            \
3121*c217d954SCole Faust    if(Y_MASK##7 != 0)                                                                      \
3122*c217d954SCole Faust        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
3123*c217d954SCole Faust    else                                                                                    \
3124*c217d954SCole Faust        BASENAME##7 = 0;
3125*c217d954SCole Faust
3126*c217d954SCole Faust#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
3127*c217d954SCole Faust    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3128*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3129*c217d954SCole Faust    BASENAME##8;                                                                            \
3130*c217d954SCole Faust    if(Y_MASK##8 != 0)                                                                      \
3131*c217d954SCole Faust        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
3132*c217d954SCole Faust    else                                                                                    \
3133*c217d954SCole Faust        BASENAME##8 = 0;
3134*c217d954SCole Faust
3135*c217d954SCole Faust#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3136*c217d954SCole Faust    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
3137*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3138*c217d954SCole Faust    BASENAME##9;                                                                            \
3139*c217d954SCole Faust    if(Y_MASK##9 != 0)                                                                      \
3140*c217d954SCole Faust        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
3141*c217d954SCole Faust    else                                                                                    \
3142*c217d954SCole Faust        BASENAME##9 = 0;
3143*c217d954SCole Faust
3144*c217d954SCole Faust#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3145*c217d954SCole Faust    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3146*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3147*c217d954SCole Faust    BASENAME##A;                                                                            \
3148*c217d954SCole Faust    if(Y_MASK##A != 0)                                                                      \
3149*c217d954SCole Faust        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
3150*c217d954SCole Faust    else                                                                                    \
3151*c217d954SCole Faust        BASENAME##A = 0;
3152*c217d954SCole Faust
3153*c217d954SCole Faust#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3154*c217d954SCole Faust    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3155*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3156*c217d954SCole Faust    BASENAME##B;                                                                            \
3157*c217d954SCole Faust    if(Y_MASK##B != 0)                                                                      \
3158*c217d954SCole Faust        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
3159*c217d954SCole Faust    else                                                                                    \
3160*c217d954SCole Faust        BASENAME##B = 0;
3161*c217d954SCole Faust
3162*c217d954SCole Faust#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3163*c217d954SCole Faust    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3164*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3165*c217d954SCole Faust    BASENAME##C;                                                                            \
3166*c217d954SCole Faust    if(Y_MASK##C != 0)                                                                      \
3167*c217d954SCole Faust        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
3168*c217d954SCole Faust    else                                                                                    \
3169*c217d954SCole Faust        BASENAME##C = 0;
3170*c217d954SCole Faust
3171*c217d954SCole Faust#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3172*c217d954SCole Faust    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3173*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3174*c217d954SCole Faust    BASENAME##D;                                                                            \
3175*c217d954SCole Faust    if(Y_MASK##D != 0)                                                                      \
3176*c217d954SCole Faust        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
3177*c217d954SCole Faust    else                                                                                    \
3178*c217d954SCole Faust        BASENAME##D = 0;
3179*c217d954SCole Faust
3180*c217d954SCole Faust#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3181*c217d954SCole Faust    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3182*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3183*c217d954SCole Faust    BASENAME##E;                                                                            \
3184*c217d954SCole Faust    if(Y_MASK##E != 0)                                                                      \
3185*c217d954SCole Faust        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
3186*c217d954SCole Faust    else                                                                                    \
3187*c217d954SCole Faust        BASENAME##E = 0;
3188*c217d954SCole Faust
3189*c217d954SCole Faust#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)     \
3190*c217d954SCole Faust    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
3191*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
3192*c217d954SCole Faust    BASENAME##F;                                                                            \
3193*c217d954SCole Faust    if(Y_MASK##F != 0)                                                                      \
3194*c217d954SCole Faust        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
3195*c217d954SCole Faust    else                                                                                    \
3196*c217d954SCole Faust        BASENAME##F = 0;
3197*c217d954SCole Faust
3198*c217d954SCole Faust
/** Entry point for the masked, indirect block load.
 *
 * LOAD_BLOCK_INDIRECT_STR pastes M0 onto LOAD_ROW_INDIRECT_ to select the
 * variant that handles M0 rows. LOAD_BLOCK_INDIRECT only forwards its
 * arguments; because M0 is not an operand of ## at this level, it is
 * macro-expanded before the paste happens in the _STR helper.
 *
 * @param M0 Number of rows; must match an existing LOAD_ROW_INDIRECT_<M0> macro
 * All remaining parameters are passed unchanged to LOAD_ROW_INDIRECT_<M0>.
 */
3199*c217d954SCole Faust#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3200*c217d954SCole Faust#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
3201*c217d954SCole Faust
3202*c217d954SCole Faust
/** LOAD_ELEMENT_1 ... LOAD_ELEMENT_16: read one scalar per row into a vector variable.
 *
 * Each LOAD_ELEMENT_n expands LOAD_ELEMENT_(n-1) (for n > 1) and then declares
 * BASENAME##k with type VEC_DATA_TYPE(DATA_TYPE, N0), initialised from a single
 * DATA_TYPE element dereferenced at PTR + OFFSET + k * STRIDE_Y (the address is
 * cast to __global DATA_TYPE *).
 *
 * The variable suffix is a hexadecimal digit while the row multiplier is written
 * in decimal, e.g. BASENAME##A is read from PTR + OFFSET + 10 * STRIDE_Y.
 *
 * @param N0        Vector width of the declared variables
 * @param DATA_TYPE Element type read from memory
 * @param BASENAME  Prefix of the declared variables
 * @param PTR       Base address used in the address arithmetic
 * @param OFFSET    Offset added to PTR
 * @param STRIDE_Y  Distance between consecutive rows, in the units of PTR arithmetic
 *
 * NOTE(review): the initialiser is a scalar assigned to a vector type; this
 * presumably relies on OpenCL C widening the scalar to every lane - confirm
 * against the VEC_DATA_TYPE definition, which is outside this part of the file.
 */
3203*c217d954SCole Faust#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3204*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3205*c217d954SCole Faust    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
3206*c217d954SCole Faust
3207*c217d954SCole Faust#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3208*c217d954SCole Faust    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3209*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3210*c217d954SCole Faust    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
3211*c217d954SCole Faust
3212*c217d954SCole Faust#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3213*c217d954SCole Faust    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3214*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3215*c217d954SCole Faust    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
3216*c217d954SCole Faust
3217*c217d954SCole Faust#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3218*c217d954SCole Faust    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3219*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3220*c217d954SCole Faust    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
3221*c217d954SCole Faust
3222*c217d954SCole Faust#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3223*c217d954SCole Faust    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3224*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3225*c217d954SCole Faust    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
3226*c217d954SCole Faust
3227*c217d954SCole Faust#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3228*c217d954SCole Faust    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3229*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3230*c217d954SCole Faust    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
3231*c217d954SCole Faust
3232*c217d954SCole Faust#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3233*c217d954SCole Faust    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3234*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3235*c217d954SCole Faust    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
3236*c217d954SCole Faust
3237*c217d954SCole Faust#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3238*c217d954SCole Faust    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3239*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3240*c217d954SCole Faust    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
3241*c217d954SCole Faust
3242*c217d954SCole Faust#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3243*c217d954SCole Faust    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3244*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
3245*c217d954SCole Faust    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
3246*c217d954SCole Faust
3247*c217d954SCole Faust#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3248*c217d954SCole Faust    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)      \
3249*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3250*c217d954SCole Faust    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
3251*c217d954SCole Faust
3252*c217d954SCole Faust#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3253*c217d954SCole Faust    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3254*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3255*c217d954SCole Faust    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
3256*c217d954SCole Faust
3257*c217d954SCole Faust#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3258*c217d954SCole Faust    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3259*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3260*c217d954SCole Faust    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
3261*c217d954SCole Faust
3262*c217d954SCole Faust#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3263*c217d954SCole Faust    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3264*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3265*c217d954SCole Faust    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
3266*c217d954SCole Faust
3267*c217d954SCole Faust#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3268*c217d954SCole Faust    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3269*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3270*c217d954SCole Faust    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
3271*c217d954SCole Faust
3272*c217d954SCole Faust#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3273*c217d954SCole Faust    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3274*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3275*c217d954SCole Faust    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
3276*c217d954SCole Faust
3277*c217d954SCole Faust#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
3278*c217d954SCole Faust    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)     \
3279*c217d954SCole Faust    VEC_DATA_TYPE(DATA_TYPE, N0)                                        \
3280*c217d954SCole Faust    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
3281*c217d954SCole Faust
3282*c217d954SCole Faust
3283*c217d954SCole Faust
3284*c217d954SCole Faust
/** Entry point for the per-row scalar load.
 *
 * LOAD_SCALAR_AS_VECTOR_STR pastes M0 onto LOAD_ELEMENT_ to select the variant
 * that handles M0 rows; LOAD_SCALAR_AS_VECTOR forwards to it so that M0 is
 * macro-expanded before being pasted.
 *
 * @param M0 Number of rows; must match an existing LOAD_ELEMENT_<M0> macro
 * All remaining parameters are passed unchanged to LOAD_ELEMENT_<M0>.
 */
3285*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3286*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
3287*c217d954SCole Faust
3288*c217d954SCole Faust
3289*c217d954SCole Faust
/** CALCULATE_Z_OFFSET_1 ... CALCULATE_Z_OFFSET_8: per-row Z offset computation.
 *
 * Each CALCULATE_Z_OFFSET_n expands CALCULATE_Z_OFFSET_(n-1) (for n > 1) and
 * then, for row k = n - 1, performs three assignments on Z##k:
 *   1. Z##k = (k + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D
 *   2. Z##k = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##k)   - caps the quotient
 *   3. Z##k *= (CROSS_PLANE_PAD * STRIDE_Y)              - scales it to an offset
 *
 * The macros only assign: Z##0 ... Z##(n-1) must already be declared by the
 * caller. M0 is forwarded through the chain but is not used by any body here.
 * Only the variants for 1 to 8 rows are defined in this part of the file.
 *
 * @param M0              Number of rows (forwarded only)
 * @param DATA_TYPE       Type used for the casts in the computation
 * @param Z               Prefix of the output variables (Z##0, Z##1, ...)
 * @param Y               Index of the first row; the row number k is added to it
 * @param HEIGHT_GEMM3D   Divisor applied to the row index
 * @param DEPTH_GEMM3D    Upper bound (minus one) applied to the quotient
 * @param CROSS_PLANE_PAD Factor multiplied with STRIDE_Y to scale the result
 * @param STRIDE_Y        Factor multiplied with CROSS_PLANE_PAD to scale the result
 */
3290*c217d954SCole Faust#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3291*c217d954SCole Faust    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3292*c217d954SCole Faust    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0);                                                      \
3293*c217d954SCole Faust    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
3294*c217d954SCole Faust
3295*c217d954SCole Faust#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3296*c217d954SCole Faust    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3297*c217d954SCole Faust    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3298*c217d954SCole Faust    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1);                                                      \
3299*c217d954SCole Faust    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
3300*c217d954SCole Faust
3301*c217d954SCole Faust#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3302*c217d954SCole Faust    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3303*c217d954SCole Faust    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3304*c217d954SCole Faust    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2);                                                      \
3305*c217d954SCole Faust    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
3306*c217d954SCole Faust
3307*c217d954SCole Faust#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3308*c217d954SCole Faust    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3309*c217d954SCole Faust    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3310*c217d954SCole Faust    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3);                                                      \
3311*c217d954SCole Faust    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
3312*c217d954SCole Faust
3313*c217d954SCole Faust#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3314*c217d954SCole Faust    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3315*c217d954SCole Faust    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3316*c217d954SCole Faust    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4);                                                      \
3317*c217d954SCole Faust    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
3318*c217d954SCole Faust
3319*c217d954SCole Faust#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3320*c217d954SCole Faust    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3321*c217d954SCole Faust    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3322*c217d954SCole Faust    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5);                                                      \
3323*c217d954SCole Faust    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
3324*c217d954SCole Faust
3325*c217d954SCole Faust#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3326*c217d954SCole Faust    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3327*c217d954SCole Faust    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3328*c217d954SCole Faust    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6);                                                      \
3329*c217d954SCole Faust    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
3330*c217d954SCole Faust
3331*c217d954SCole Faust#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
3332*c217d954SCole Faust    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
3333*c217d954SCole Faust    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D;                                               \
3334*c217d954SCole Faust    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                      \
3335*c217d954SCole Faust    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
3336*c217d954SCole Faust
3337*c217d954SCole Faust
3338*c217d954SCole Faust
3339*c217d954SCole Faust
/** Entry point for the Z offset computation.
 *
 * CALCULATE_Z_OFFSET_STR pastes M0 onto CALCULATE_Z_OFFSET_ to select the
 * variant for M0 rows (and also passes M0 through as the first argument);
 * CALCULATE_Z_OFFSET forwards to it so that M0 is macro-expanded before pasting.
 *
 * @param M0 Number of rows; must match an existing CALCULATE_Z_OFFSET_<M0> macro
 * All remaining parameters are passed unchanged to CALCULATE_Z_OFFSET_<M0>.
 */
3340*c217d954SCole Faust#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3341*c217d954SCole Faust#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
3342*c217d954SCole Faust
3343*c217d954SCole Faust
3344*c217d954SCole Faust
/** SCALE_ROW_1 ... SCALE_ROW_16: multiply each row variable by a scale factor.
 *
 * Each SCALE_ROW_n expands SCALE_ROW_(n-1) (for n > 1) and then performs
 * BASENAME##k *= (DATA_TYPE)SCALE for row k = n - 1, where the row suffix is a
 * single hexadecimal digit (0-9, A-F). The row variables must already exist.
 *
 * @param DATA_TYPE Type the scale factor is cast to
 * @param BASENAME  Prefix of the row variables to scale
 * @param SCALE     Scale factor
 *
 * NOTE(review): SCALE is not parenthesised in the expansion, so when an
 * expression such as `a / b` is passed, the cast applies to `a` only. Pass a
 * single token or a parenthesised expression.
 */
3345*c217d954SCole Faust#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
3346*c217d954SCole Faust    BASENAME##0 *= (DATA_TYPE)SCALE;
3347*c217d954SCole Faust
3348*c217d954SCole Faust#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
3349*c217d954SCole Faust    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
3350*c217d954SCole Faust    BASENAME##1 *= (DATA_TYPE)SCALE;
3351*c217d954SCole Faust
3352*c217d954SCole Faust#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
3353*c217d954SCole Faust    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE)     \
3354*c217d954SCole Faust    BASENAME##2 *= (DATA_TYPE)SCALE;
3355*c217d954SCole Faust
3356*c217d954SCole Faust#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
3357*c217d954SCole Faust    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE)     \
3358*c217d954SCole Faust    BASENAME##3 *= (DATA_TYPE)SCALE;
3359*c217d954SCole Faust
3360*c217d954SCole Faust#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
3361*c217d954SCole Faust    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE)     \
3362*c217d954SCole Faust    BASENAME##4 *= (DATA_TYPE)SCALE;
3363*c217d954SCole Faust
3364*c217d954SCole Faust#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
3365*c217d954SCole Faust    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE)     \
3366*c217d954SCole Faust    BASENAME##5 *= (DATA_TYPE)SCALE;
3367*c217d954SCole Faust
3368*c217d954SCole Faust#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
3369*c217d954SCole Faust    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE)     \
3370*c217d954SCole Faust    BASENAME##6 *= (DATA_TYPE)SCALE;
3371*c217d954SCole Faust
3372*c217d954SCole Faust#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
3373*c217d954SCole Faust    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE)     \
3374*c217d954SCole Faust    BASENAME##7 *= (DATA_TYPE)SCALE;
3375*c217d954SCole Faust
3376*c217d954SCole Faust#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
3377*c217d954SCole Faust    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE)     \
3378*c217d954SCole Faust    BASENAME##8 *= (DATA_TYPE)SCALE;
3379*c217d954SCole Faust
3380*c217d954SCole Faust#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
3381*c217d954SCole Faust    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE)      \
3382*c217d954SCole Faust    BASENAME##9 *= (DATA_TYPE)SCALE;
3383*c217d954SCole Faust
3384*c217d954SCole Faust#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
3385*c217d954SCole Faust    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE)     \
3386*c217d954SCole Faust    BASENAME##A *= (DATA_TYPE)SCALE;
3387*c217d954SCole Faust
3388*c217d954SCole Faust#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
3389*c217d954SCole Faust    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE)     \
3390*c217d954SCole Faust    BASENAME##B *= (DATA_TYPE)SCALE;
3391*c217d954SCole Faust
3392*c217d954SCole Faust#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
3393*c217d954SCole Faust    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE)     \
3394*c217d954SCole Faust    BASENAME##C *= (DATA_TYPE)SCALE;
3395*c217d954SCole Faust
3396*c217d954SCole Faust#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
3397*c217d954SCole Faust    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE)     \
3398*c217d954SCole Faust    BASENAME##D *= (DATA_TYPE)SCALE;
3399*c217d954SCole Faust
3400*c217d954SCole Faust#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
3401*c217d954SCole Faust    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE)     \
3402*c217d954SCole Faust    BASENAME##E *= (DATA_TYPE)SCALE;
3403*c217d954SCole Faust
3404*c217d954SCole Faust#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
3405*c217d954SCole Faust    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE)     \
3406*c217d954SCole Faust    BASENAME##F *= (DATA_TYPE)SCALE;
3407*c217d954SCole Faust
3408*c217d954SCole Faust
3409*c217d954SCole Faust
/** Entry point for scaling a block of rows.
 *
 * SCALE_BLOCK_STR pastes N onto SCALE_ROW_ to select the variant for N rows;
 * SCALE_BLOCK forwards to it so that N is macro-expanded before pasting.
 *
 * @param N         Number of rows; must match an existing SCALE_ROW_<N> macro
 * @param DATA_TYPE Type the scale factor is cast to
 * @param BASENAME  Prefix of the row variables to scale
 * @param SCALE     Scale factor
 */
3410*c217d954SCole Faust#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
3411*c217d954SCole Faust#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
3412*c217d954SCole Faust
3413*c217d954SCole Faust
3414*c217d954SCole Faust
/** COLUMN_VECTOR1/2/3/4/8/16: gather one component from several vectors.
 *
 * COLUMN_VECTORn declares a variable named BASENAME##IDX_COL and initialises it
 * with component .s##IDX_COL taken from X##0, X##1, ... X##(n-1), in that order
 * (hexadecimal suffixes A-F are used for the tenth to sixteenth source).
 * For n == 1 the variable has type TYPE; otherwise it has type
 * VEC_DATA_TYPE(TYPE, n). Variants exist for n = 1, 2, 3, 4, 8 and 16 only.
 *
 * @param IDX_COL  Component selector, used both as .s<IDX_COL> and as the
 *                 suffix of the declared variable
 * @param BASENAME Prefix of the declared variable
 * @param X        Prefix of the source vector variables
 * @param TYPE     Element type of the declared variable
 */
3415*c217d954SCole Faust#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
3416*c217d954SCole Faust    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
3417*c217d954SCole Faust#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
3418*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 2)                         \
3419*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
3420*c217d954SCole Faust#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
3421*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 3)                         \
3422*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
3423*c217d954SCole Faust#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
3424*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 4)                         \
3425*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
3426*c217d954SCole Faust#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
3427*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 8)                         \
3428*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
3429*c217d954SCole Faust#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
3430*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 16)                         \
3431*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
3432*c217d954SCole Faust
3433*c217d954SCole Faust
3434*c217d954SCole Faust
/** COLUMN_VECTOR_SCALAR1/2/3/4/8/16: pack several values into one variable.
 *
 * Same shape as COLUMN_VECTORn, but the sources X##0 ... X##(n-1) are used
 * directly, without a .s<IDX_COL> component selection. IDX_COL is therefore only
 * used as the suffix of the declared variable BASENAME##IDX_COL.
 * For n == 1 the variable has type TYPE; otherwise VEC_DATA_TYPE(TYPE, n).
 * Variants exist for n = 1, 2, 3, 4, 8 and 16 only.
 *
 * @param IDX_COL  Suffix of the declared variable
 * @param BASENAME Prefix of the declared variable
 * @param X        Prefix of the source variables
 * @param TYPE     Element type of the declared variable
 */
3435*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
3436*c217d954SCole Faust    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
3437*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
3438*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 2)                                \
3439*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
3440*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
3441*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 3)                                \
3442*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
3443*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
3444*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 4)                                \
3445*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
3446*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
3447*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 8)                                \
3448*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
3449*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
3450*c217d954SCole Faust    VEC_DATA_TYPE(TYPE, 16)                                \
3451*c217d954SCole Faust    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
3452*c217d954SCole Faust
3453*c217d954SCole Faust
3454*c217d954SCole Faust
/** TRANSPOSE_K0X1/2/3/4/8/16: build one column variable per source component.
 *
 * TRANSPOSE_K0Xn emits COLUMN_VECTOR(K0, c, BASENAME, BS, TYPE) for every column
 * c = 0 ... n-1 (hexadecimal digits A-F for columns ten to fifteen), reusing
 * the next smaller variant for the leading columns. The single-column variant
 * TRANSPOSE_K0X1 uses COLUMN_VECTOR_SCALAR instead, i.e. it reads the sources
 * without a component selector.
 *
 * COLUMN_VECTOR and COLUMN_VECTOR_SCALAR are defined further down in the file;
 * that is fine because macro bodies are only expanded where they are used.
 * Their expansions already end in `;`, so the extra `;` written here produces
 * empty statements.
 *
 * @param K0       Number of source variables gathered into each column
 * @param BASENAME Prefix of the declared column variables
 * @param BS       Prefix of the source variables
 * @param TYPE     Element type of the declared column variables
 */
3455*c217d954SCole Faust#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
3456*c217d954SCole Faust    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
3457*c217d954SCole Faust#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
3458*c217d954SCole Faust    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
3459*c217d954SCole Faust    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
3460*c217d954SCole Faust#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
3461*c217d954SCole Faust    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE);    \
3462*c217d954SCole Faust    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
3463*c217d954SCole Faust#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
3464*c217d954SCole Faust    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE);    \
3465*c217d954SCole Faust    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
3466*c217d954SCole Faust#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
3467*c217d954SCole Faust    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE);    \
3468*c217d954SCole Faust    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE);  \
3469*c217d954SCole Faust    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE);  \
3470*c217d954SCole Faust    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE);  \
3471*c217d954SCole Faust    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
3472*c217d954SCole Faust#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
3473*c217d954SCole Faust    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
3474*c217d954SCole Faust    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
3475*c217d954SCole Faust    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
3476*c217d954SCole Faust    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
3477*c217d954SCole Faust    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
3478*c217d954SCole Faust    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
3479*c217d954SCole Faust    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
3480*c217d954SCole Faust    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
3481*c217d954SCole Faust    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);
3482*c217d954SCole Faust
3483*c217d954SCole Faust
3484*c217d954SCole Faust
3485*c217d954SCole Faust
/** Dispatch to COLUMN_VECTOR<K0>.
 *
 * CONCAT (defined outside this part of the file) joins COLUMN_VECTOR and K0 to
 * form the macro name, which is then invoked with the remaining arguments.
 * The expansion ends with an additional `;`.
 *
 * @param K0       Number of source variables; must be 1, 2, 3, 4, 8 or 16
 * @param IDX_COL  Component selector and suffix of the declared variable
 * @param BASENAME Prefix of the declared variable
 * @param BS       Prefix of the source variables
 * @param TYPE     Element type of the declared variable
 */
3486*c217d954SCole Faust#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
3487*c217d954SCole Faust    CONCAT(COLUMN_VECTOR, K0)                          \
3488*c217d954SCole Faust    (IDX_COL, BASENAME, BS, TYPE);
3489*c217d954SCole Faust
3490*c217d954SCole Faust
3491*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
3492*c217d954SCole Faust    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
3493*c217d954SCole Faust    (IDX_COL, BASENAME, BS, TYPE);
3494*c217d954SCole Faust
3495*c217d954SCole Faust
3496*c217d954SCole Faust#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
3497*c217d954SCole Faust    CONCAT(TRANSPOSE_K0X, N0)                       \
3498*c217d954SCole Faust    (K0, BASENAME, BS, TYPE);
3499*c217d954SCole Faust
3500*c217d954SCole Faust
/*
 * ADD_ROW_<n>(BASENAME, BIAS), n = 1 .. 16.
 *
 * Expands to n compound assignments, one per row:
 *
 *     BASENAME##i += BIAS##i;      for i = 0 .. n-1
 *
 * Row suffixes are single hexadecimal digits (0..9, A..F), so row 10 is
 * BASENAME##A and row 15 is BASENAME##F. Each macro reuses the previous one
 * and appends its own row. Both operand families must already be declared
 * with those suffixed names at the point of use.
 *
 * The expansion ends in ';' and consists of several statements, so it must
 * not be used as the unbraced body of an if/else/for.
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;

/*
 * ADD_BLOCK(N, BASENAME, BIAS): public entry point, selects ADD_ROW_<N>.
 *
 * The extra ADD_BLOCK_STR level exists so that N is macro-expanded before it
 * is pasted: ADD_BLOCK(M0, ...) with M0 defined as 4 resolves to ADD_ROW_4,
 * not ADD_ROW_M0. N must expand to a decimal literal in 1 .. 16.
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
3569*c217d954SCole Faust
3570*c217d954SCole Faust
3571*c217d954SCole Faust
/*
 * ADD_ROW_BROADCAST_<n>(BASENAME, BIAS), n = 1 .. 16.
 *
 * Expands to n compound assignments that all add the same operand:
 *
 *     BASENAME##i += BIAS;         for i = 0 .. n-1
 *
 * Unlike ADD_ROW_<n>, BIAS is used as written (no row suffix is pasted onto
 * it), so the BIAS text is repeated once per row and evaluated n times.
 * Row suffixes are single hexadecimal digits (0..9, A..F).
 *
 * The expansion ends in ';' and consists of several statements, so it must
 * not be used as the unbraced body of an if/else/for.
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS;

/*
 * ADD_BLOCK_BROADCAST(N, BASENAME, BIAS): public entry point, selects
 * ADD_ROW_BROADCAST_<N>. The _STR indirection lets N be macro-expanded
 * before it is pasted onto the macro name. N must expand to a decimal
 * literal in 1 .. 16.
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
3638*c217d954SCole Faust
3639*c217d954SCole Faust
3640*c217d954SCole Faust
/*
 * ACTIVATION_ROW_<n>(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL),
 * n = 1 .. 16.
 *
 * Expands to n assignments that overwrite each row with the result of the
 * ACTIVATION macro applied to that same row:
 *
 *     BASENAME##i = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE,
 *                              BASENAME##i, A_VAL, B_VAL);   for i = 0 .. n-1
 *
 * ACTIVATION is defined elsewhere; the first three arguments and A_VAL/B_VAL
 * are forwarded to it unchanged. Row suffixes are single hexadecimal digits
 * (0..9, A..F). Each macro reuses the previous one and appends its own row.
 *
 * The expansion ends in ';' and consists of several statements.
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)      \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)     \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);

/*
 * ACTIVATION_BLOCK(N, ...): public entry point, selects ACTIVATION_ROW_<N>.
 * The _STR indirection lets N be macro-expanded before it is pasted onto the
 * macro name. N must expand to a decimal literal in 1 .. 16.
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
3708*c217d954SCole Faust
3709*c217d954SCole Faust
3710*c217d954SCole Faust
/*
 * CONVERT_ROW_<m>(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST), m = 1 .. 16.
 *
 * Expands to m variable DECLARATIONS, one per row:
 *
 *     VEC_DATA_TYPE(DATA_TYPE, N) BASENAME_DST##i =
 *         CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N));   i = 0 .. m-1
 *
 * Because the destination rows are declared here, BASENAME_DST##i must not
 * already exist in the enclosing scope, and the macro cannot be wrapped in
 * its own block (the new variables would go out of scope immediately).
 *
 * VEC_DATA_TYPE and CONVERT are defined elsewhere in this file. Row suffixes
 * are single hexadecimal digits (0..9, A..F). Each macro reuses the previous
 * one and appends its own row. The expansion ends in ';'.
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                 \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)      \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)     \
    VEC_DATA_TYPE(DATA_TYPE, N)                                  \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));

/*
 * CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST): public entry
 * point, selects CONVERT_ROW_<M>. M is the row count (pasted onto the macro
 * name); N is forwarded as the second argument of VEC_DATA_TYPE. The _STR
 * indirection lets M be macro-expanded before pasting. M must expand to a
 * decimal literal in 1 .. 16.
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
3794*c217d954SCole Faust
3795*c217d954SCole Faust
3796*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
3797*c217d954SCole Faust#define ARM_COMPUTE_HELPERS_ASYMM_H
3798*c217d954SCole Faust
3799*c217d954SCole Faust
3800*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
3801*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
3802*c217d954SCole Faust
3803*c217d954SCole Faust
3804*c217d954SCole Faust
3805*c217d954SCole Faust
/*
 * STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1 .. 16.
 *
 * Expands to n store statements, one per row i = 0 .. n-1:
 *
 *     VSTORE(N0)(BASENAME##i, 0,
 *                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 *
 * i.e. row variable BASENAME##i is written at PTR advanced by i row strides
 * plus the per-row extra offset Z##i. The sum is formed before the cast to
 * DATA_TYPE *, so STRIDE_Y and Z##i are in units of whatever PTR points to
 * (presumably bytes -- confirm against the callers).
 *
 * Variable suffixes are single hexadecimal digits (0..9, A..F) while the
 * stride multiplier is written in decimal, so row A is stored at
 * PTR + 10 * STRIDE_Y, and row F at PTR + 15 * STRIDE_Y.
 *
 * VSTORE is defined elsewhere in this file. PTR is pasted unparenthesised,
 * so pass a simple expression. The expansion ends in ';'.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                 \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                 \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                  \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                  \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3884*c217d954SCole Faust
3885*c217d954SCole Faust
3886*c217d954SCole Faust
/*
 * CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1 .. 9.
 *
 * Same addressing as STORE_ROW_<n>, but each row is first passed through
 * CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) before being stored:
 *
 *     VSTORE(N0)(CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)), 0,
 *                (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 *
 * for i = 0 .. n-1. The row variables themselves are not modified; only the
 * converted value is handed to VSTORE.
 *
 * VSTORE, CONVERT_SAT and VEC_DATA_TYPE are defined elsewhere in this file.
 * PTR is pasted unparenthesised, so pass a simple expression. The expansion
 * ends in ';'.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                         \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
3930*c217d954SCole Faust
/*
 * CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Stores rows 0..8 via CONVERT_STORE_ROW_9, then converts row BASENAME##9 with
 * CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) and stores it at
 * PTR + 9 * STRIDE_Y + Z##9.
 *
 * The second parameter was previously spelled DATA while the body referred to
 * DATA_TYPE. The caller's type argument was therefore dropped, and every
 * DATA_TYPE in the expansion (including the one forwarded to rows 0..8)
 * resolved to whatever DATA_TYPE meant at the call site. The parameter is now
 * named DATA_TYPE like in every sibling macro, so the caller's type is the
 * one actually used.
 *
 * NOTE(review): an earlier embedded copy of this helper header may carry the
 * same misspelling -- check and fix it there as well.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
3935*c217d954SCole Faust
/*
 * CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 11 .. 14.
 *
 * Continue the CONVERT_STORE_ROW chain: each macro first expands the previous
 * one and then stores its own row, converted with CONVERT_SAT to
 * VEC_DATA_TYPE(DATA_TYPE, N0):
 *
 *     row A -> PTR + 10 * STRIDE_Y + Z##A
 *     row B -> PTR + 11 * STRIDE_Y + Z##B
 *     row C -> PTR + 12 * STRIDE_Y + Z##C
 *     row D -> PTR + 13 * STRIDE_Y + Z##D
 *
 * Variable suffixes are hexadecimal digits; stride multipliers are decimal.
 * VSTORE, CONVERT_SAT, VEC_DATA_TYPE and CONVERT_STORE_ROW_10 are defined
 * elsewhere in this file. The expansion ends in ';'.
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
    VSTORE(N0)                                                          \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
3955*c217d954SCole Faust
3956*c217d954SCole Faust#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3957*c217d954SCole Faust    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3958*c217d954SCole Faust    VSTORE(N0)                                                          \
3959*c217d954SCole Faust    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
3960*c217d954SCole Faust
3961*c217d954SCole Faust#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3962*c217d954SCole Faust    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3963*c217d954SCole Faust    VSTORE(N0)                                                          \
3964*c217d954SCole Faust    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
3965*c217d954SCole Faust
3966*c217d954SCole Faust
3967*c217d954SCole Faust
3968*c217d954SCole Faust
/* STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_<M0>, i.e. emits the store statements for M0 rows.
 * The extra _STR level exists so that M0 is macro-expanded before it is pasted
 * onto STORE_ROW_; a macro argument used directly next to ## is not expanded.
 */
3969*c217d954SCole Faust#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3970*c217d954SCole Faust#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3971*c217d954SCole Faust
3972*c217d954SCole Faust
3973*c217d954SCole Faust
/* CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same two-level dispatch as STORE_BLOCK, but selects CONVERT_STORE_ROW_<M0>,
 * whose rows pass each value through CONVERT_SAT before storing it.
 */
3974*c217d954SCole Faust#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3975*c217d954SCole Faust#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
3976*c217d954SCole Faust
3977*c217d954SCole Faust
3978*c217d954SCole Faust
/* STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_16
 *
 * STORE_ROW_PARTIAL_<m>(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 * expands STORE_ROW_PARTIAL_<m-1> and then one VSTORE_PARTIAL(N0, STORE_N0)
 * call for row m-1, so it emits m store statements, rows 0..m-1 in increasing
 * order.
 *
 * Row r stores the variable BASENAME##<s> (no conversion) at offset 0 to
 *   (__global DATA_TYPE *)(PTR + r * STRIDE_Y + Z##<s>)
 * where the pasted suffix <s> is 0..9 for rows 0..9 and A..F for rows 10..15.
 *
 * PTR and STRIDE_Y are substituted without additional parentheses, so
 * expression arguments are subject to the precedence of the surrounding
 * `+` and `*`.
 *
 * VSTORE_PARTIAL is not defined in this part of the file.
 */
3979*c217d954SCole Faust#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3980*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3981*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
3982*c217d954SCole Faust
3983*c217d954SCole Faust#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3984*c217d954SCole Faust    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3985*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3986*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
3987*c217d954SCole Faust
3988*c217d954SCole Faust#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3989*c217d954SCole Faust    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3990*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3991*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
3992*c217d954SCole Faust
3993*c217d954SCole Faust#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3994*c217d954SCole Faust    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
3995*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
3996*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
3997*c217d954SCole Faust
3998*c217d954SCole Faust#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
3999*c217d954SCole Faust    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4000*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4001*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
4002*c217d954SCole Faust
4003*c217d954SCole Faust#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4004*c217d954SCole Faust    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4005*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4006*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
4007*c217d954SCole Faust
4008*c217d954SCole Faust#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4009*c217d954SCole Faust    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4010*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4011*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
4012*c217d954SCole Faust
4013*c217d954SCole Faust#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4014*c217d954SCole Faust    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4015*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4016*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
4017*c217d954SCole Faust
4018*c217d954SCole Faust#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4019*c217d954SCole Faust    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4020*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
4021*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
4022*c217d954SCole Faust
4023*c217d954SCole Faust#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4024*c217d954SCole Faust    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
4025*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4026*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
4027*c217d954SCole Faust
4028*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4029*c217d954SCole Faust    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4030*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4031*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
4032*c217d954SCole Faust
4033*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4034*c217d954SCole Faust    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4035*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4036*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
4037*c217d954SCole Faust
4038*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4039*c217d954SCole Faust    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4040*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4041*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
4042*c217d954SCole Faust
4043*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4044*c217d954SCole Faust    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4045*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4046*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
4047*c217d954SCole Faust
4048*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4049*c217d954SCole Faust    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4050*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4051*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
4052*c217d954SCole Faust
4053*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
4054*c217d954SCole Faust    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
4055*c217d954SCole Faust    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
4056*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
4057*c217d954SCole Faust
4058*c217d954SCole Faust
4059*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Dispatches to STORE_ROW_PARTIAL_<STORE_M0>, i.e. emits STORE_M0 row stores,
 * each using VSTORE_PARTIAL(N0, STORE_N0). Note the argument order changes on
 * the way down: this macro takes (STORE_M0, STORE_N0, N0, ...) while the row
 * macros take (N0, STORE_N0, ...).
 * The _STR level lets STORE_M0 be macro-expanded before it is pasted.
 */
4060*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4061*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4062*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * Chooses, at run time, how many rows and how many elements per row to store:
 *
 *   PARTIAL_COND_Y  PARTIAL_COND_X   rows stored        elements per row
 *   false           false            M0                 N0
 *   true            false            PARTIAL_STORE_M0   N0
 *   false           true             M0                 PARTIAL_STORE_N0
 *   true            true             PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * Every branch goes through STORE_BLOCK_PARTIAL, with N0 passed as the vector
 * size argument in all four cases.
 *
 * The expansion is a bare if / else-if / else chain (not wrapped in
 * do { } while (0)), and each condition argument is evaluated more than once
 * along the chain.
 */
4063*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4064*c217d954SCole Faust    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
4065*c217d954SCole Faust    {                                                                                                                                                     \
4066*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
4067*c217d954SCole Faust    }                                                                                                                                                     \
4068*c217d954SCole Faust    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
4069*c217d954SCole Faust    {                                                                                                                                                     \
4070*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4071*c217d954SCole Faust    }                                                                                                                                                     \
4072*c217d954SCole Faust    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
4073*c217d954SCole Faust    {                                                                                                                                                     \
4074*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
4075*c217d954SCole Faust    }                                                                                                                                                     \
4076*c217d954SCole Faust    else                                                                                                                                                  \
4077*c217d954SCole Faust    {                                                                                                                                                     \
4078*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
4079*c217d954SCole Faust    }
4080*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_X
 *
 * Always stores M0 rows. When PARTIAL_COND_X is false each row stores N0
 * elements; when it is true each row stores PARTIAL_STORE_N0 elements.
 * Both branches go through STORE_BLOCK_PARTIAL with N0 as the vector size.
 *
 * The expansion is a bare if / else statement, not wrapped in do { } while (0).
 */
4081*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
4082*c217d954SCole Faust    if(!(PARTIAL_COND_X))                                                                                         \
4083*c217d954SCole Faust    {                                                                                                             \
4084*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4085*c217d954SCole Faust    }                                                                                                             \
4086*c217d954SCole Faust    else                                                                                                          \
4087*c217d954SCole Faust    {                                                                                                             \
4088*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4089*c217d954SCole Faust    }
4090*c217d954SCole Faust
/* STORE_BLOCK_PARTIAL_IN_Y
 *
 * Always stores N0 elements per row. When PARTIAL_COND_Y is false it stores M0
 * rows; when it is true it stores PARTIAL_STORE_M0 rows.
 * Both branches go through STORE_BLOCK_PARTIAL with N0 as the vector size.
 *
 * The expansion is a bare if / else statement, not wrapped in do { } while (0).
 */
4091*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
4092*c217d954SCole Faust    if(!(PARTIAL_COND_Y))                                                                                         \
4093*c217d954SCole Faust    {                                                                                                             \
4094*c217d954SCole Faust        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
4095*c217d954SCole Faust    }                                                                                                             \
4096*c217d954SCole Faust    else                                                                                                          \
4097*c217d954SCole Faust    {                                                                                                             \
4098*c217d954SCole Faust        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                     \
4099*c217d954SCole Faust    }
4100*c217d954SCole Faust
4101*c217d954SCole Faust
/* STORE_BLOCK_BOUNDARY_AWARE
 *
 * Defined only when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as
 * macros at preprocessing time. Their values pick one of four definitions:
 *
 *   PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0  -> STORE_BLOCK (no run-time test)
 *   PARTIAL_STORE_M0 >  0 && PARTIAL_STORE_N0 == 0  -> STORE_BLOCK_PARTIAL_IN_Y
 *   PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 >  0  -> STORE_BLOCK_PARTIAL_IN_X
 *   anything else                                   -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * All four definitions share one parameter list so call sites do not change;
 * parameters a given variant does not need are simply dropped.
 * Inside the macro, PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are parameter names
 * and therefore refer to the caller's arguments, not to the same-named macros
 * tested by the #if lines.
 */
4102*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
4103*c217d954SCole Faust
4104*c217d954SCole Faust
4105*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
4106*c217d954SCole Faust
4107*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4108*c217d954SCole Faust    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
4109*c217d954SCole Faust
4110*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
4111*c217d954SCole Faust
4112*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4113*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
4114*c217d954SCole Faust
4115*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
4116*c217d954SCole Faust
4117*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4118*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
4119*c217d954SCole Faust
4120*c217d954SCole Faust#else
4121*c217d954SCole Faust
4122*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
4123*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
4124*c217d954SCole Faust
4125*c217d954SCole Faust#endif
4126*c217d954SCole Faust
4127*c217d954SCole Faust#endif
4128*c217d954SCole Faust
4129*c217d954SCole Faust
/* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * Yields, as a uint, the index of the first row handled by block index y when
 * each block covers M0 rows.
 *
 * - When the build defines the macro PARTIAL_STORE_M0, the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 *   i.e. the plain start row shifted back by (M0 - PARTIAL_STORE_M0) % M0 and
 *   clamped so that it never goes below row 0.
 * - Otherwise the result is simply y * M0 and the third argument is ignored.
 *
 * Inside the macro PARTIAL_STORE_M0 is a parameter name, so the value used in
 * the arithmetic is the caller's argument; the build-level macro of the same
 * name only selects which definition is compiled.
 *
 * Every argument is parenthesised in the expansion so that expression
 * arguments (for example `a + b`) keep their intended grouping.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
4138*c217d954SCole Faust
4139*c217d954SCole Faust
4140*c217d954SCole Faust
/* STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
 *
 * Single-row wrapper around STORE_BLOCK_PARTIAL_IN_X: M0 is fixed to 1 and
 * both STRIDE_Y and Z are passed as the literal 0. When cond is false
 * vec_size elements are stored, otherwise leftover elements.
 *
 * Because the row macros paste the row index onto their BASENAME and Z
 * arguments, the value stored is the variable named <basename>0, and the Z
 * argument 0 becomes the token 00 in the address expression.
 */
4141*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
4142*c217d954SCole Faust    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)
4143*c217d954SCole Faust
4144*c217d954SCole Faust
/* Optional OpenCL extensions.
 *
 * Each extension is enabled only when two macros are both defined: an
 * ARM_COMPUTE_* switch and the macro named after the extension itself.
 */

/* cl_khr_fp16 */
4145*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4146*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable
4147*c217d954SCole Faust#endif
4148*c217d954SCole Faust
/* cl_arm_integer_dot_product_int8 */
4149*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
4150*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
4151*c217d954SCole Faust#endif
4152*c217d954SCole Faust
/* cl_arm_integer_dot_product_accumulate_int8 */
4153*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
4154*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
4155*c217d954SCole Faust#endif
4156*c217d954SCole Faust
/* cl_arm_printf, gated on ARM_COMPUTE_DEBUG_ENABLED */
4157*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
4158*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable
4159*c217d954SCole Faust#endif
4160*c217d954SCole Faust
/* Numeric tags for GPU architectures, spaced 0x100 apart.
 * NOTE(review): presumably compared against a build-time architecture macro
 * to select code paths - no use is visible in this part of the file.
 */
4161*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100
4162*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200
4163*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300
4164*c217d954SCole Faust
4165*c217d954SCole Faust
/* CONCAT(a, b): pastes the two tokens a and b together. The arguments are used
 * directly with ##, so they are not macro-expanded before pasting.
 */
4166*c217d954SCole Faust#define CONCAT(a, b) a##b
4167*c217d954SCole Faust
4168*c217d954SCole Faust
/* EXPAND(x): yields x unchanged; x is macro-expanded on the way through. */
4169*c217d954SCole Faust#define EXPAND(x) x
4170*c217d954SCole Faust
4171*c217d954SCole Faust
/* CLAMP(x, min_val, max_val): limits x to the range [min_val, max_val] by
 * taking max(x, min_val) and then min(..., max_val). Relies on min() and max()
 * being available where the macro is used.
 */
4172*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
4173*c217d954SCole Faust
4174*c217d954SCole Faust
/* REV<n>(x): x with its n components in reverse order, expressed as a vector
 * component selection. REV1 returns x itself. Defined for n = 1, 2, 3, 4, 8, 16.
 */
4175*c217d954SCole Faust#define REV1(x) ((x))
4176*c217d954SCole Faust#define REV2(x) ((x).s10)
4177*c217d954SCole Faust#define REV3(x) ((x).s210)
4178*c217d954SCole Faust#define REV4(x) ((x).s3210)
4179*c217d954SCole Faust#define REV8(x) ((x).s76543210)
4180*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210)
4181*c217d954SCole Faust
4182*c217d954SCole Faust
4183*c217d954SCole Faust
/* REVERSE(x, s): dispatches to REV<s>(x). The _STR level lets s be
 * macro-expanded before it is pasted onto REV.
 */
4184*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x))
4185*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s)
4186*c217d954SCole Faust
4187*c217d954SCole Faust
4188*c217d954SCole Faust
/* ROT<s>_<n>(x): the s-component vector x rotated by n positions, expressed as
 * a vector component selection. Component i of the result is component
 * (i - n) mod s of x, so elements move towards higher indices and wrap around.
 * Both n == 0 and n == s return x unchanged.
 * Defined for s = 1, 2, 3, 4, 8, 16 and every n in 0..s.
 */
4189*c217d954SCole Faust#define ROT1_0(x) ((x))
4190*c217d954SCole Faust#define ROT1_1(x) ((x))
4191*c217d954SCole Faust
4192*c217d954SCole Faust#define ROT2_0(x) ((x))
4193*c217d954SCole Faust#define ROT2_1(x) ((x).s10)
4194*c217d954SCole Faust#define ROT2_2(x) ((x))
4195*c217d954SCole Faust
4196*c217d954SCole Faust#define ROT3_0(x) ((x))
4197*c217d954SCole Faust#define ROT3_1(x) ((x).s201)
4198*c217d954SCole Faust#define ROT3_2(x) ((x).s120)
4199*c217d954SCole Faust#define ROT3_3(x) ((x))
4200*c217d954SCole Faust
4201*c217d954SCole Faust#define ROT4_0(x) ((x))
4202*c217d954SCole Faust#define ROT4_1(x) ((x).s3012)
4203*c217d954SCole Faust#define ROT4_2(x) ((x).s2301)
4204*c217d954SCole Faust#define ROT4_3(x) ((x).s1230)
4205*c217d954SCole Faust#define ROT4_4(x) ((x))
4206*c217d954SCole Faust
4207*c217d954SCole Faust#define ROT8_0(x) ((x))
4208*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456)
4209*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345)
4210*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234)
4211*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123)
4212*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012)
4213*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701)
4214*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670)
4215*c217d954SCole Faust#define ROT8_8(x) ((x))
4216*c217d954SCole Faust
4217*c217d954SCole Faust#define ROT16_0(x) ((x))
4218*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE)
4219*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD)
4220*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC)
4221*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB)
4222*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A)
4223*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789)
4224*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678)
4225*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567)
4226*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456)
4227*c217d954SCole Faust#define ROT16_10(x) ((x).s6789ABCDEF012345)
4228*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234)
4229*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123)
4230*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012)
4231*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01)
4232*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0)
4233*c217d954SCole Faust#define ROT16_16(x) ((x))
4234*c217d954SCole Faust
4235*c217d954SCole Faust
4236*c217d954SCole Faust
/* ROTATE(x, s, n): dispatches to ROT<s>_<n>(x). The _STR level lets s and n be
 * macro-expanded before they are pasted into the macro name.
 */
4237*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
4238*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
4239*c217d954SCole Faust
4240*c217d954SCole Faust
4241*c217d954SCole Faust
/* V_OFFS<n>(dt): a vector literal of type dt##n holding the values 0 .. n-1.
 * Defined for n = 1, 2, 3, 4, 8, 16.
 * NOTE(review): V_OFFS1 names the type dt##1 (e.g. int1), so it assumes such a
 * one-element type name is provided elsewhere - confirm.
 */
4242*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0)
4243*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1)
4244*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2)
4245*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
4246*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
4247*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
4248*c217d954SCole Faust
4249*c217d954SCole Faust
4250*c217d954SCole Faust
/* VEC_OFFS(dt, s): dispatches to V_OFFS<s>(dt). The _STR level lets s be
 * macro-expanded before it is pasted onto V_OFFS.
 */
4251*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
4252*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
4253*c217d954SCole Faust
4254*c217d954SCole Faust
/* VLOAD(size): yields the identifier vload<size>, to be followed by its
 * argument list at the call site. The _STR level lets size be macro-expanded
 * before pasting.
 */
4255*c217d954SCole Faust#define VLOAD_STR(size) vload##size
4256*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size)
4257*c217d954SCole Faust
4258*c217d954SCole Faust
/* VLOAD_PARTIAL(size, load_size): yields the identifier
 * vload_partial_<size>_<load_size>, which the lookup table that follows maps
 * to the macro that performs the load. To be followed by (DATA, OFFSET, PTR)
 * at the call site.
 */
4259*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
4260*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
4261*c217d954SCole Faust
/* NO_LOAD(data, offs, ptr): expands to an empty block and ignores all three
 * arguments. Used for table entries that must not load anything.
 */
4262*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \
4263*c217d954SCole Faust    {                            \
4264*c217d954SCole Faust    }
4265*c217d954SCole Faust
4266*c217d954SCole Faust
/* Lookup table vload_partial_<size>_<load_size>, for size in {1, 2, 3, 4, 8, 16}
 * and load_size in 0..16.
 *
 * - load_size == 0 or load_size > size  -> NO_LOAD (nothing is loaded)
 * - 1 <= load_size <= size, size > 1    -> vload_partial_<load_size>
 * - size == 1, load_size == 1           -> vload1
 *
 * NOTE(review): the size-1 entry maps to vload1 while every other size maps
 * load_size 1 to vload_partial_1. vload1 is not defined in this part of the
 * file, so whether it accepts the same (DATA, OFFSET, PTR) argument list as
 * the other entries cannot be confirmed from here.
 */
4267*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD
4268*c217d954SCole Faust#define vload_partial_1_1 vload1
4269*c217d954SCole Faust#define vload_partial_1_2 NO_LOAD
4270*c217d954SCole Faust#define vload_partial_1_3 NO_LOAD
4271*c217d954SCole Faust#define vload_partial_1_4 NO_LOAD
4272*c217d954SCole Faust#define vload_partial_1_5 NO_LOAD
4273*c217d954SCole Faust#define vload_partial_1_6 NO_LOAD
4274*c217d954SCole Faust#define vload_partial_1_7 NO_LOAD
4275*c217d954SCole Faust#define vload_partial_1_8 NO_LOAD
4276*c217d954SCole Faust#define vload_partial_1_9 NO_LOAD
4277*c217d954SCole Faust#define vload_partial_1_10 NO_LOAD
4278*c217d954SCole Faust#define vload_partial_1_11 NO_LOAD
4279*c217d954SCole Faust#define vload_partial_1_12 NO_LOAD
4280*c217d954SCole Faust#define vload_partial_1_13 NO_LOAD
4281*c217d954SCole Faust#define vload_partial_1_14 NO_LOAD
4282*c217d954SCole Faust#define vload_partial_1_15 NO_LOAD
4283*c217d954SCole Faust#define vload_partial_1_16 NO_LOAD
4284*c217d954SCole Faust
4285*c217d954SCole Faust#define vload_partial_2_0 NO_LOAD
4286*c217d954SCole Faust#define vload_partial_2_1 vload_partial_1
4287*c217d954SCole Faust#define vload_partial_2_2 vload_partial_2
4288*c217d954SCole Faust#define vload_partial_2_3 NO_LOAD
4289*c217d954SCole Faust#define vload_partial_2_4 NO_LOAD
4290*c217d954SCole Faust#define vload_partial_2_5 NO_LOAD
4291*c217d954SCole Faust#define vload_partial_2_6 NO_LOAD
4292*c217d954SCole Faust#define vload_partial_2_7 NO_LOAD
4293*c217d954SCole Faust#define vload_partial_2_8 NO_LOAD
4294*c217d954SCole Faust#define vload_partial_2_9 NO_LOAD
4295*c217d954SCole Faust#define vload_partial_2_10 NO_LOAD
4296*c217d954SCole Faust#define vload_partial_2_11 NO_LOAD
4297*c217d954SCole Faust#define vload_partial_2_12 NO_LOAD
4298*c217d954SCole Faust#define vload_partial_2_13 NO_LOAD
4299*c217d954SCole Faust#define vload_partial_2_14 NO_LOAD
4300*c217d954SCole Faust#define vload_partial_2_15 NO_LOAD
4301*c217d954SCole Faust#define vload_partial_2_16 NO_LOAD
4302*c217d954SCole Faust
4303*c217d954SCole Faust#define vload_partial_3_0 NO_LOAD
4304*c217d954SCole Faust#define vload_partial_3_1 vload_partial_1
4305*c217d954SCole Faust#define vload_partial_3_2 vload_partial_2
4306*c217d954SCole Faust#define vload_partial_3_3 vload_partial_3
4307*c217d954SCole Faust#define vload_partial_3_4 NO_LOAD
4308*c217d954SCole Faust#define vload_partial_3_5 NO_LOAD
4309*c217d954SCole Faust#define vload_partial_3_6 NO_LOAD
4310*c217d954SCole Faust#define vload_partial_3_7 NO_LOAD
4311*c217d954SCole Faust#define vload_partial_3_8 NO_LOAD
4312*c217d954SCole Faust#define vload_partial_3_9 NO_LOAD
4313*c217d954SCole Faust#define vload_partial_3_10 NO_LOAD
4314*c217d954SCole Faust#define vload_partial_3_11 NO_LOAD
4315*c217d954SCole Faust#define vload_partial_3_12 NO_LOAD
4316*c217d954SCole Faust#define vload_partial_3_13 NO_LOAD
4317*c217d954SCole Faust#define vload_partial_3_14 NO_LOAD
4318*c217d954SCole Faust#define vload_partial_3_15 NO_LOAD
4319*c217d954SCole Faust#define vload_partial_3_16 NO_LOAD
4320*c217d954SCole Faust
4321*c217d954SCole Faust#define vload_partial_4_0 NO_LOAD
4322*c217d954SCole Faust#define vload_partial_4_1 vload_partial_1
4323*c217d954SCole Faust#define vload_partial_4_2 vload_partial_2
4324*c217d954SCole Faust#define vload_partial_4_3 vload_partial_3
4325*c217d954SCole Faust#define vload_partial_4_4 vload_partial_4
4326*c217d954SCole Faust#define vload_partial_4_5 NO_LOAD
4327*c217d954SCole Faust#define vload_partial_4_6 NO_LOAD
4328*c217d954SCole Faust#define vload_partial_4_7 NO_LOAD
4329*c217d954SCole Faust#define vload_partial_4_8 NO_LOAD
4330*c217d954SCole Faust#define vload_partial_4_9 NO_LOAD
4331*c217d954SCole Faust#define vload_partial_4_10 NO_LOAD
4332*c217d954SCole Faust#define vload_partial_4_11 NO_LOAD
4333*c217d954SCole Faust#define vload_partial_4_12 NO_LOAD
4334*c217d954SCole Faust#define vload_partial_4_13 NO_LOAD
4335*c217d954SCole Faust#define vload_partial_4_14 NO_LOAD
4336*c217d954SCole Faust#define vload_partial_4_15 NO_LOAD
4337*c217d954SCole Faust#define vload_partial_4_16 NO_LOAD
4338*c217d954SCole Faust
4339*c217d954SCole Faust#define vload_partial_8_0 NO_LOAD
4340*c217d954SCole Faust#define vload_partial_8_1 vload_partial_1
4341*c217d954SCole Faust#define vload_partial_8_2 vload_partial_2
4342*c217d954SCole Faust#define vload_partial_8_3 vload_partial_3
4343*c217d954SCole Faust#define vload_partial_8_4 vload_partial_4
4344*c217d954SCole Faust#define vload_partial_8_5 vload_partial_5
4345*c217d954SCole Faust#define vload_partial_8_6 vload_partial_6
4346*c217d954SCole Faust#define vload_partial_8_7 vload_partial_7
4347*c217d954SCole Faust#define vload_partial_8_8 vload_partial_8
4348*c217d954SCole Faust#define vload_partial_8_9 NO_LOAD
4349*c217d954SCole Faust#define vload_partial_8_10 NO_LOAD
4350*c217d954SCole Faust#define vload_partial_8_11 NO_LOAD
4351*c217d954SCole Faust#define vload_partial_8_12 NO_LOAD
4352*c217d954SCole Faust#define vload_partial_8_13 NO_LOAD
4353*c217d954SCole Faust#define vload_partial_8_14 NO_LOAD
4354*c217d954SCole Faust#define vload_partial_8_15 NO_LOAD
4355*c217d954SCole Faust#define vload_partial_8_16 NO_LOAD
4356*c217d954SCole Faust
4357*c217d954SCole Faust#define vload_partial_16_0 NO_LOAD
4358*c217d954SCole Faust#define vload_partial_16_1 vload_partial_1
4359*c217d954SCole Faust#define vload_partial_16_2 vload_partial_2
4360*c217d954SCole Faust#define vload_partial_16_3 vload_partial_3
4361*c217d954SCole Faust#define vload_partial_16_4 vload_partial_4
4362*c217d954SCole Faust#define vload_partial_16_5 vload_partial_5
4363*c217d954SCole Faust#define vload_partial_16_6 vload_partial_6
4364*c217d954SCole Faust#define vload_partial_16_7 vload_partial_7
4365*c217d954SCole Faust#define vload_partial_16_8 vload_partial_8
4366*c217d954SCole Faust#define vload_partial_16_9 vload_partial_9
4367*c217d954SCole Faust#define vload_partial_16_10 vload_partial_10
4368*c217d954SCole Faust#define vload_partial_16_11 vload_partial_11
4369*c217d954SCole Faust#define vload_partial_16_12 vload_partial_12
4370*c217d954SCole Faust#define vload_partial_16_13 vload_partial_13
4371*c217d954SCole Faust#define vload_partial_16_14 vload_partial_14
4372*c217d954SCole Faust#define vload_partial_16_15 vload_partial_15
4373*c217d954SCole Faust#define vload_partial_16_16 vload_partial_16
4374*c217d954SCole Faust
4375*c217d954SCole Faust
/* vload_partial_<n>(DATA, OFFSET, PTR), n = 1 .. 11
 *
 * Assigns loaded values to the first n components of DATA and leaves the
 * remaining components untouched.
 *
 * - n = 1, 2, 3, 4, 8: a single assignment DATA.s0.. = vload<n>(OFFSET, PTR).
 * - n = 5, 6, 7: a 4-element load into DATA.s0123 followed by a load of the
 *   remaining 1, 2 or 3 elements from PTR + 4 into the next components.
 * - n = 9, 10, 11: an 8-element load into DATA.s01234567 followed by a load of
 *   the remaining 1, 2 or 3 elements from PTR + 8.
 *
 * In the composite cases OFFSET is passed unchanged to both loads; only PTR is
 * advanced for the second one.
 *
 * Every expansion already ends in ';', and the composite ones expand to two
 * separate statements that are not wrapped in braces.
 *
 * vload1 is not defined in this part of the file.
 */
4376*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
4377*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
4378*c217d954SCole Faust
4379*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
4380*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
4381*c217d954SCole Faust
4382*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
4383*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
4384*c217d954SCole Faust
4385*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
4386*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
4387*c217d954SCole Faust
4388*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
4389*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4390*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
4391*c217d954SCole Faust
4392*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
4393*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4394*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
4395*c217d954SCole Faust
4396*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
4397*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
4398*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
4399*c217d954SCole Faust
4400*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
4401*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
4402*c217d954SCole Faust
4403*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
4404*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4405*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
4406*c217d954SCole Faust
4407*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
4408*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4409*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
4410*c217d954SCole Faust
4411*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
4412*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4413*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
4414*c217d954SCole Faust
4415*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
4416*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4417*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
4418*c217d954SCole Faust
4419*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
4420*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4421*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
4422*c217d954SCole Faust
4423*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
4424*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4425*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
4426*c217d954SCole Faust
4427*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
4428*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
4429*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
4430*c217d954SCole Faust
4431*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
4432*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
4433*c217d954SCole Faust
4434*c217d954SCole Faust
4435*c217d954SCole Faust
/* PIXEL_UNIT<n>: a vector size n expressed in units of 4 elements
 * (4 -> 1, 8 -> 2, 16 -> 4). */
4436*c217d954SCole Faust#define PIXEL_UNIT4 1
4437*c217d954SCole Faust#define PIXEL_UNIT8 2
4438*c217d954SCole Faust#define PIXEL_UNIT16 4
4439*c217d954SCole Faust
4440*c217d954SCole Faust
/* Maps a vector size (4, 8 or 16) to its PIXEL_UNIT value. The two-level
 * definition lets vec_size be a macro that is expanded before token pasting. */
4441*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
4442*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
4443*c217d954SCole Faust
4444*c217d954SCole Faust
/* read_image2d_floatx<k>: read k (1, 2 or 4) pixels at x_coord .. x_coord + k - 1
 * on row y_coord with read_imagef and concatenate them into a float4, float8
 * or float16.
 * NOTE(review): each expansion ends with a semicolon, so these macros form a
 * complete statement and cannot be nested inside a larger expression. Callers
 * may rely on that semicolon - verify before removing it. */
4445*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
4446*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
4447*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
4448*c217d954SCole Faust
/* Half-precision readers (read_imageh); only defined when both
 * ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined. */
4449*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4450*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
4451*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
4452*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
4453*c217d954SCole Faust#endif
4454*c217d954SCole Faust
/* write_image2d_floatx<k>: write k (1, 2 or 4) pixels starting at x_coord on
 * row y_coord with write_imagef, taking consecutive 4-lane slices of values
 * (s0123, s4567, s89AB, sCDEF). Multi-pixel variants chain the writes with the
 * comma operator; the expansion ends with a semicolon. */
4455*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
4456*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
4457*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4458*c217d954SCole Faust
/* Half-precision writers (write_imageh), guarded like the half readers. */
4459*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
4460*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
4461*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
4462*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
4463*c217d954SCole Faust#endif
4464*c217d954SCole Faust
4465*c217d954SCole Faust
/* READ_IMAGE2D(data_type, n0, ...): selects read_image2d_<data_type>x<n0> by
 * token pasting. The _STR indirection lets data_type and n0 be macros that
 * are expanded before pasting. */
4466*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
4467*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
4468*c217d954SCole Faust
4469*c217d954SCole Faust
/* WRITE_IMAGE2D(data_type, n0, ...): selects write_image2d_<data_type>x<n0>
 * in the same way. */
4470*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
4471*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
4472*c217d954SCole Faust
/* VSTORE(size): names the vstore<size> function after expanding size. */
4473*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
4474*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
4475*c217d954SCole Faust
/* <type>1 aliases: let macros that paste a vector size onto a type name
 * (for example VEC_DATA_TYPE(type, 1)) resolve to the scalar type. */
4476*c217d954SCole Faust#define float1 float
4477*c217d954SCole Faust#define half1 half
4478*c217d954SCole Faust#define char1 char
4479*c217d954SCole Faust#define uchar1 uchar
4480*c217d954SCole Faust#define short1 short
4481*c217d954SCole Faust#define ushort1 ushort
4482*c217d954SCole Faust#define int1 int
4483*c217d954SCole Faust#define uint1 uint
4484*c217d954SCole Faust#define long1 long
4485*c217d954SCole Faust#define ulong1 ulong
4486*c217d954SCole Faust#define double1 double
4487*c217d954SCole Faust
/* Scalar counterparts of vloadN / vstoreN so that size-generic macros can
 * paste a size of 1.
 *   vload1(OFFSET, PTR)        reads the element at PTR + OFFSET.
 *   vstore1(DATA, OFFSET, PTR) writes DATA to the element at PTR + OFFSET.
 * Every argument and the whole expansion are parenthesized so that operator
 * expressions (for example i & 1) can be passed as arguments and the result
 * can be embedded in a larger expression without precedence surprises. */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))
4490*c217d954SCole Faust
4491*c217d954SCole Faust
/* VSTORE_PARTIAL(size, store_size): names the macro vstore_partial_<size>_<store_size>
 * after expanding both arguments; size is the vector width and store_size the
 * number of leading elements to store. */
4492*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
4493*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
4494*c217d954SCole Faust
/* NO_STORE: takes the same three arguments as the store macros and expands to
 * an empty block, i.e. nothing is written. */
4495*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
4496*c217d954SCole Faust    {                             \
4497*c217d954SCole Faust    }
4498*c217d954SCole Faust
4499*c217d954SCole Faust
/* Dispatch table vstore_partial_<S>_<N> for vector widths S in {1, 2, 3, 4, 8, 16}
 * and store counts N in 0..16. For 1 <= N <= S the entry forwards to
 * vstore_partial_<N> (vstore1 for the 1_1 entry); for N == 0 or N > S it is
 * NO_STORE, so nothing is written. */
4500*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
4501*c217d954SCole Faust#define vstore_partial_1_1 vstore1
4502*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
4503*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
4504*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
4505*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
4506*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
4507*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
4508*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
4509*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
4510*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
4511*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
4512*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
4513*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
4514*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
4515*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
4516*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
4517*c217d954SCole Faust
4518*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
4519*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
4520*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
4521*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
4522*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
4523*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
4524*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
4525*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
4526*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
4527*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
4528*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
4529*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
4530*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
4531*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
4532*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
4533*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
4534*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
4535*c217d954SCole Faust
4536*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
4537*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
4538*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
4539*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
4540*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
4541*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
4542*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
4543*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
4544*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
4545*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
4546*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
4547*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
4548*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
4549*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
4550*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
4551*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
4552*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
4553*c217d954SCole Faust
4554*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
4555*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
4556*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
4557*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
4558*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
4559*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
4560*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
4561*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
4562*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
4563*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
4564*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
4565*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
4566*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
4567*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
4568*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
4569*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
4570*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
4571*c217d954SCole Faust
4572*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
4573*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
4574*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
4575*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
4576*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
4577*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
4578*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
4579*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
4580*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
4581*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
4582*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
4583*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
4584*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
4585*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
4586*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
4587*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
4588*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
4589*c217d954SCole Faust
4590*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
4591*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
4592*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
4593*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
4594*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
4595*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
4596*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
4597*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
4598*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
4599*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
4600*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
4601*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
4602*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
4603*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
4604*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
4605*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
4606*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
4607*c217d954SCole Faust
4608*c217d954SCole Faust
/* vstore_partial_N(DATA, OFFSET, PTR): store the first N lanes of DATA to PTR.
 * Counts 1, 2, 3, 4, 8 and 16 use a single vstoreN; every other count is built
 * from a 4- or 8-wide head store followed by a tail store at PTR + 4 or PTR + 8.
 * NOTE(review): the composed variants hand the same OFFSET to head and tail.
 * vstoreN scales its offset by N while vstore1 adds it unscaled, so head and
 * tail only address contiguous elements when OFFSET is 0 - confirm that no
 * caller passes a non-zero OFFSET. */
4609*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
4610*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
4611*c217d954SCole Faust
4612*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
4613*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
4614*c217d954SCole Faust
4615*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
4616*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
4617*c217d954SCole Faust
4618*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
4619*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
4620*c217d954SCole Faust
/* 5..7 elements: lanes 0-3 in one store, the remainder at PTR + 4. */
4621*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
4622*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4623*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
4624*c217d954SCole Faust
4625*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
4626*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4627*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
4628*c217d954SCole Faust
4629*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
4630*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
4631*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
4632*c217d954SCole Faust
4633*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
4634*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
4635*c217d954SCole Faust
/* 9..15 elements: lanes 0-7 in one store, the remainder at PTR + 8. The
 * lowercase lane letters used below (s89a, s89ab, s89abcdef) name the same
 * lanes as the uppercase spelling used by the vload_partial macros. */
4636*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
4637*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4638*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
4639*c217d954SCole Faust
4640*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
4641*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4642*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
4643*c217d954SCole Faust
4644*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
4645*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4646*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
4647*c217d954SCole Faust
4648*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
4649*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4650*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
4651*c217d954SCole Faust
/* 13..15 elements: the tail macro receives the whole upper half and selects
 * the lanes it needs from it. */
4652*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
4653*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4654*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
4655*c217d954SCole Faust
4656*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
4657*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4658*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
4659*c217d954SCole Faust
4660*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
4661*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
4662*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
4663*c217d954SCole Faust
4664*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
4665*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
4666*c217d954SCole Faust
4667*c217d954SCole Faust
4668*c217d954SCole Faust
4669*c217d954SCole Faust
4670*c217d954SCole Faust
/* Saturating-conversion spellings for floating-point destination types.
 * OpenCL C only provides the _sat conversions for integer destinations, so
 * the float and half _sat names simply forward to the plain conversion of the
 * same type and width. This lets CONVERT_SAT(x, type) be used generically. */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Scalar half forwards to convert_half, matching convert_half1_sat below; it
 * previously forwarded to convert_float and so produced a float result. */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
4685*c217d954SCole Faust
/* convert_<type>1 aliases: let CONVERT(x, VEC_DATA_TYPE(type, 1)) resolve to
 * the scalar convert_<type> function. */
4686*c217d954SCole Faust#define convert_float1 convert_float
4687*c217d954SCole Faust#define convert_half1 convert_half
4688*c217d954SCole Faust#define convert_char1 convert_char
4689*c217d954SCole Faust#define convert_uchar1 convert_uchar
4690*c217d954SCole Faust#define convert_short1 convert_short
4691*c217d954SCole Faust#define convert_ushort1 convert_ushort
4692*c217d954SCole Faust#define convert_int1 convert_int
4693*c217d954SCole Faust#define convert_uint1 convert_uint
4694*c217d954SCole Faust#define convert_long1 convert_long
4695*c217d954SCole Faust#define convert_ulong1 convert_ulong
4696*c217d954SCole Faust#define convert_double1 convert_double
4697*c217d954SCole Faust
/* convert_<type>1_sat aliases: the same mapping for the saturating spellings.
 * The convert_uchar{2,3,4,8,16}_sat entries map each name to itself and so
 * have no effect on expansion.
 * NOTE(review): convert_double1_sat forwards to convert_double_sat, whereas
 * the float and half _sat names earlier in this file forward to the plain
 * conversions - confirm that convert_double_sat resolves to something. */
4698*c217d954SCole Faust#define convert_char1_sat convert_char_sat
4699*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
4700*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
4701*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
4702*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
4703*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
4704*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
4705*c217d954SCole Faust#define convert_short1_sat convert_short_sat
4706*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
4707*c217d954SCole Faust#define convert_int1_sat convert_int_sat
4708*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
4709*c217d954SCole Faust#define convert_long1_sat convert_long_sat
4710*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
4711*c217d954SCole Faust#define convert_double1_sat convert_double_sat
4712*c217d954SCole Faust
/* VEC_DATA_TYPE(type, size): pastes type and size into a vector type name,
 * e.g. (float, 4) -> float4. The _STR indirection expands macro arguments
 * before pasting. */
4713*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
4714*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
4715*c217d954SCole Faust
/* CONVERT(x, type): expands to (convert_<type>((x))). */
4716*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
4717*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
4718*c217d954SCole Faust
/* CONVERT_SAT(x, type): expands to (convert_<type>_sat((x))). */
4719*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
4720*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
4721*c217d954SCole Faust
/* CONVERT_SAT_ROUND(x, type, round): expands to (convert_<type>_sat_<round>((x))),
 * where round is pasted verbatim as the rounding-mode suffix. */
4722*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
4723*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
4724*c217d954SCole Faust
/* select_vec_dt_<type>(size): integer vector type associated with <type>.
 * Integer types map to themselves; half maps to short and float maps to int,
 * i.e. an integer type with the same element width.
 * NOTE(review): presumably the condition-mask type for select() - confirm. */
4725*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
4726*c217d954SCole Faust#define select_vec_dt_char(size) char##size
4727*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
4728*c217d954SCole Faust#define select_vec_dt_short(size) short##size
4729*c217d954SCole Faust#define select_vec_dt_half(size) short##size
4730*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
4731*c217d954SCole Faust#define select_vec_dt_int(size) int##size
4732*c217d954SCole Faust#define select_vec_dt_float(size) int##size
4733*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
4734*c217d954SCole Faust#define select_vec_dt_long(size) long##size
4735*c217d954SCole Faust
/* SELECT_VEC_DATA_TYPE(type, size) dispatches on type after macro expansion;
 * SELECT_DATA_TYPE(type) is the size-1 form, which yields a <type>1 name that
 * the scalar aliases (int1, short1, ...) turn into a scalar type. */
4736*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
4737*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
4738*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
4739*c217d954SCole Faust
/* signed_int_vec_dt_<type>(size): signed integer vector type with the same
 * element width as <type> (uchar/char -> char, ushort/short/half -> short,
 * uint/int/float -> int, ulong/long -> long). */
4740*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
4741*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
4742*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
4743*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
4744*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
4745*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
4746*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
4747*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
4748*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
4749*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
4750*c217d954SCole Faust
/* SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches on type after macro
 * expansion; SIGNED_INT_DATA_TYPE(type) is the size-1 form. */
4751*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
4752*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
4753*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
4754*c217d954SCole Faust
/* sum_reduce_<n>(x): sum of the n lanes of vector x, built recursively from
 * the two halves. These helpers deliberately expand to a flat, unparenthesized
 * chain of additions so that the lanes are summed strictly left to right;
 * adding inner parentheses would change floating-point rounding. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): sum of the first-level lanes of a size-wide vector.
 * The whole expansion is wrapped in parentheses here, at the public entry
 * point, so that the result can be used as an operand (for example
 * SUM_REDUCE(v, 4) * k) without the neighbouring operator binding to only the
 * last lane. The _STR indirection expands size before token pasting. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
4764*c217d954SCole Faust
/* prod_reduce_<n>(x): product of the n lanes of vector x, built recursively
 * from the two halves. These helpers deliberately expand to a flat,
 * unparenthesized chain of multiplications so that the lanes are multiplied
 * strictly left to right; adding inner parentheses would change
 * floating-point rounding. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): product of the lanes of a size-wide vector. The whole
 * expansion is wrapped in parentheses here, at the public entry point, so
 * that the result can be used as an operand (for example a / PROD_REDUCE(v, 2))
 * without the neighbouring operator binding to only the first lane. The _STR
 * indirection expands size before token pasting. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
4774*c217d954SCole Faust
/* max_reduce_<n>(x): maximum of the n lanes of vector x, computed as a tree of
 * max() calls over the two halves. Each expansion is a single max() call (or
 * a parenthesized x), so it is safe to use as an operand. */
4775*c217d954SCole Faust#define max_reduce_1(x) (x)
4776*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
4777*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
4778*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
4779*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
4780*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
4781*c217d954SCole Faust
/* MAX_REDUCE(x, size): selects max_reduce_<size> after expanding size. */
4782*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
4783*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
4784*c217d954SCole Faust
/* <KIND>_DECLARATION(name): expands to a comma-separated parameter-list
 * fragment whose identifiers are all prefixed with name: a __global uchar
 * pointer, one uint stride and one uint step per dimension, and the uint
 * offset of the first element in bytes. The expansion has no trailing comma. */

/* One dimension (x). */
4785*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
4786*c217d954SCole Faust    __global uchar *name##_ptr,      \
4787*c217d954SCole Faust    uint        name##_stride_x, \
4788*c217d954SCole Faust    uint        name##_step_x,   \
4789*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4790*c217d954SCole Faust
/* Two dimensions (x, y). */
4791*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
4792*c217d954SCole Faust    __global uchar *name##_ptr,      \
4793*c217d954SCole Faust    uint        name##_stride_x, \
4794*c217d954SCole Faust    uint        name##_step_x,   \
4795*c217d954SCole Faust    uint        name##_stride_y, \
4796*c217d954SCole Faust    uint        name##_step_y,   \
4797*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4798*c217d954SCole Faust
/* Three dimensions (x, y, z). */
4799*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
4800*c217d954SCole Faust    __global uchar *name##_ptr,      \
4801*c217d954SCole Faust    uint        name##_stride_x, \
4802*c217d954SCole Faust    uint        name##_step_x,   \
4803*c217d954SCole Faust    uint        name##_stride_y, \
4804*c217d954SCole Faust    uint        name##_step_y,   \
4805*c217d954SCole Faust    uint        name##_stride_z, \
4806*c217d954SCole Faust    uint        name##_step_z,   \
4807*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4808*c217d954SCole Faust
/* Four dimensions (x, y, z, w). */
4809*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
4810*c217d954SCole Faust    __global uchar *name##_ptr,      \
4811*c217d954SCole Faust    uint        name##_stride_x, \
4812*c217d954SCole Faust    uint        name##_step_x,   \
4813*c217d954SCole Faust    uint        name##_stride_y, \
4814*c217d954SCole Faust    uint        name##_step_y,   \
4815*c217d954SCole Faust    uint        name##_stride_z, \
4816*c217d954SCole Faust    uint        name##_step_z,   \
4817*c217d954SCole Faust    uint        name##_stride_w, \
4818*c217d954SCole Faust    uint        name##_step_w,   \
4819*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4820*c217d954SCole Faust
/* Five dimensions (x, y, z, w, v). */
4821*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
4822*c217d954SCole Faust    __global uchar *name##_ptr,      \
4823*c217d954SCole Faust    uint        name##_stride_x, \
4824*c217d954SCole Faust    uint        name##_step_x,   \
4825*c217d954SCole Faust    uint        name##_stride_y, \
4826*c217d954SCole Faust    uint        name##_step_y,   \
4827*c217d954SCole Faust    uint        name##_stride_z, \
4828*c217d954SCole Faust    uint        name##_step_z,   \
4829*c217d954SCole Faust    uint        name##_stride_w, \
4830*c217d954SCole Faust    uint        name##_step_w,   \
4831*c217d954SCole Faust    uint        name##_stride_v, \
4832*c217d954SCole Faust    uint        name##_step_v,   \
4833*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
4834*c217d954SCole Faust
/* CONVERT_TO_VECTOR_STRUCT(name): calls update_vector_workitem_ptr with the
 * name-prefixed pointer, first-element offset, x stride and x step. */
4835*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
4836*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
4837*c217d954SCole Faust
/* Same call with the step argument replaced by 0. */
4838*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
4839*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
4840*c217d954SCole Faust
/* CONVERT_TO_IMAGE_STRUCT(name): calls update_image_workitem_ptr with the
 * name-prefixed pointer, first-element offset, and the x and y strides and
 * steps. */
4841*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
4842*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
4843*c217d954SCole Faust
/* Same call with both step arguments replaced by 0. */
4844*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
4845*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
4846*c217d954SCole Faust
/* CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name): calls
 * update_image_from_tensor3D_workitem_ptr with the name-prefixed pointer,
 * first-element offset, and the x, y and z strides and steps.
 * This macro used to be defined twice with identical text; only one
 * definition is kept. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Same call with the x and y steps replaced by 0; the z step is still passed
 * through. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
4855*c217d954SCole Faust
4856*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
4857*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
4858*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
4859*c217d954SCole Faust
4860*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
4861*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
4862*c217d954SCole Faust
/**
 * Build a Tensor4D for the current work-item from the kernel arguments
 * prefixed with @p name.
 *
 * @param name     Prefix shared by the kernel arguments describing the tensor
 * @param mod_size Forwarded unchanged to update_tensor4D_workitem_ptr()
 */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr( \
        name##_ptr, \
        name##_offset_first_element_in_bytes, \
        name##_stride_x, name##_step_x, \
        name##_stride_y, name##_step_y, \
        name##_stride_z, name##_step_z, \
        name##_stride_w, name##_step_w, \
        mod_size)
4866*c217d954SCole Faust
/**
 * Build a Tensor4D from the kernel arguments prefixed with @p name, passing 0
 * for every step so the work-item ids do not move the pointer.
 *
 * @param name     Prefix shared by the kernel arguments describing the tensor
 * @param mod_size Forwarded unchanged to update_tensor4D_workitem_ptr()
 */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr( \
        name##_ptr, \
        name##_offset_first_element_in_bytes, \
        name##_stride_x, 0, \
        name##_stride_y, 0, \
        name##_stride_z, 0, \
        name##_stride_w, 0, \
        mod_size)
4869*c217d954SCole Faust
/**
 * Build a Tensor3D from the kernel arguments prefixed with @p name through
 * tensor3D_ptr_no_update(), which stores the pointer as given.
 *
 * @param name Prefix shared by the kernel arguments describing the tensor
 */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update( \
        name##_ptr, \
        name##_offset_first_element_in_bytes, \
        name##_stride_x, name##_step_x, \
        name##_stride_y, name##_step_y, \
        name##_stride_z, name##_step_z)
4873*c217d954SCole Faust
4874*c217d954SCole Faust
4875*c217d954SCole Fausttypedef struct Vector
4876*c217d954SCole Faust{
4877*c217d954SCole Faust    __global uchar *ptr;
4878*c217d954SCole Faust    int             offset_first_element_in_bytes;
4879*c217d954SCole Faust    int             stride_x;
4880*c217d954SCole Faust} Vector;
4881*c217d954SCole Faust
4882*c217d954SCole Faust
4883*c217d954SCole Fausttypedef struct Image
4884*c217d954SCole Faust{
4885*c217d954SCole Faust    __global uchar *ptr;
4886*c217d954SCole Faust    int             offset_first_element_in_bytes;
4887*c217d954SCole Faust    int             stride_x;
4888*c217d954SCole Faust    int             stride_y;
4889*c217d954SCole Faust} Image;
4890*c217d954SCole Faust
4891*c217d954SCole Faust
4892*c217d954SCole Fausttypedef struct Tensor3D
4893*c217d954SCole Faust{
4894*c217d954SCole Faust    __global uchar *ptr;
4895*c217d954SCole Faust    int             offset_first_element_in_bytes;
4896*c217d954SCole Faust    int             stride_x;
4897*c217d954SCole Faust    int             stride_y;
4898*c217d954SCole Faust    int             stride_z;
4899*c217d954SCole Faust} Tensor3D;
4900*c217d954SCole Faust
4901*c217d954SCole Faust
4902*c217d954SCole Fausttypedef struct Tensor4D
4903*c217d954SCole Faust{
4904*c217d954SCole Faust    __global uchar *ptr;
4905*c217d954SCole Faust    int             offset_first_element_in_bytes;
4906*c217d954SCole Faust    int             stride_x;
4907*c217d954SCole Faust    int             stride_y;
4908*c217d954SCole Faust    int             stride_z;
4909*c217d954SCole Faust    int             stride_w;
4910*c217d954SCole Faust} Tensor4D;
4911*c217d954SCole Faust
4912*c217d954SCole Faust
4913*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
4914*c217d954SCole Faust{
4915*c217d954SCole Faust    Vector vector =
4916*c217d954SCole Faust    {
4917*c217d954SCole Faust        .ptr                           = ptr,
4918*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4919*c217d954SCole Faust        .stride_x                      = stride_x,
4920*c217d954SCole Faust    };
4921*c217d954SCole Faust    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
4922*c217d954SCole Faust    return vector;
4923*c217d954SCole Faust}
4924*c217d954SCole Faust
4925*c217d954SCole Faust
4926*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
4927*c217d954SCole Faust{
4928*c217d954SCole Faust    Image img =
4929*c217d954SCole Faust    {
4930*c217d954SCole Faust        .ptr                           = ptr,
4931*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4932*c217d954SCole Faust        .stride_x                      = stride_x,
4933*c217d954SCole Faust        .stride_y                      = stride_y
4934*c217d954SCole Faust    };
4935*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
4936*c217d954SCole Faust    return img;
4937*c217d954SCole Faust}
4938*c217d954SCole Faust
4939*c217d954SCole Faust
4940*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4941*c217d954SCole Faust{
4942*c217d954SCole Faust    Image img =
4943*c217d954SCole Faust    {
4944*c217d954SCole Faust        .ptr                           = ptr,
4945*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4946*c217d954SCole Faust        .stride_x                      = stride_x,
4947*c217d954SCole Faust        .stride_y                      = stride_y
4948*c217d954SCole Faust    };
4949*c217d954SCole Faust    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4950*c217d954SCole Faust    return img;
4951*c217d954SCole Faust}
4952*c217d954SCole Faust
4953*c217d954SCole Faust
4954*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4955*c217d954SCole Faust{
4956*c217d954SCole Faust    Tensor3D tensor =
4957*c217d954SCole Faust    {
4958*c217d954SCole Faust        .ptr                           = ptr,
4959*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4960*c217d954SCole Faust        .stride_x                      = stride_x,
4961*c217d954SCole Faust        .stride_y                      = stride_y,
4962*c217d954SCole Faust        .stride_z                      = stride_z
4963*c217d954SCole Faust    };
4964*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
4965*c217d954SCole Faust    return tensor;
4966*c217d954SCole Faust}
4967*c217d954SCole Faust
4968*c217d954SCole Faust
4969*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
4970*c217d954SCole Faust{
4971*c217d954SCole Faust    Tensor3D tensor =
4972*c217d954SCole Faust    {
4973*c217d954SCole Faust        .ptr                           = ptr,
4974*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4975*c217d954SCole Faust        .stride_x                      = stride_x,
4976*c217d954SCole Faust        .stride_y                      = stride_y,
4977*c217d954SCole Faust        .stride_z                      = stride_z
4978*c217d954SCole Faust    };
4979*c217d954SCole Faust    return tensor;
4980*c217d954SCole Faust}
4981*c217d954SCole Faust
4982*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
4983*c217d954SCole Faust                                             uint step_w,
4984*c217d954SCole Faust                                             uint mod_size)
4985*c217d954SCole Faust{
4986*c217d954SCole Faust    Tensor4D tensor =
4987*c217d954SCole Faust    {
4988*c217d954SCole Faust        .ptr                           = ptr,
4989*c217d954SCole Faust        .offset_first_element_in_bytes = offset_first_element_in_bytes,
4990*c217d954SCole Faust        .stride_x                      = stride_x,
4991*c217d954SCole Faust        .stride_y                      = stride_y,
4992*c217d954SCole Faust        .stride_z                      = stride_z,
4993*c217d954SCole Faust        .stride_w                      = stride_w
4994*c217d954SCole Faust    };
4995*c217d954SCole Faust
4996*c217d954SCole Faust    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
4997*c217d954SCole Faust    return tensor;
4998*c217d954SCole Faust}
4999*c217d954SCole Faust
5000*c217d954SCole Faust
5001*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x)
5002*c217d954SCole Faust{
5003*c217d954SCole Faust    return vec->ptr + x * vec->stride_x;
5004*c217d954SCole Faust}
5005*c217d954SCole Faust
5006*c217d954SCole Faust
5007*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y)
5008*c217d954SCole Faust{
5009*c217d954SCole Faust    return img->ptr + x * img->stride_x + y * img->stride_y;
5010*c217d954SCole Faust}
5011*c217d954SCole Faust
5012*c217d954SCole Faust
5013*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
5014*c217d954SCole Faust{
5015*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
5016*c217d954SCole Faust}
5017*c217d954SCole Faust
5018*c217d954SCole Faust
5019*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
5020*c217d954SCole Faust{
5021*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
5022*c217d954SCole Faust}
5023*c217d954SCole Faust
5024*c217d954SCole Faust
5025*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
5026*c217d954SCole Faust{
5027*c217d954SCole Faust    uint num_elements = width * height;
5028*c217d954SCole Faust
5029*c217d954SCole Faust    const uint z = index / num_elements;
5030*c217d954SCole Faust
5031*c217d954SCole Faust    index %= num_elements;
5032*c217d954SCole Faust
5033*c217d954SCole Faust    const uint y = index / width;
5034*c217d954SCole Faust
5035*c217d954SCole Faust    index %= width;
5036*c217d954SCole Faust
5037*c217d954SCole Faust    const uint x = index;
5038*c217d954SCole Faust
5039*c217d954SCole Faust    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
5040*c217d954SCole Faust}
5041*c217d954SCole Faust
5042*c217d954SCole Faust#endif
5043*c217d954SCole Faust
5044*c217d954SCole Faust
/**
 * Convert @p value to @p dst_type with the convert_<dst_type>_rte builtin
 * (round to nearest even). The _STR form does the token pasting; the outer
 * macro exists so that macro arguments are expanded before pasting.
 */
#define CONVERT_DOWN_RTE_STR(value, dst_type) (convert_##dst_type##_rte((value)))
#define CONVERT_DOWN_RTE(value, dst_type) CONVERT_DOWN_RTE_STR(value, dst_type)
5047*c217d954SCole Faust
5048*c217d954SCole Faust
5049*c217d954SCole Faustinline uchar quantize_qasymm8(float input, float offset, float scale)
5050*c217d954SCole Faust{
5051*c217d954SCole Faust    float out_f32 = input / scale + offset;
5052*c217d954SCole Faust    uchar res_u8  = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
5053*c217d954SCole Faust    return res_u8;
5054*c217d954SCole Faust}
5055*c217d954SCole Faust
5056*c217d954SCole Faust
5057*c217d954SCole Faustinline float dequantize_qasymm8(uchar input, float offset, float scale)
5058*c217d954SCole Faust{
5059*c217d954SCole Faust    return ((float)input - offset) * scale;
5060*c217d954SCole Faust}
5061*c217d954SCole Faust
5062*c217d954SCole Faust
/**
 * Dequantize a signed 8-bit value to float.
 *
 * @param[in] input  Quantized value
 * @param[in] offset Quantization offset, subtracted before scaling
 * @param[in] scale  Quantization scale, used as multiplier
 *
 * @return (input - offset) * scale
 */
inline float dequantize_qasymm8_signed(char input, float offset, float scale)
{
    const float centered = (float)input - offset;
    return centered * scale;
}
5067*c217d954SCole Faust
5068*c217d954SCole Faust
/**
 * Define quantize_<type><size>(): the vector counterpart of quantize_qasymm8.
 *
 * The generated function computes input / scale + offset per lane, rounds to
 * an int vector with CONVERT_DOWN_RTE and narrows to @p type with CONVERT_SAT.
 *
 * @param type Element type of the quantized result
 * @param size Vector width
 */
#define QUANTIZE_IMPL(type, size) \
    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
    { \
        const VEC_DATA_TYPE(float, size) scaled = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
        return CONVERT_SAT(CONVERT_DOWN_RTE(scaled, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
    }
5078*c217d954SCole Faust
5079*c217d954SCole Faust
/**
 * Define dequantize_<type><size>(): the vector counterpart of dequantize_qasymm8.
 *
 * The generated function converts the input to a float vector, subtracts
 * offset and multiplies by scale.
 *
 * @param type Element type of the quantized input
 * @param size Vector width
 */
#define DEQUANTIZE_IMPL(type, size) \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    { \
        const VEC_DATA_TYPE(float, size) as_float = CONVERT(input, VEC_DATA_TYPE(float, size)); \
        return (as_float - offset) * scale; \
    }
5085*c217d954SCole Faust
5086*c217d954SCole Faust
/**
 * Define asymm_rounding_divide_by_POW2_<size>(x, exponent): per-lane division
 * of x by 2^exponent with rounding.
 *
 * The generated function shifts x right by exponent and adds 1 when the bits
 * shifted out exceed a threshold. The threshold is half of the shifted-out
 * range, raised by one for negative x.
 *
 * @param size Vector width
 */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    { \
        const VEC_DATA_TYPE(int, size) v_zero = (VEC_DATA_TYPE(int, size))0; \
        const VEC_DATA_TYPE(int, size) v_one = (VEC_DATA_TYPE(int, size))1; \
        const VEC_DATA_TYPE(int, size) low_bits = (v_one << exponent) - v_one; \
        const VEC_DATA_TYPE(int, size) neg_bias = select(v_zero, v_one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
        const VEC_DATA_TYPE(int, size) half_point = (low_bits >> 1) + neg_bias; \
        const VEC_DATA_TYPE(int, size) round_up = select(v_zero, v_one, (SELECT_VEC_DATA_TYPE(int, size))((x & low_bits) > half_point)); \
        return (x >> exponent) + round_up; \
    }
5100*c217d954SCole Faust
5101*c217d954SCole Faust
/**
 * Define asymm_mult<size>(a, b): per-lane fixed-point multiply returning the
 * rounded value of (a * b) / 2^31.
 *
 * The product is formed in 64 bits, nudged by +2^30 (non-negative product) or
 * 1 - 2^30 (negative product) and divided by 2^31. Lanes where both operands
 * are INT_MIN would overflow the 32-bit result and return INT_MAX instead.
 *
 * @param size Vector width
 */
#define ASYMM_MULT_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    { \
        VEC_DATA_TYPE(int, size) saturate = (a == b) && (a == INT_MIN); \
        VEC_DATA_TYPE(long, size) wide_a = convert_long##size(a); \
        VEC_DATA_TYPE(long, size) wide_b = convert_long##size(b); \
        VEC_DATA_TYPE(long, size) product = wide_a * wide_b; \
        VEC_DATA_TYPE(long, size) nudge_pos = 1 << 30; \
        VEC_DATA_TYPE(long, size) nudge_neg = 1 - (1 << 30); \
        VEC_DATA_TYPE(long, size) non_negative = product >= 0; \
        VEC_DATA_TYPE(long, size) nudge = select(nudge_neg, nudge_pos, (SELECT_VEC_DATA_TYPE(long, size))(non_negative)); \
        VEC_DATA_TYPE(long, size) divisor = 1ll << 31; \
        VEC_DATA_TYPE(int, size) high32 = convert_int##size((product + nudge) / divisor); \
        return select(high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(saturate)); \
    }
5128*c217d954SCole Faust
5129*c217d954SCole Faust
/**
 * Define asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>(a):
 * fixed-point exp(a) with 31 fractional bits, using a 4th-order polynomial.
 *
 * With x = a + 1/8 (1 << 28 with 31 fractional bits) the generated function
 * evaluates c + c * (x + x^2/2 + x^3/6 + x^4/24), where c = 1895147668 is
 * approximately exp(-1/8) and 715827883 is approximately 1/3, both with 31
 * fractional bits.
 *
 * @param size Vector width
 */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) exp_neg_eighth = 1895147668; \
        const VEC_DATA_TYPE(int, size) one_third = 715827883; \
        const int k_fractional_bits = 31; \
        VEC_DATA_TYPE(int, size) x = a + (1 << (k_fractional_bits - 3)); \
        VEC_DATA_TYPE(int, size) x_sq = ASYMM_MULT(x, x, size); \
        VEC_DATA_TYPE(int, size) x_cu = ASYMM_MULT(x_sq, x, size); \
        VEC_DATA_TYPE(int, size) x_qu = ASYMM_MULT(x_sq, x_sq, size); \
        VEC_DATA_TYPE(int, size) x_qu_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x_qu, 2, size); \
        VEC_DATA_TYPE(int, size) twice_tail = ASYMM_MULT((x_qu_over_4 + x_cu), one_third, size) + x_sq; \
        VEC_DATA_TYPE(int, size) tail = ASYMM_ROUNDING_DIVIDE_BY_POW2(twice_tail, 1, size); \
        return exp_neg_eighth + ASYMM_MULT(exp_neg_eighth, x + tail, size); \
    }
5152*c217d954SCole Faust
5153*c217d954SCole Faust
/**
 * Define asymm_select_using_mask<size>(if_mask, then_val, else_val): bitwise
 * select that takes the bits of then_val where if_mask is set and the bits of
 * else_val where it is clear.
 *
 * @param size Vector width
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    { \
        const VEC_DATA_TYPE(int, size) from_then = if_mask & then_val; \
        const VEC_DATA_TYPE(int, size) from_else = ~if_mask & else_val; \
        return from_then ^ from_else; \
    }
5159*c217d954SCole Faust
5160*c217d954SCole Faust
/**
 * Define asymm_mask_if_zero<size>(a): returns ~0 in the lanes selected by the
 * comparison a == 0 and 0 in the others.
 *
 * @param size Vector width
 */
#define ASYMM_MASK_IF_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) bits_clear = 0; \
        const VEC_DATA_TYPE(int, size) bits_set = ~0; \
        return select(bits_clear, bits_set, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \
    }
5168*c217d954SCole Faust
5169*c217d954SCole Faust
/**
 * Define asymm_mask_if_non_zero<size>(a): returns ~0 in the lanes selected by
 * the comparison a != 0 and 0 in the others.
 *
 * @param size Vector width
 */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) bits_clear = 0; \
        const VEC_DATA_TYPE(int, size) bits_set = ~0; \
        return select(bits_clear, bits_set, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
    }
5177*c217d954SCole Faust
/**
 * Define exp_barrel_shifter<size>(): one conditional multiply step of the
 * fixed-point exponential.
 *
 * When k_integer_bits <= exponent the step does not apply and result is
 * returned unchanged. Otherwise, lanes of remainder that have bit
 * (k_fractional_bits + exponent) set get result multiplied by fp_multiplier
 * through ASYMM_MULT; the other lanes keep result.
 *
 * @param size Vector width
 */
#define EXP_BARREL_SHIFTER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    { \
        if(k_integer_bits <= exponent) \
        { \
            return result; \
        } \
        \
        const int k_shift_amount = k_fractional_bits + exponent; \
        return ASYMM_SELECT_USING_MASK( \
                   ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
                   ASYMM_MULT(result, fp_multiplier, size), result, size); \
    }
5191*c217d954SCole Faust
5192*c217d954SCole Faust
/**
 * Define asymm_exp_on_negative_values<size>(a, k_integer_bits): fixed-point
 * exp(a), where a has k_integer_bits integer bits and 31 - k_integer_bits
 * fractional bits.
 *
 * Steps performed by the generated function:
 *  1. Split a into a part in [-1/4, 0), obtained from the low bits of a minus
 *     one quarter, and the remainder (that part minus a).
 *  2. Evaluate the polynomial approximation on the first part after shifting
 *     it left by k_integer_bits.
 *  3. For exponent = -2 .. 4, multiply by a constant (approximately
 *     exp(-2^exponent) with 31 fractional bits) in the lanes where the
 *     matching bit of the remainder is set.
 *  4. If k_integer_bits > 5, force lanes with a < -(1 << (k_fractional_bits + 5)) to 0.
 *  5. Lanes where a == 0 return INT_MAX.
 *
 * @param size Vector width
 */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
    { \
        const int k_fractional_bits = 31 - k_integer_bits; \
        VEC_DATA_TYPE(int, size) quarter = 1 << (k_fractional_bits - 2); \
        VEC_DATA_TYPE(int, size) low_mask = quarter - 1; \
        VEC_DATA_TYPE(int, size) frac = (a & low_mask) - quarter; \
        VEC_DATA_TYPE(int, size) frac_scaled = frac << k_integer_bits; \
        VEC_DATA_TYPE(int, size) acc = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(frac_scaled, size); \
        VEC_DATA_TYPE(int, size) rest = frac - a; \
        \
        acc = EXP_BARREL_SHIFTER(acc, -2, 1672461947, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, -1, 1302514674, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, +0, 790015084, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, +1, 290630308, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, +2, 39332535, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, +3, 720401, k_integer_bits, k_fractional_bits, rest, size); \
        acc = EXP_BARREL_SHIFTER(acc, +4, 242, k_integer_bits, k_fractional_bits, rest, size); \
        \
        if(k_integer_bits > 5) \
        { \
            const VEC_DATA_TYPE(int, size) lower_bound = -(1 << (k_fractional_bits + 5)); \
            acc = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < lower_bound, size), 0, acc, size); \
        } \
        \
        const VEC_DATA_TYPE(int, size) q0_one = INT_MAX; \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), q0_one, acc, size); \
    }
5227*c217d954SCole Faust
5228*c217d954SCole Faust
/* Defines asymm_saturating_rounding_mult_by_pow2<size>(x, exponent).
 *
 * exponent < 0  : forwards to ASYMM_ROUNDING_DIVIDE_BY_POW2 with -exponent.
 * exponent >= 0 : returns x << exponent, except that lanes with
 *                 x > threshold become INT_MAX and lanes with x < -threshold
 *                 become INT_MIN, where threshold = 2^(31 - exponent) - 1.
 *
 * The threshold is evaluated in unsigned arithmetic and cast back to int.
 * With a signed literal, exponent == 0 (reachable through asymm_rescale when
 * source and destination integer bits are equal) evaluates (1 << 31) - 1,
 * which overflows a signed int. The unsigned form yields INT_MAX for that
 * case and the same value as before for every other exponent.
 */
5229*c217d954SCole Faust#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
5230*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
5231*c217d954SCole Faust    {                                                                                                                      \
5232*c217d954SCole Faust        if(exponent < 0)                                                                                                   \
5233*c217d954SCole Faust        {                                                                                                                  \
5234*c217d954SCole Faust            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
5235*c217d954SCole Faust        }                                                                                                                  \
5236*c217d954SCole Faust        \
5237*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
5238*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
5239*c217d954SCole Faust        int threshold = (int)((1u << (31 - exponent)) - 1u);                                                               \
5240*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
5241*c217d954SCole Faust        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
5242*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
5243*c217d954SCole Faust        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
5244*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                                           \
5245*c217d954SCole Faust        result = x << exponent;                                                                                            \
5246*c217d954SCole Faust        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
5247*c217d954SCole Faust        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
5248*c217d954SCole Faust        return result;                                                                                                     \
5249*c217d954SCole Faust    }
5250*c217d954SCole Faust
5251*c217d954SCole Faust
/* Defines asymm_rounding_half_sum<size>(a, b).
 *
 * Both operands are widened to long so that a + b cannot overflow. The sum is
 * then nudged by +1 when it is >= 0 and by -1 otherwise, divided by 2 and
 * converted back to int. Because integer division truncates toward zero, the
 * nudge makes exact halves round away from zero.
 */
5252*c217d954SCole Faust#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
5253*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
5254*c217d954SCole Faust    {                                                                                                                     \
5255*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                                         \
5256*c217d954SCole Faust        a64 = convert_long##size(a);                                                                                      \
5257*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                                         \
5258*c217d954SCole Faust        b64 = convert_long##size(b);                                                                                      \
5259*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                                         \
5260*c217d954SCole Faust        sum = a64 + b64;                                                                                                  \
5261*c217d954SCole Faust        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
5262*c217d954SCole Faust        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
5263*c217d954SCole Faust        VEC_DATA_TYPE(long, size)                                                                                         \
5264*c217d954SCole Faust        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
5265*c217d954SCole Faust        return convert_int##size((sum + sign) / 2);                                                                       \
5266*c217d954SCole Faust    }
5267*c217d954SCole Faust
5268*c217d954SCole Faust
/* Defines asymm_one_over_one_plus_x_for_x_in_0_1<size>(a).
 *
 * Steps visible in the body:
 *   1. half_denominator = rounding half-sum of a and Q0_one (INT_MAX).
 *   2. Initial estimate x = Q2_48_over_17 + half_denominator * Q2_neg_32_over_17.
 *      The two constants equal 48/17 and -32/17 scaled by 2^29.
 *   3. Three refinement passes: x += 4 * x * (Q2_one - half_denominator * x),
 *      where the factor 4 is the saturating multiply by 2^2.
 *   4. The result is x multiplied by 2 with saturation.
 *
 * NOTE(review): the shape matches a Newton-Raphson reciprocal refinement as
 * used in gemmlowp's fixed-point library; confirm against that reference
 * before relying on the exact fixed-point formats of a and the return value.
 */
5269*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size)                                                    \
5270*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
5271*c217d954SCole Faust    {                                                                                                        \
5272*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
5273*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
5274*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
5275*c217d954SCole Faust        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
5276*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
5277*c217d954SCole Faust        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
5278*c217d954SCole Faust        VEC_DATA_TYPE(int, size)                                                                             \
5279*c217d954SCole Faust        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
5280*c217d954SCole Faust        for(int i = 0; i < 3; i++)                                                                           \
5281*c217d954SCole Faust        {                                                                                                    \
5282*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
5283*c217d954SCole Faust            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
5284*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
5285*c217d954SCole Faust            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;                          \
5286*c217d954SCole Faust            VEC_DATA_TYPE(int, size)                                                                         \
5287*c217d954SCole Faust            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size);                                   \
5288*c217d954SCole Faust            x   = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size);                                  \
5289*c217d954SCole Faust        }                                                                                                    \
5290*c217d954SCole Faust        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size);                                           \
5291*c217d954SCole Faust    }
5292*c217d954SCole Faust
5293*c217d954SCole Faust
/* Defines asymm_rescale<size>(value, src_integer_bits, dst_integer_bits).
 *
 * Multiplies value by 2^(src_integer_bits - dst_integer_bits) through
 * ASYMM_SATURATING_ROUNDING_MULT_BY_POW2; a negative difference therefore
 * becomes a rounding division and a positive one a saturating left shift.
 */
5294*c217d954SCole Faust#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
5295*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
5296*c217d954SCole Faust    {                                                                                                                               \
5297*c217d954SCole Faust        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
5298*c217d954SCole Faust        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
5299*c217d954SCole Faust    }
5300*c217d954SCole Faust
/* Call-site wrappers for the helper functions.
 *
 * Each macro pastes `size` (and `type` for QUANTIZE / DEQUANTIZE) onto a
 * function name and forwards the remaining arguments. Where a NAME / NAME_STR
 * pair exists, the outer macro only forwards to the _STR one so that macro
 * arguments are fully expanded before the ## paste happens.
 */
5301*c217d954SCole Faust#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
5302*c217d954SCole Faust#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
5303*c217d954SCole Faust#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
5304*c217d954SCole Faust#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
5305*c217d954SCole Faust
5306*c217d954SCole Faust#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
5307*c217d954SCole Faust#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
5308*c217d954SCole Faust#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
5309*c217d954SCole Faust#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
/* Pre-scales x by 2^(-left_shift) before the fixed-point multiply.
 * NOTE(review): left_shift is negated here, so callers presumably pass a
 * non-positive value - confirm against the call sites. */
5310*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
5311*c217d954SCole Faust    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
/* Fixed-point multiply followed by a rounding division by 2^right_shift. */
5312*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
5313*c217d954SCole Faust    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
5314*c217d954SCole Faust#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
5315*c217d954SCole Faust#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
5316*c217d954SCole Faust#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
5317*c217d954SCole Faust#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
5318*c217d954SCole Faust#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
5319*c217d954SCole Faust#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
5320*c217d954SCole Faust#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
5321*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
5322*c217d954SCole Faust#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
5323*c217d954SCole Faust#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
5324*c217d954SCole Faust#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
5325*c217d954SCole Faust#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
5326*c217d954SCole Faust#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
5327*c217d954SCole Faust
/* Defines multiply_by_quantized_multiplier<size>(input, qmul, shift).
 *
 * A positive shift is applied as a left shift (multiplication by 1 << shift)
 * before ASYMM_MULT; a zero or negative shift is applied afterwards as a
 * rounding division by 2^(-shift). Exactly one of the two shifts is non-zero.
 * MULTIPLY_BY_QUANTIZED_MULTIPLIER is the call-site wrapper that pastes the
 * vector size onto the function name.
 */
5328*c217d954SCole Faust#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
5329*c217d954SCole Faust    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
5330*c217d954SCole Faust    {                                                                                                                           \
5331*c217d954SCole Faust        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
5332*c217d954SCole Faust        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
5333*c217d954SCole Faust        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
5334*c217d954SCole Faust    }
5335*c217d954SCole Faust#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
5336*c217d954SCole Faust
/* Instantiates the quantize_<type><size> helpers for vector widths
 * 1, 2, 3, 4, 8 and 16.
 * NOTE(review): width 4 is instantiated for uchar/ushort/short/int, whereas
 * widths 1, 2, 3 and 8 use uchar/char/uint/int - confirm that the missing
 * char/uint variants at width 4 are intentional. */
5337*c217d954SCole FaustQUANTIZE_IMPL(uchar, 1)
5338*c217d954SCole FaustQUANTIZE_IMPL(char, 1)
5339*c217d954SCole FaustQUANTIZE_IMPL(uint, 1)
5340*c217d954SCole FaustQUANTIZE_IMPL(int, 1)
5341*c217d954SCole FaustQUANTIZE_IMPL(uchar, 2)
5342*c217d954SCole FaustQUANTIZE_IMPL(char, 2)
5343*c217d954SCole FaustQUANTIZE_IMPL(uint, 2)
5344*c217d954SCole FaustQUANTIZE_IMPL(int, 2)
5345*c217d954SCole FaustQUANTIZE_IMPL(uchar, 3)
5346*c217d954SCole FaustQUANTIZE_IMPL(char, 3)
5347*c217d954SCole FaustQUANTIZE_IMPL(uint, 3)
5348*c217d954SCole FaustQUANTIZE_IMPL(int, 3)
5349*c217d954SCole FaustQUANTIZE_IMPL(uchar, 4)
5350*c217d954SCole FaustQUANTIZE_IMPL(ushort, 4)
5351*c217d954SCole FaustQUANTIZE_IMPL(short, 4)
5352*c217d954SCole FaustQUANTIZE_IMPL(int, 4)
5353*c217d954SCole FaustQUANTIZE_IMPL(uchar, 8)
5354*c217d954SCole FaustQUANTIZE_IMPL(char, 8)
5355*c217d954SCole FaustQUANTIZE_IMPL(uint, 8)
5356*c217d954SCole FaustQUANTIZE_IMPL(int, 8)
5357*c217d954SCole FaustQUANTIZE_IMPL(uchar, 16)
5358*c217d954SCole FaustQUANTIZE_IMPL(char, 16)
5359*c217d954SCole FaustQUANTIZE_IMPL(ushort, 16)
5360*c217d954SCole FaustQUANTIZE_IMPL(short, 16)
5361*c217d954SCole FaustQUANTIZE_IMPL(uint, 16)
5362*c217d954SCole FaustQUANTIZE_IMPL(int, 16)
5363*c217d954SCole Faust
/* Instantiates the dequantize_<type><size> helpers for vector widths
 * 1, 2, 3, 4, 8 and 16, with the same type/width combinations as the
 * QUANTIZE_IMPL list.
 * NOTE(review): width 4 again uses uchar/ushort/short/int rather than
 * uchar/char/uint/int - confirm this is intentional. */
5364*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 1)
5365*c217d954SCole FaustDEQUANTIZE_IMPL(char, 1)
5366*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 1)
5367*c217d954SCole FaustDEQUANTIZE_IMPL(int, 1)
5368*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 2)
5369*c217d954SCole FaustDEQUANTIZE_IMPL(char, 2)
5370*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 2)
5371*c217d954SCole FaustDEQUANTIZE_IMPL(int, 2)
5372*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 3)
5373*c217d954SCole FaustDEQUANTIZE_IMPL(char, 3)
5374*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 3)
5375*c217d954SCole FaustDEQUANTIZE_IMPL(int, 3)
5376*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 4)
5377*c217d954SCole FaustDEQUANTIZE_IMPL(ushort, 4)
5378*c217d954SCole FaustDEQUANTIZE_IMPL(short, 4)
5379*c217d954SCole FaustDEQUANTIZE_IMPL(int, 4)
5380*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 8)
5381*c217d954SCole FaustDEQUANTIZE_IMPL(char, 8)
5382*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 8)
5383*c217d954SCole FaustDEQUANTIZE_IMPL(int, 8)
5384*c217d954SCole FaustDEQUANTIZE_IMPL(uchar, 16)
5385*c217d954SCole FaustDEQUANTIZE_IMPL(char, 16)
5386*c217d954SCole FaustDEQUANTIZE_IMPL(ushort, 16)
5387*c217d954SCole FaustDEQUANTIZE_IMPL(short, 16)
5388*c217d954SCole FaustDEQUANTIZE_IMPL(uint, 16)
5389*c217d954SCole FaustDEQUANTIZE_IMPL(int, 16)
5390*c217d954SCole Faust
/* Instantiates every fixed-point helper defined by the *_IMPL macros for
 * vector widths 1, 2, 3, 4, 8 and 16. Each line expands to one inline
 * function whose name ends in the given width; the call-site wrapper macros
 * select the matching instance by pasting the same width. */
5391*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
5392*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
5393*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
5394*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
5395*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
5396*c217d954SCole FaustASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
5397*c217d954SCole Faust
5398*c217d954SCole FaustASYMM_MULT_IMPL(1)
5399*c217d954SCole FaustASYMM_MULT_IMPL(2)
5400*c217d954SCole FaustASYMM_MULT_IMPL(3)
5401*c217d954SCole FaustASYMM_MULT_IMPL(4)
5402*c217d954SCole FaustASYMM_MULT_IMPL(8)
5403*c217d954SCole FaustASYMM_MULT_IMPL(16)
5404*c217d954SCole Faust
5405*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
5406*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
5407*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
5408*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
5409*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
5410*c217d954SCole FaustASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
5411*c217d954SCole Faust
5412*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(1)
5413*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(2)
5414*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(3)
5415*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(4)
5416*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(8)
5417*c217d954SCole FaustASYMM_SELECT_USING_MASK_IMPL(16)
5418*c217d954SCole Faust
5419*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(1)
5420*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(2)
5421*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(3)
5422*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(4)
5423*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(8)
5424*c217d954SCole FaustASYMM_MASK_IF_ZERO_IMPL(16)
5425*c217d954SCole Faust
5426*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(1)
5427*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(2)
5428*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(3)
5429*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(4)
5430*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(8)
5431*c217d954SCole FaustASYMM_MASK_IF_NON_ZERO_IMPL(16)
5432*c217d954SCole Faust
5433*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(1)
5434*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(2)
5435*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(3)
5436*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(4)
5437*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(8)
5438*c217d954SCole FaustEXP_BARREL_SHIFTER_IMPL(16)
5439*c217d954SCole Faust
5440*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
5441*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
5442*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
5443*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
5444*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
5445*c217d954SCole FaustASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
5446*c217d954SCole Faust
5447*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
5448*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
5449*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
5450*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
5451*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
5452*c217d954SCole FaustASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)
5453*c217d954SCole Faust
5454*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(1)
5455*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(2)
5456*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(3)
5457*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(4)
5458*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(8)
5459*c217d954SCole FaustASYMM_ROUNDING_HALF_SUM_IMPL(16)
5460*c217d954SCole Faust
5461*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
5462*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
5463*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
5464*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
5465*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
5466*c217d954SCole FaustASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
5467*c217d954SCole Faust
5468*c217d954SCole FaustASYMM_RESCALE_IMPL(1)
5469*c217d954SCole FaustASYMM_RESCALE_IMPL(2)
5470*c217d954SCole FaustASYMM_RESCALE_IMPL(3)
5471*c217d954SCole FaustASYMM_RESCALE_IMPL(4)
5472*c217d954SCole FaustASYMM_RESCALE_IMPL(8)
5473*c217d954SCole FaustASYMM_RESCALE_IMPL(16)
5474*c217d954SCole Faust
5475*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
5476*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
5477*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
5478*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
5479*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
5480*c217d954SCole FaustMULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
5481*c217d954SCole Faust
5482*c217d954SCole Faust#endif
5483*c217d954SCole Faust
5484*c217d954SCole Faust#ifndef ARM_COMPUTE_REPEAT_H
5485*c217d954SCole Faust#define ARM_COMPUTE_REPEAT_H
5486*c217d954SCole Faust
5487*c217d954SCole Faust
5488*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H
5489*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H
5490*c217d954SCole Faust
5491*c217d954SCole Faust
5492*c217d954SCole Faust
5493*c217d954SCole Faust
/* STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..16.
 *
 * Expands to n VSTORE(N0) statements. Row i stores the variable BASENAME##i
 * at address PTR + i * STRIDE_Y + Z##i, cast to __global DATA_TYPE *. The
 * pasted row suffix is a single hex digit (0-9, then A-F for rows 10-15).
 * Each macro first expands its predecessor and then appends one more row.
 *
 * NOTE(review): this copy sits under #ifndef ARM_COMPUTE_HELPER_H, a guard
 * that the top of the file already defines; unless it is undefined in
 * between, the preprocessor skips this copy - confirm.
 */
5494*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5495*c217d954SCole Faust    VSTORE(N0)                                                 \
5496*c217d954SCole Faust    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5497*c217d954SCole Faust
5498*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5499*c217d954SCole Faust    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5500*c217d954SCole Faust    VSTORE(N0)                                                 \
5501*c217d954SCole Faust    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5502*c217d954SCole Faust
5503*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5504*c217d954SCole Faust    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5505*c217d954SCole Faust    VSTORE(N0)                                                 \
5506*c217d954SCole Faust    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5507*c217d954SCole Faust
5508*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5509*c217d954SCole Faust    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5510*c217d954SCole Faust    VSTORE(N0)                                                 \
5511*c217d954SCole Faust    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5512*c217d954SCole Faust
5513*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5514*c217d954SCole Faust    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5515*c217d954SCole Faust    VSTORE(N0)                                                 \
5516*c217d954SCole Faust    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5517*c217d954SCole Faust
5518*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5519*c217d954SCole Faust    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5520*c217d954SCole Faust    VSTORE(N0)                                                 \
5521*c217d954SCole Faust    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5522*c217d954SCole Faust
5523*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5524*c217d954SCole Faust    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5525*c217d954SCole Faust    VSTORE(N0)                                                 \
5526*c217d954SCole Faust    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5527*c217d954SCole Faust
5528*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5529*c217d954SCole Faust    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5530*c217d954SCole Faust    VSTORE(N0)                                                 \
5531*c217d954SCole Faust    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5532*c217d954SCole Faust
5533*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5534*c217d954SCole Faust    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5535*c217d954SCole Faust    VSTORE(N0)                                                 \
5536*c217d954SCole Faust    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5537*c217d954SCole Faust
5538*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5539*c217d954SCole Faust    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5540*c217d954SCole Faust    VSTORE(N0)                                                  \
5541*c217d954SCole Faust    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5542*c217d954SCole Faust
5543*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5544*c217d954SCole Faust    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5545*c217d954SCole Faust    VSTORE(N0)                                                  \
5546*c217d954SCole Faust    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
5547*c217d954SCole Faust
5548*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5549*c217d954SCole Faust    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5550*c217d954SCole Faust    VSTORE(N0)                                                  \
5551*c217d954SCole Faust    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
5552*c217d954SCole Faust
5553*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5554*c217d954SCole Faust    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5555*c217d954SCole Faust    VSTORE(N0)                                                  \
5556*c217d954SCole Faust    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
5557*c217d954SCole Faust
5558*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5559*c217d954SCole Faust    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5560*c217d954SCole Faust    VSTORE(N0)                                                  \
5561*c217d954SCole Faust    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
5562*c217d954SCole Faust
5563*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5564*c217d954SCole Faust    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5565*c217d954SCole Faust    VSTORE(N0)                                                  \
5566*c217d954SCole Faust    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
5567*c217d954SCole Faust
5568*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5569*c217d954SCole Faust    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5570*c217d954SCole Faust    VSTORE(N0)                                                  \
5571*c217d954SCole Faust    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5572*c217d954SCole Faust
5573*c217d954SCole Faust
5574*c217d954SCole Faust
/* CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), n = 1..9.
 *
 * Same layout as the STORE_ROW_n family, except that every row is passed
 * through CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) before being handed to
 * VSTORE(N0). Row i stores BASENAME##i at PTR + i * STRIDE_Y + Z##i, and each
 * macro expands its predecessor before appending its own row.
 */
5575*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5576*c217d954SCole Faust    VSTORE(N0)                                                         \
5577*c217d954SCole Faust    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
5578*c217d954SCole Faust
5579*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5580*c217d954SCole Faust    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5581*c217d954SCole Faust    VSTORE(N0)                                                         \
5582*c217d954SCole Faust    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
5583*c217d954SCole Faust
5584*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5585*c217d954SCole Faust    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5586*c217d954SCole Faust    VSTORE(N0)                                                         \
5587*c217d954SCole Faust    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
5588*c217d954SCole Faust
5589*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5590*c217d954SCole Faust    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5591*c217d954SCole Faust    VSTORE(N0)                                                         \
5592*c217d954SCole Faust    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
5593*c217d954SCole Faust
5594*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5595*c217d954SCole Faust    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5596*c217d954SCole Faust    VSTORE(N0)                                                         \
5597*c217d954SCole Faust    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
5598*c217d954SCole Faust
5599*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5600*c217d954SCole Faust    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5601*c217d954SCole Faust    VSTORE(N0)                                                         \
5602*c217d954SCole Faust    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
5603*c217d954SCole Faust
5604*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5605*c217d954SCole Faust    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5606*c217d954SCole Faust    VSTORE(N0)                                                         \
5607*c217d954SCole Faust    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
5608*c217d954SCole Faust
5609*c217d954SCole Faust#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5610*c217d954SCole Faust    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5611*c217d954SCole Faust    VSTORE(N0)                                                         \
5612*c217d954SCole Faust    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
5613*c217d954SCole Faust
5614*c217d954SCole Faust#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5615*c217d954SCole Faust    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
5616*c217d954SCole Faust    VSTORE(N0)                                                         \
5617*c217d954SCole Faust    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
5618*c217d954SCole Faust
/* CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z).
 *
 * Expands CONVERT_STORE_ROW_9 and then stores row 9: BASENAME##9 is converted
 * with CONVERT_SAT to VEC_DATA_TYPE(DATA_TYPE, N0) and written with VSTORE(N0)
 * at PTR + 9 * STRIDE_Y + Z##9.
 *
 * The second parameter is named DATA_TYPE like in every sibling macro. It was
 * previously spelled DATA while the body referred to DATA_TYPE, so the
 * caller's argument was ignored and the expansion picked up whatever
 * DATA_TYPE happened to mean at the point of use.
 */
5619*c217d954SCole Faust#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
5620*c217d954SCole Faust    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
5621*c217d954SCole Faust    VSTORE(N0)                                                          \
5622*c217d954SCole Faust    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
5623*c217d954SCole Faust
/**
 * CONVERT_STORE_ROW_11 .. CONVERT_STORE_ROW_16
 *
 * Same scheme as the lower-numbered CONVERT_STORE_ROW_n macros: expand
 * CONVERT_STORE_ROW_(n-1), then convert row n-1 with
 * CONVERT_SAT(value, VEC_DATA_TYPE(DATA_TYPE, N0)) and hand it to VSTORE(N0)
 * with offset 0 and the address PTR + (n-1) * STRIDE_Y + Z##<suffix>.
 *
 * Rows 10..15 use the single-character suffixes A..F on both BASENAME and Z
 * (BASENAME##A is row 10, BASENAME##F is row 15), while the numeric multiplier
 * of STRIDE_Y stays decimal.
 */
#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
               (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5653*c217d954SCole Faust
5654*c217d954SCole Faust
5655*c217d954SCole Faust
5656*c217d954SCole Faust
/**
 * STORE_BLOCK: dispatch to STORE_ROW_<M0>.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before
 * it is pasted onto STORE_ROW_; all remaining arguments are passed through
 * unchanged.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5659*c217d954SCole Faust
5660*c217d954SCole Faust
5661*c217d954SCole Faust
/**
 * CONVERT_STORE_BLOCK: dispatch to CONVERT_STORE_ROW_<M0>.
 *
 * The extra CONVERT_STORE_BLOCK_STR level lets M0 be macro-expanded before it
 * is pasted onto CONVERT_STORE_ROW_; all remaining arguments are forwarded
 * unchanged.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5664*c217d954SCole Faust
5665*c217d954SCole Faust
5666*c217d954SCole Faust
/**
 * STORE_ROW_PARTIAL_1 .. STORE_ROW_PARTIAL_16
 *
 * STORE_ROW_PARTIAL_n expands STORE_ROW_PARTIAL_(n-1) (for n > 1) and then
 * emits the store for row n-1: BASENAME##<suffix> is handed to
 * VSTORE_PARTIAL(N0, STORE_N0) with offset 0 and the address
 * PTR + (n-1) * STRIDE_Y + Z##<suffix>, cast to __global DATA_TYPE *.
 * The row suffix is 0..9 for rows 0..9 and A..F for rows 10..15.
 *
 * N0        - first argument forwarded to VSTORE_PARTIAL
 * STORE_N0  - second argument forwarded to VSTORE_PARTIAL
 * DATA_TYPE - element type used for the pointer cast
 * BASENAME  - prefix of the per-row variables
 * PTR       - base address used in the address arithmetic
 * STRIDE_Y  - multiplier applied to the row index
 * Z         - prefix of the per-row extra offsets
 *
 * VSTORE_PARTIAL is not defined in this section.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
5745*c217d954SCole Faust
5746*c217d954SCole Faust
5747*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL: dispatch to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * STORE_M0 selects which STORE_ROW_PARTIAL_n macro is used; the extra
 * STORE_BLOCK_PARTIAL_STR level lets STORE_M0 be macro-expanded before it is
 * pasted. Note the argument order changes on the way down: this macro takes
 * (STORE_M0, STORE_N0, N0, ...) while STORE_ROW_PARTIAL_n receives
 * (N0, STORE_N0, ...).
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
5750*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_X_AND_Y: store a block that may be partial in both
 * dimensions, choosing the block shape at run time.
 *
 *   PARTIAL_COND_X  PARTIAL_COND_Y   rows stored        columns stored
 *   false           false            M0                 N0
 *   false           true             PARTIAL_STORE_M0   N0
 *   true            false            M0                 PARTIAL_STORE_N0
 *   true            true             PARTIAL_STORE_M0   PARTIAL_STORE_N0
 *
 * Every case expands STORE_BLOCK_PARTIAL(rows, columns, N0, DATA_TYPE,
 * BASENAME, PTR, STRIDE_Y, Z).
 *
 * The two conditions are tested in a nested if/else so that each of them is
 * evaluated exactly once (PARTIAL_COND_X first, then PARTIAL_COND_Y). A flat
 * if / else-if chain would re-evaluate the condition expressions for every
 * branch it falls through. The selected branch is the same either way.
 *
 * The expansion is a single if/else statement without a trailing semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(PARTIAL_COND_X)                                                                                                                                    \
    {                                                                                                                                                     \
        if(PARTIAL_COND_Y)                                                                                                                                \
        {                                                                                                                                                 \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                           \
        }                                                                                                                                                 \
        else                                                                                                                                              \
        {                                                                                                                                                 \
            STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                         \
        }                                                                                                                                                 \
    }                                                                                                                                                     \
    else                                                                                                                                                  \
    {                                                                                                                                                     \
        if(PARTIAL_COND_Y)                                                                                                                                \
        {                                                                                                                                                 \
            STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                         \
        }                                                                                                                                                 \
        else                                                                                                                                              \
        {                                                                                                                                                 \
            STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                       \
        }                                                                                                                                                 \
    }
5768*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_X: store a block that may be partial in the column
 * dimension only.
 *
 * When PARTIAL_COND_X holds, M0 rows of PARTIAL_STORE_N0 columns are stored;
 * otherwise the full M0 x N0 block is stored. Both cases go through
 * STORE_BLOCK_PARTIAL with N0 as the vector width argument.
 *
 * The expansion is a single if/else statement without a trailing semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(PARTIAL_COND_X) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
5778*c217d954SCole Faust
/**
 * STORE_BLOCK_PARTIAL_IN_Y: store a block that may be partial in the row
 * dimension only.
 *
 * When PARTIAL_COND_Y holds, PARTIAL_STORE_M0 rows of N0 columns are stored;
 * otherwise the full M0 x N0 block is stored. Both cases go through
 * STORE_BLOCK_PARTIAL with N0 as the vector width argument.
 *
 * The expansion is a single if/else statement without a trailing semicolon.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(PARTIAL_COND_Y) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }
5788*c217d954SCole Faust
5789*c217d954SCole Faust
/**
 * STORE_BLOCK_BOUNDARY_AWARE: pick the cheapest store variant at preprocessing
 * time from the build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0.
 *
 *   PARTIAL_STORE_M0 == 0, PARTIAL_STORE_N0 == 0 -> STORE_BLOCK
 *   PARTIAL_STORE_M0 == 0, PARTIAL_STORE_N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   PARTIAL_STORE_M0 >  0, PARTIAL_STORE_N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   anything else                                -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 *
 * The macro is only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0
 * are defined. Every variant has the same parameter list, so call sites do not
 * depend on which one was selected; arguments a variant does not need are
 * simply dropped.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, \
                                   PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, \
                                   PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, \
                                   PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, \
                                   PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, \
                                   PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif
5816*c217d954SCole Faust
5817*c217d954SCole Faust
/**
 * COMPUTE_M0_START_ROW: first row handled by the work-item with row index y.
 *
 * With PARTIAL_STORE_M0 defined at build time the result is
 *     max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0))
 * i.e. the nominal start row y * M0 is moved back by
 * (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at zero. Without it the result
 * is simply y * M0 and the third argument is ignored. Both forms cast the
 * result to uint.
 *
 * y                - row index of the work-item
 * M0               - number of rows per block
 * PARTIAL_STORE_M0 - number of rows of the partial block
 *
 * Every argument is parenthesized in the expansion so that expression
 * arguments (for example "a + b" as y, or "M0 % 4" as the partial size) keep
 * their intended grouping.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif
5826*c217d954SCole Faust
5827*c217d954SCole Faust
5828*c217d954SCole Faust
/**
 * STORE_VECTOR_SELECT: store a single vector, either completely or only its
 * first elements.
 *
 * Thin wrapper over STORE_BLOCK_PARTIAL_IN_X with one row, a stride of 0 and a
 * Z prefix of 0: when cond holds, leftover elements are stored, otherwise
 * vec_size elements.
 *
 * basename  - prefix of the variable to store (row suffix 0 is appended)
 * data_type - element type of the destination
 * ptr       - destination address
 * vec_size  - full vector width
 * leftover  - number of elements stored when cond holds
 * cond      - selects the partial store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, \
                             0, 0, leftover, cond)
5831*c217d954SCole Faust
5832*c217d954SCole Faust
/**
 * Optional OpenCL extensions.
 *
 * Each extension is enabled only when the corresponding ARM_COMPUTE_* switch
 * is defined AND the compiler advertises the extension through its feature
 * macro, so builds on compilers without the extension are unaffected.
 */

/* cl_khr_fp16 */
#ifdef ARM_COMPUTE_OPENCL_FP16_ENABLED
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#endif

/* cl_arm_integer_dot_product_int8 */
#ifdef ARM_COMPUTE_OPENCL_DOT8_ENABLED
#ifdef cl_arm_integer_dot_product_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif
#endif

/* cl_arm_integer_dot_product_accumulate_int8 */
#ifdef ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED
#ifdef cl_arm_integer_dot_product_accumulate_int8
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif
#endif

/* cl_arm_printf, debug builds only */
#ifdef ARM_COMPUTE_DEBUG_ENABLED
#ifdef cl_arm_printf
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif
#endif
5848*c217d954SCole Faust
/**
 * Numeric tags for the GPU architecture families named Midgard, Bifrost and
 * Valhall. They stay preprocessor constants so they remain usable in #if
 * expressions; the code that compares against them is outside this section.
 */
#define GPU_ARCH_MIDGARD    0x100
#define GPU_ARCH_BIFROST    0x200
#define GPU_ARCH_VALHALL    0x300
5852*c217d954SCole Faust
5853*c217d954SCole Faust
/** Paste two tokens together. The arguments are NOT macro-expanded first. */
#define CONCAT(lhs, rhs) lhs##rhs

/** Identity macro: yields its (macro-expanded) argument unchanged. */
#define EXPAND(tokens) tokens

/**
 * Limit value to the range [lo, hi] by applying max(value, lo) and then
 * min(..., hi). Relies on min and max being available where it is used.
 */
#define CLAMP(value, lo, hi) min(max(value, lo), hi)
5861*c217d954SCole Faust
5862*c217d954SCole Faust
/**
 * REV<s>(v): v with its s components in reverse order, written as a component
 * selection (.s10, .s210, ...). REV1 is the identity. Provided for the widths
 * 1, 2, 3, 4, 8 and 16.
 */
#define REV1(v)  ((v))
#define REV2(v)  ((v).s10)
#define REV3(v)  ((v).s210)
#define REV4(v)  ((v).s3210)
#define REV8(v)  ((v).s76543210)
#define REV16(v) ((v).sFEDCBA9876543210)

/**
 * REVERSE(x, s): reverse the s components of x by dispatching to REV<s>.
 * REVERSE_STR is the pasting step; going through REVERSE first lets s be a
 * macro that expands to the width.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s)     REVERSE_STR(x, s)
5874*c217d954SCole Faust
5875*c217d954SCole Faust
5876*c217d954SCole Faust
/**
 * ROT<s>_<n>(v): v with its s components rotated by n positions, written as a
 * component selection. Component i of the result is component (i - n) mod s of
 * v, e.g. ROT4_1 selects .s3012. n == 0 and n == s are the identity. Provided
 * for the widths 1, 2, 3, 4, 8 and 16.
 */
#define ROT1_0(v) ((v))
#define ROT1_1(v) ((v))

#define ROT2_0(v) ((v))
#define ROT2_1(v) ((v).s10)
#define ROT2_2(v) ((v))

#define ROT3_0(v) ((v))
#define ROT3_1(v) ((v).s201)
#define ROT3_2(v) ((v).s120)
#define ROT3_3(v) ((v))

#define ROT4_0(v) ((v))
#define ROT4_1(v) ((v).s3012)
#define ROT4_2(v) ((v).s2301)
#define ROT4_3(v) ((v).s1230)
#define ROT4_4(v) ((v))

#define ROT8_0(v) ((v))
#define ROT8_1(v) ((v).s70123456)
#define ROT8_2(v) ((v).s67012345)
#define ROT8_3(v) ((v).s56701234)
#define ROT8_4(v) ((v).s45670123)
#define ROT8_5(v) ((v).s34567012)
#define ROT8_6(v) ((v).s23456701)
#define ROT8_7(v) ((v).s12345670)
#define ROT8_8(v) ((v))

#define ROT16_0(v)  ((v))
#define ROT16_1(v)  ((v).sF0123456789ABCDE)
#define ROT16_2(v)  ((v).sEF0123456789ABCD)
#define ROT16_3(v)  ((v).sDEF0123456789ABC)
#define ROT16_4(v)  ((v).sCDEF0123456789AB)
#define ROT16_5(v)  ((v).sBCDEF0123456789A)
#define ROT16_6(v)  ((v).sABCDEF0123456789)
#define ROT16_7(v)  ((v).s9ABCDEF012345678)
#define ROT16_8(v)  ((v).s89ABCDEF01234567)
#define ROT16_9(v)  ((v).s789ABCDEF0123456)
#define ROT16_10(v) ((v).s6789ABCDEF012345)
#define ROT16_11(v) ((v).s56789ABCDEF01234)
#define ROT16_12(v) ((v).s456789ABCDEF0123)
#define ROT16_13(v) ((v).s3456789ABCDEF012)
#define ROT16_14(v) ((v).s23456789ABCDEF01)
#define ROT16_15(v) ((v).s123456789ABCDEF0)
#define ROT16_16(v) ((v))

/**
 * ROTATE(x, s, n): rotate the s components of x by n positions by dispatching
 * to ROT<s>_<n>. ROTATE_STR is the pasting step; going through ROTATE first
 * lets s and n be macros that expand to the numbers.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)
5927*c217d954SCole Faust
5928*c217d954SCole Faust
5929*c217d954SCole Faust
/**
 * V_OFFS<s>(dt): a vector literal of type dt##s holding the ascending sequence
 * 0, 1, ..., s - 1. Provided for the widths 1, 2, 3, 4, 8 and 16. The width-1
 * form relies on a type named dt##1 (for example int1) being available.
 */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/**
 * VEC_OFFS(dt, s): dispatch to V_OFFS<s>(dt). VEC_OFFS_STR is the pasting
 * step; going through VEC_OFFS first lets s be a macro that expands to the
 * width.
 */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)
5941*c217d954SCole Faust
5942*c217d954SCole Faust
/**
 * VLOAD(size): name of the load function for the given width, i.e. the token
 * vload<size>. The VLOAD_STR level does the pasting so that size may be a
 * macro that expands to the width.
 */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/**
 * VLOAD_PARTIAL(size, load_size): name of the partial-load macro
 * vload_partial_<size>_<load_size>. The VLOAD_PARTIAL_STR level does the
 * pasting so that both arguments may be macros that expand to numbers.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)
5949*c217d954SCole Faust
/**
 * NO_LOAD(data, offs, ptr): load placeholder that ignores all three arguments
 * and expands to an empty block.
 */
#define NO_LOAD(data, offs, ptr) { }
5953*c217d954SCole Faust
5954*c217d954SCole Faust
/**
 * Dispatch table vload_partial_<size>_<load_size>.
 *
 * <size> is the vector width (1, 2, 3, 4, 8 or 16) and <load_size> the number
 * of elements to load (0..16). An entry maps to
 *   - vload_partial_<load_size> when 1 <= load_size <= size
 *     (the 1_1 entry maps to vload1 instead),
 *   - NO_LOAD when load_size is 0 or larger than size.
 */

/* size 1 */
#define vload_partial_1_0   NO_LOAD
#define vload_partial_1_1   vload1
#define vload_partial_1_2   NO_LOAD
#define vload_partial_1_3   NO_LOAD
#define vload_partial_1_4   NO_LOAD
#define vload_partial_1_5   NO_LOAD
#define vload_partial_1_6   NO_LOAD
#define vload_partial_1_7   NO_LOAD
#define vload_partial_1_8   NO_LOAD
#define vload_partial_1_9   NO_LOAD
#define vload_partial_1_10  NO_LOAD
#define vload_partial_1_11  NO_LOAD
#define vload_partial_1_12  NO_LOAD
#define vload_partial_1_13  NO_LOAD
#define vload_partial_1_14  NO_LOAD
#define vload_partial_1_15  NO_LOAD
#define vload_partial_1_16  NO_LOAD

/* size 2 */
#define vload_partial_2_0   NO_LOAD
#define vload_partial_2_1   vload_partial_1
#define vload_partial_2_2   vload_partial_2
#define vload_partial_2_3   NO_LOAD
#define vload_partial_2_4   NO_LOAD
#define vload_partial_2_5   NO_LOAD
#define vload_partial_2_6   NO_LOAD
#define vload_partial_2_7   NO_LOAD
#define vload_partial_2_8   NO_LOAD
#define vload_partial_2_9   NO_LOAD
#define vload_partial_2_10  NO_LOAD
#define vload_partial_2_11  NO_LOAD
#define vload_partial_2_12  NO_LOAD
#define vload_partial_2_13  NO_LOAD
#define vload_partial_2_14  NO_LOAD
#define vload_partial_2_15  NO_LOAD
#define vload_partial_2_16  NO_LOAD

/* size 3 */
#define vload_partial_3_0   NO_LOAD
#define vload_partial_3_1   vload_partial_1
#define vload_partial_3_2   vload_partial_2
#define vload_partial_3_3   vload_partial_3
#define vload_partial_3_4   NO_LOAD
#define vload_partial_3_5   NO_LOAD
#define vload_partial_3_6   NO_LOAD
#define vload_partial_3_7   NO_LOAD
#define vload_partial_3_8   NO_LOAD
#define vload_partial_3_9   NO_LOAD
#define vload_partial_3_10  NO_LOAD
#define vload_partial_3_11  NO_LOAD
#define vload_partial_3_12  NO_LOAD
#define vload_partial_3_13  NO_LOAD
#define vload_partial_3_14  NO_LOAD
#define vload_partial_3_15  NO_LOAD
#define vload_partial_3_16  NO_LOAD

/* size 4 */
#define vload_partial_4_0   NO_LOAD
#define vload_partial_4_1   vload_partial_1
#define vload_partial_4_2   vload_partial_2
#define vload_partial_4_3   vload_partial_3
#define vload_partial_4_4   vload_partial_4
#define vload_partial_4_5   NO_LOAD
#define vload_partial_4_6   NO_LOAD
#define vload_partial_4_7   NO_LOAD
#define vload_partial_4_8   NO_LOAD
#define vload_partial_4_9   NO_LOAD
#define vload_partial_4_10  NO_LOAD
#define vload_partial_4_11  NO_LOAD
#define vload_partial_4_12  NO_LOAD
#define vload_partial_4_13  NO_LOAD
#define vload_partial_4_14  NO_LOAD
#define vload_partial_4_15  NO_LOAD
#define vload_partial_4_16  NO_LOAD

/* size 8 */
#define vload_partial_8_0   NO_LOAD
#define vload_partial_8_1   vload_partial_1
#define vload_partial_8_2   vload_partial_2
#define vload_partial_8_3   vload_partial_3
#define vload_partial_8_4   vload_partial_4
#define vload_partial_8_5   vload_partial_5
#define vload_partial_8_6   vload_partial_6
#define vload_partial_8_7   vload_partial_7
#define vload_partial_8_8   vload_partial_8
#define vload_partial_8_9   NO_LOAD
#define vload_partial_8_10  NO_LOAD
#define vload_partial_8_11  NO_LOAD
#define vload_partial_8_12  NO_LOAD
#define vload_partial_8_13  NO_LOAD
#define vload_partial_8_14  NO_LOAD
#define vload_partial_8_15  NO_LOAD
#define vload_partial_8_16  NO_LOAD

/* size 16 */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16
6062*c217d954SCole Faust
6063*c217d954SCole Faust
// vload_partial_<n>(DATA, OFFSET, PTR): assign the first n components of the vector DATA
// from memory addressed through PTR, leaving the remaining components of DATA untouched.
//
// Sizes 1, 2, 3, 4, 8 and 16 expand to a single vload<n> assignment on a swizzle of DATA.
// Every other size is split in two pieces: a 4- or 8-wide piece read from PTR, followed by
// the remainder read from PTR + 4 or PTR + 8.
//
// Every expansion ends in a semicolon and the composed sizes expand to several statements
// that are not wrapped in a block, so these macros are only usable where a statement
// sequence is valid.
//
// NOTE(review): the composed sizes pass the same OFFSET to both pieces. Because the OpenCL
// vload<k> built-in scales its offset by k, the two pieces only address contiguous
// elements when OFFSET is 0 - confirm that callers always pass 0.
6064*c217d954SCole Faust#define vload_partial_1(DATA, OFFSET, PTR) \
6065*c217d954SCole Faust    DATA.s0 = vload1(OFFSET, PTR);
6066*c217d954SCole Faust
6067*c217d954SCole Faust#define vload_partial_2(DATA, OFFSET, PTR) \
6068*c217d954SCole Faust    DATA.s01 = vload2(OFFSET, PTR);
6069*c217d954SCole Faust
6070*c217d954SCole Faust#define vload_partial_3(DATA, OFFSET, PTR) \
6071*c217d954SCole Faust    DATA.s012 = vload3(OFFSET, PTR);
6072*c217d954SCole Faust
6073*c217d954SCole Faust#define vload_partial_4(DATA, OFFSET, PTR) \
6074*c217d954SCole Faust    DATA.s0123 = vload4(OFFSET, PTR);
6075*c217d954SCole Faust
// Sizes 5..7: low four components first, then 1..3 more starting at component 4.
6076*c217d954SCole Faust#define vload_partial_5(DATA, OFFSET, PTR)    \
6077*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
6078*c217d954SCole Faust    DATA.s4 = vload1(OFFSET, PTR + 4);
6079*c217d954SCole Faust
6080*c217d954SCole Faust#define vload_partial_6(DATA, OFFSET, PTR)    \
6081*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
6082*c217d954SCole Faust    vload_partial_2(DATA.s45, OFFSET, PTR + 4);
6083*c217d954SCole Faust
6084*c217d954SCole Faust#define vload_partial_7(DATA, OFFSET, PTR)    \
6085*c217d954SCole Faust    vload_partial_4(DATA.s0123, OFFSET, PTR); \
6086*c217d954SCole Faust    vload_partial_3(DATA.s456, OFFSET, PTR + 4);
6087*c217d954SCole Faust
6088*c217d954SCole Faust#define vload_partial_8(DATA, OFFSET, PTR) \
6089*c217d954SCole Faust    DATA.s01234567 = vload8(OFFSET, PTR);
6090*c217d954SCole Faust
// Sizes 9..15: low eight components first, then the remainder starting at component 8.
6091*c217d954SCole Faust#define vload_partial_9(DATA, OFFSET, PTR)        \
6092*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6093*c217d954SCole Faust    DATA.s8 = vload1(OFFSET, PTR + 8);
6094*c217d954SCole Faust
6095*c217d954SCole Faust#define vload_partial_10(DATA, OFFSET, PTR)       \
6096*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6097*c217d954SCole Faust    vload_partial_2(DATA.s89, OFFSET, PTR + 8);
6098*c217d954SCole Faust
6099*c217d954SCole Faust#define vload_partial_11(DATA, OFFSET, PTR)       \
6100*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6101*c217d954SCole Faust    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);
6102*c217d954SCole Faust
6103*c217d954SCole Faust#define vload_partial_12(DATA, OFFSET, PTR)       \
6104*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6105*c217d954SCole Faust    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);
6106*c217d954SCole Faust
// Sizes 13..15 hand the whole upper half (s89ABCDEF) to the 5/6/7 macros, which in turn
// write only the first 5, 6 or 7 components of that half.
6107*c217d954SCole Faust#define vload_partial_13(DATA, OFFSET, PTR)       \
6108*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6109*c217d954SCole Faust    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);
6110*c217d954SCole Faust
6111*c217d954SCole Faust#define vload_partial_14(DATA, OFFSET, PTR)       \
6112*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6113*c217d954SCole Faust    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
6114*c217d954SCole Faust
6115*c217d954SCole Faust#define vload_partial_15(DATA, OFFSET, PTR)       \
6116*c217d954SCole Faust    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
6117*c217d954SCole Faust    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
6118*c217d954SCole Faust
6119*c217d954SCole Faust#define vload_partial_16(DATA, OFFSET, PTR) \
6120*c217d954SCole Faust    DATA = vload16(OFFSET, PTR);
6121*c217d954SCole Faust
6122*c217d954SCole Faust
6123*c217d954SCole Faust
// PIXEL_UNIT<vec_size>: constants for vector sizes 4, 8 and 16 (1, 2 and 4 respectively).
// The values equal the x1/x2/x4 suffixes of the read_image2d_*/write_image2d_* macros in
// this file, each of which moves one 4-component pixel per unit.
6124*c217d954SCole Faust#define PIXEL_UNIT4 1
6125*c217d954SCole Faust#define PIXEL_UNIT8 2
6126*c217d954SCole Faust#define PIXEL_UNIT16 4
6127*c217d954SCole Faust
6128*c217d954SCole Faust
// CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): resolves to PIXEL_UNIT<vec_size>.
// The extra _STR level lets a macro passed as vec_size be expanded before it is pasted.
6129*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
6130*c217d954SCole Faust#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
6131*c217d954SCole Faust
6132*c217d954SCole Faust
// read_image2d_<type>x<N>(img, x_coord, y_coord): read N pixels from img at
// (x_coord, y_coord), (x_coord + 1, y_coord), ... and pack them, in that order, into one
// vector of 4 * N components (float4 / float8 / float16 for the float variants).
// Each expansion already ends in a semicolon, so it cannot be nested inside a larger
// expression.
6133*c217d954SCole Faust#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
6134*c217d954SCole Faust#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
6135*c217d954SCole Faust#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
6136*c217d954SCole Faust
// Half-precision variants use read_imageh and exist only when both
// ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined.
6137*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
6138*c217d954SCole Faust#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
6139*c217d954SCole Faust#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
6140*c217d954SCole Faust#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
6141*c217d954SCole Faust#endif
6142*c217d954SCole Faust
// write_image2d_<type>x<N>(img, x_coord, y_coord, values): write N pixels to img at
// (x_coord, y_coord), (x_coord + 1, y_coord), ... taking consecutive 4-component slices of
// values (s0123, s4567, s89AB, sCDEF). The x1 variant passes values through unchanged.
// The writes are joined with the comma operator and the expansion ends in a semicolon.
6143*c217d954SCole Faust#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
6144*c217d954SCole Faust#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
6145*c217d954SCole Faust#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
6146*c217d954SCole Faust
// Half-precision variants use write_imageh and exist only when both
// ARM_COMPUTE_OPENCL_FP16_ENABLED and cl_khr_fp16 are defined.
6147*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
6148*c217d954SCole Faust#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
6149*c217d954SCole Faust#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
6150*c217d954SCole Faust#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
6151*c217d954SCole Faust#endif
6152*c217d954SCole Faust
6153*c217d954SCole Faust
// READ_IMAGE2D(data_type, n0, img, x_coord, y_coord): dispatches to
// read_image2d_<data_type>x<n0>. The _STR level does the token pasting; the outer macro
// exists so that macro arguments are expanded before they are pasted.
6154*c217d954SCole Faust#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
6155*c217d954SCole Faust#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
6156*c217d954SCole Faust
6157*c217d954SCole Faust
// WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values): dispatches to
// write_image2d_<data_type>x<n0> in the same two-step way.
6158*c217d954SCole Faust#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
6159*c217d954SCole Faust#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
6160*c217d954SCole Faust
// VSTORE(size): resolves to the name vstore<size>; the caller supplies the argument list.
// The _STR level does the pasting so that a macro passed as size is expanded first.
6161*c217d954SCole Faust#define VSTORE_STR(size) vstore##size
6162*c217d954SCole Faust#define VSTORE(size) VSTORE_STR(size)
6163*c217d954SCole Faust
// Size-1 type names mapped to the matching scalar type, so that a pasted name such as
// the result of VEC_DATA_TYPE(float, 1) still names a valid type.
6164*c217d954SCole Faust#define float1 float
6165*c217d954SCole Faust#define half1 half
6166*c217d954SCole Faust#define char1 char
6167*c217d954SCole Faust#define uchar1 uchar
6168*c217d954SCole Faust#define short1 short
6169*c217d954SCole Faust#define ushort1 ushort
6170*c217d954SCole Faust#define int1 int
6171*c217d954SCole Faust#define uint1 uint
6172*c217d954SCole Faust#define long1 long
6173*c217d954SCole Faust#define ulong1 ulong
6174*c217d954SCole Faust#define double1 double
6175*c217d954SCole Faust
// Scalar counterparts of the vload<n>/vstore<n> names: vload1 dereferences OFFSET + PTR and
// vstore1 assigns DATA through the same address.
// Neither the arguments nor the whole expansion are parenthesised, so pass simple
// expressions and use the result as a stand-alone operand or statement.
6176*c217d954SCole Faust#define vload1(OFFSET, PTR) *(OFFSET + PTR)
6177*c217d954SCole Faust#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
6178*c217d954SCole Faust
6179*c217d954SCole Faust
// VSTORE_PARTIAL(size, store_size): resolves to the name vstore_partial_<size>_<store_size>,
// which the tables below map to either a vstore_partial_<n> macro, vstore1, or NO_STORE.
// The _STR level does the pasting so that macro arguments are expanded first.
6180*c217d954SCole Faust#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
6181*c217d954SCole Faust#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
6182*c217d954SCole Faust
// NO_STORE(data, offs, ptr): ignores its arguments and expands to an empty brace block.
6183*c217d954SCole Faust#define NO_STORE(data, offs, ptr) \
6184*c217d954SCole Faust    {                             \
6185*c217d954SCole Faust    }
6186*c217d954SCole Faust
6187*c217d954SCole Faust
// vstore_partial_1_<n> table (size-1 data, n elements requested).
// Only n = 1 stores anything, and it maps straight to vstore1 rather than to
// vstore_partial_1 (which applies a .s0 swizzle to its DATA argument).
// Every other n resolves to NO_STORE.
6188*c217d954SCole Faust#define vstore_partial_1_0 NO_STORE
6189*c217d954SCole Faust#define vstore_partial_1_1 vstore1
6190*c217d954SCole Faust#define vstore_partial_1_2 NO_STORE
6191*c217d954SCole Faust#define vstore_partial_1_3 NO_STORE
6192*c217d954SCole Faust#define vstore_partial_1_4 NO_STORE
6193*c217d954SCole Faust#define vstore_partial_1_5 NO_STORE
6194*c217d954SCole Faust#define vstore_partial_1_6 NO_STORE
6195*c217d954SCole Faust#define vstore_partial_1_7 NO_STORE
6196*c217d954SCole Faust#define vstore_partial_1_8 NO_STORE
6197*c217d954SCole Faust#define vstore_partial_1_9 NO_STORE
6198*c217d954SCole Faust#define vstore_partial_1_10 NO_STORE
6199*c217d954SCole Faust#define vstore_partial_1_11 NO_STORE
6200*c217d954SCole Faust#define vstore_partial_1_12 NO_STORE
6201*c217d954SCole Faust#define vstore_partial_1_13 NO_STORE
6202*c217d954SCole Faust#define vstore_partial_1_14 NO_STORE
6203*c217d954SCole Faust#define vstore_partial_1_15 NO_STORE
6204*c217d954SCole Faust#define vstore_partial_1_16 NO_STORE
6205*c217d954SCole Faust
// vstore_partial_2_<n> table (a vector of 2, n elements requested).
// n = 1..2 resolves to vstore_partial_<n>; n = 0 and n > 2 resolve to NO_STORE.
6206*c217d954SCole Faust#define vstore_partial_2_0 NO_STORE
6207*c217d954SCole Faust#define vstore_partial_2_1 vstore_partial_1
6208*c217d954SCole Faust#define vstore_partial_2_2 vstore_partial_2
6209*c217d954SCole Faust#define vstore_partial_2_3 NO_STORE
6210*c217d954SCole Faust#define vstore_partial_2_4 NO_STORE
6211*c217d954SCole Faust#define vstore_partial_2_5 NO_STORE
6212*c217d954SCole Faust#define vstore_partial_2_6 NO_STORE
6213*c217d954SCole Faust#define vstore_partial_2_7 NO_STORE
6214*c217d954SCole Faust#define vstore_partial_2_8 NO_STORE
6215*c217d954SCole Faust#define vstore_partial_2_9 NO_STORE
6216*c217d954SCole Faust#define vstore_partial_2_10 NO_STORE
6217*c217d954SCole Faust#define vstore_partial_2_11 NO_STORE
6218*c217d954SCole Faust#define vstore_partial_2_12 NO_STORE
6219*c217d954SCole Faust#define vstore_partial_2_13 NO_STORE
6220*c217d954SCole Faust#define vstore_partial_2_14 NO_STORE
6221*c217d954SCole Faust#define vstore_partial_2_15 NO_STORE
6222*c217d954SCole Faust#define vstore_partial_2_16 NO_STORE
6223*c217d954SCole Faust
// vstore_partial_3_<n> table (a vector of 3, n elements requested).
// n = 1..3 resolves to vstore_partial_<n>; n = 0 and n > 3 resolve to NO_STORE.
6224*c217d954SCole Faust#define vstore_partial_3_0 NO_STORE
6225*c217d954SCole Faust#define vstore_partial_3_1 vstore_partial_1
6226*c217d954SCole Faust#define vstore_partial_3_2 vstore_partial_2
6227*c217d954SCole Faust#define vstore_partial_3_3 vstore_partial_3
6228*c217d954SCole Faust#define vstore_partial_3_4 NO_STORE
6229*c217d954SCole Faust#define vstore_partial_3_5 NO_STORE
6230*c217d954SCole Faust#define vstore_partial_3_6 NO_STORE
6231*c217d954SCole Faust#define vstore_partial_3_7 NO_STORE
6232*c217d954SCole Faust#define vstore_partial_3_8 NO_STORE
6233*c217d954SCole Faust#define vstore_partial_3_9 NO_STORE
6234*c217d954SCole Faust#define vstore_partial_3_10 NO_STORE
6235*c217d954SCole Faust#define vstore_partial_3_11 NO_STORE
6236*c217d954SCole Faust#define vstore_partial_3_12 NO_STORE
6237*c217d954SCole Faust#define vstore_partial_3_13 NO_STORE
6238*c217d954SCole Faust#define vstore_partial_3_14 NO_STORE
6239*c217d954SCole Faust#define vstore_partial_3_15 NO_STORE
6240*c217d954SCole Faust#define vstore_partial_3_16 NO_STORE
6241*c217d954SCole Faust
// vstore_partial_4_<n> table (a vector of 4, n elements requested).
// n = 1..4 resolves to vstore_partial_<n>; n = 0 and n > 4 resolve to NO_STORE.
6242*c217d954SCole Faust#define vstore_partial_4_0 NO_STORE
6243*c217d954SCole Faust#define vstore_partial_4_1 vstore_partial_1
6244*c217d954SCole Faust#define vstore_partial_4_2 vstore_partial_2
6245*c217d954SCole Faust#define vstore_partial_4_3 vstore_partial_3
6246*c217d954SCole Faust#define vstore_partial_4_4 vstore_partial_4
6247*c217d954SCole Faust#define vstore_partial_4_5 NO_STORE
6248*c217d954SCole Faust#define vstore_partial_4_6 NO_STORE
6249*c217d954SCole Faust#define vstore_partial_4_7 NO_STORE
6250*c217d954SCole Faust#define vstore_partial_4_8 NO_STORE
6251*c217d954SCole Faust#define vstore_partial_4_9 NO_STORE
6252*c217d954SCole Faust#define vstore_partial_4_10 NO_STORE
6253*c217d954SCole Faust#define vstore_partial_4_11 NO_STORE
6254*c217d954SCole Faust#define vstore_partial_4_12 NO_STORE
6255*c217d954SCole Faust#define vstore_partial_4_13 NO_STORE
6256*c217d954SCole Faust#define vstore_partial_4_14 NO_STORE
6257*c217d954SCole Faust#define vstore_partial_4_15 NO_STORE
6258*c217d954SCole Faust#define vstore_partial_4_16 NO_STORE
6259*c217d954SCole Faust
// vstore_partial_8_<n> table (a vector of 8, n elements requested).
// n = 1..8 resolves to vstore_partial_<n>; n = 0 and n > 8 resolve to NO_STORE.
6260*c217d954SCole Faust#define vstore_partial_8_0 NO_STORE
6261*c217d954SCole Faust#define vstore_partial_8_1 vstore_partial_1
6262*c217d954SCole Faust#define vstore_partial_8_2 vstore_partial_2
6263*c217d954SCole Faust#define vstore_partial_8_3 vstore_partial_3
6264*c217d954SCole Faust#define vstore_partial_8_4 vstore_partial_4
6265*c217d954SCole Faust#define vstore_partial_8_5 vstore_partial_5
6266*c217d954SCole Faust#define vstore_partial_8_6 vstore_partial_6
6267*c217d954SCole Faust#define vstore_partial_8_7 vstore_partial_7
6268*c217d954SCole Faust#define vstore_partial_8_8 vstore_partial_8
6269*c217d954SCole Faust#define vstore_partial_8_9 NO_STORE
6270*c217d954SCole Faust#define vstore_partial_8_10 NO_STORE
6271*c217d954SCole Faust#define vstore_partial_8_11 NO_STORE
6272*c217d954SCole Faust#define vstore_partial_8_12 NO_STORE
6273*c217d954SCole Faust#define vstore_partial_8_13 NO_STORE
6274*c217d954SCole Faust#define vstore_partial_8_14 NO_STORE
6275*c217d954SCole Faust#define vstore_partial_8_15 NO_STORE
6276*c217d954SCole Faust#define vstore_partial_8_16 NO_STORE
6277*c217d954SCole Faust
// vstore_partial_16_<n> table (a vector of 16, n elements requested).
// n = 0 resolves to NO_STORE; n = 1..16 resolves to vstore_partial_<n>.
6278*c217d954SCole Faust#define vstore_partial_16_0 NO_STORE
6279*c217d954SCole Faust#define vstore_partial_16_1 vstore_partial_1
6280*c217d954SCole Faust#define vstore_partial_16_2 vstore_partial_2
6281*c217d954SCole Faust#define vstore_partial_16_3 vstore_partial_3
6282*c217d954SCole Faust#define vstore_partial_16_4 vstore_partial_4
6283*c217d954SCole Faust#define vstore_partial_16_5 vstore_partial_5
6284*c217d954SCole Faust#define vstore_partial_16_6 vstore_partial_6
6285*c217d954SCole Faust#define vstore_partial_16_7 vstore_partial_7
6286*c217d954SCole Faust#define vstore_partial_16_8 vstore_partial_8
6287*c217d954SCole Faust#define vstore_partial_16_9 vstore_partial_9
6288*c217d954SCole Faust#define vstore_partial_16_10 vstore_partial_10
6289*c217d954SCole Faust#define vstore_partial_16_11 vstore_partial_11
6290*c217d954SCole Faust#define vstore_partial_16_12 vstore_partial_12
6291*c217d954SCole Faust#define vstore_partial_16_13 vstore_partial_13
6292*c217d954SCole Faust#define vstore_partial_16_14 vstore_partial_14
6293*c217d954SCole Faust#define vstore_partial_16_15 vstore_partial_15
6294*c217d954SCole Faust#define vstore_partial_16_16 vstore_partial_16
6295*c217d954SCole Faust
6296*c217d954SCole Faust
// vstore_partial_<n>(DATA, OFFSET, PTR): store the first n components of the vector DATA
// to memory addressed through PTR.
//
// Sizes 1, 2, 3, 4, 8 and 16 expand to a single vstore call on a swizzle of DATA (the
// whole of DATA for 16). Every other size is split in two pieces: a 4- or 8-wide piece
// stored at PTR, followed by the remainder stored at PTR + 4 or PTR + 8.
//
// Every expansion ends in a semicolon and the composed sizes expand to several statements
// that are not wrapped in a block, so these macros are only usable where a statement
// sequence is valid.
//
// NOTE(review): the composed sizes pass the same OFFSET to both pieces. Because the OpenCL
// vstore<k> built-in scales its offset by k, the two pieces only address contiguous
// elements when OFFSET is 0 - confirm that callers always pass 0.
6297*c217d954SCole Faust#define vstore_partial_1(DATA, OFFSET, PTR) \
6298*c217d954SCole Faust    vstore1(DATA.s0, OFFSET, PTR);
6299*c217d954SCole Faust
6300*c217d954SCole Faust#define vstore_partial_2(DATA, OFFSET, PTR) \
6301*c217d954SCole Faust    vstore2(DATA.s01, OFFSET, PTR);
6302*c217d954SCole Faust
6303*c217d954SCole Faust#define vstore_partial_3(DATA, OFFSET, PTR) \
6304*c217d954SCole Faust    vstore3(DATA.s012, OFFSET, PTR);
6305*c217d954SCole Faust
6306*c217d954SCole Faust#define vstore_partial_4(DATA, OFFSET, PTR) \
6307*c217d954SCole Faust    vstore4(DATA.s0123, OFFSET, PTR);
6308*c217d954SCole Faust
// Sizes 5..7: low four components first, then 1..3 more starting at component 4.
6309*c217d954SCole Faust#define vstore_partial_5(DATA, OFFSET, PTR)    \
6310*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
6311*c217d954SCole Faust    vstore1(DATA.s4, OFFSET, PTR + 4);
6312*c217d954SCole Faust
6313*c217d954SCole Faust#define vstore_partial_6(DATA, OFFSET, PTR)    \
6314*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
6315*c217d954SCole Faust    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
6316*c217d954SCole Faust
6317*c217d954SCole Faust#define vstore_partial_7(DATA, OFFSET, PTR)    \
6318*c217d954SCole Faust    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
6319*c217d954SCole Faust    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
6320*c217d954SCole Faust
6321*c217d954SCole Faust#define vstore_partial_8(DATA, OFFSET, PTR) \
6322*c217d954SCole Faust    vstore8(DATA.s01234567, OFFSET, PTR);
6323*c217d954SCole Faust
// Sizes 9..15: low eight components first, then the remainder starting at component 8.
6324*c217d954SCole Faust#define vstore_partial_9(DATA, OFFSET, PTR)        \
6325*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6326*c217d954SCole Faust    vstore1(DATA.s8, OFFSET, PTR + 8);
6327*c217d954SCole Faust
6328*c217d954SCole Faust#define vstore_partial_10(DATA, OFFSET, PTR)       \
6329*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6330*c217d954SCole Faust    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
6331*c217d954SCole Faust
6332*c217d954SCole Faust#define vstore_partial_11(DATA, OFFSET, PTR)       \
6333*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6334*c217d954SCole Faust    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
6335*c217d954SCole Faust
6336*c217d954SCole Faust#define vstore_partial_12(DATA, OFFSET, PTR)       \
6337*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6338*c217d954SCole Faust    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
6339*c217d954SCole Faust
// Sizes 13..15 hand the whole upper half (s89abcdef) to the 5/6/7 macros, which in turn
// store only the first 5, 6 or 7 components of that half.
6340*c217d954SCole Faust#define vstore_partial_13(DATA, OFFSET, PTR)       \
6341*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6342*c217d954SCole Faust    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
6343*c217d954SCole Faust
6344*c217d954SCole Faust#define vstore_partial_14(DATA, OFFSET, PTR)       \
6345*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6346*c217d954SCole Faust    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
6347*c217d954SCole Faust
6348*c217d954SCole Faust#define vstore_partial_15(DATA, OFFSET, PTR)       \
6349*c217d954SCole Faust    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
6350*c217d954SCole Faust    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
6351*c217d954SCole Faust
6352*c217d954SCole Faust#define vstore_partial_16(DATA, OFFSET, PTR) \
6353*c217d954SCole Faust    vstore16(DATA, OFFSET, PTR);
6354*c217d954SCole Faust
6355*c217d954SCole Faust
6356*c217d954SCole Faust
6357*c217d954SCole Faust
6358*c217d954SCole Faust
// Saturating-conversion aliases for floating-point destination types.
//
// CONVERT_SAT(x, type) pastes together convert_<type>_sat. OpenCL provides no _sat
// conversions to floating-point types, so for float and half destinations each such name
// is mapped to the plain conversion of the same destination type and width. The bare and
// size-1 spellings both map to the scalar conversion.
//
// convert_half_sat maps to convert_half, so a half destination is never silently widened
// to float.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16
6373*c217d954SCole Faust
// Size-1 conversion names mapped to the scalar conversion function, so that a pasted name
// such as convert_float1 (from CONVERT with a size-1 type name) resolves to convert_float.
6374*c217d954SCole Faust#define convert_float1 convert_float
6375*c217d954SCole Faust#define convert_half1 convert_half
6376*c217d954SCole Faust#define convert_char1 convert_char
6377*c217d954SCole Faust#define convert_uchar1 convert_uchar
6378*c217d954SCole Faust#define convert_short1 convert_short
6379*c217d954SCole Faust#define convert_ushort1 convert_ushort
6380*c217d954SCole Faust#define convert_int1 convert_int
6381*c217d954SCole Faust#define convert_uint1 convert_uint
6382*c217d954SCole Faust#define convert_long1 convert_long
6383*c217d954SCole Faust#define convert_ulong1 convert_ulong
6384*c217d954SCole Faust#define convert_double1 convert_double
6385*c217d954SCole Faust
// Size-1 saturating conversion names mapped to the scalar convert_<type>_sat name.
// The convert_uchar2_sat .. convert_uchar16_sat entries define each name as itself, so
// they expand to the same identifier and have no effect on the generated code.
// NOTE(review): convert_double_sat does not appear to be an OpenCL built-in (the _sat
// modifier applies to integer destinations only) - confirm whether convert_double1_sat is
// ever used.
6386*c217d954SCole Faust#define convert_char1_sat convert_char_sat
6387*c217d954SCole Faust#define convert_uchar1_sat convert_uchar_sat
6388*c217d954SCole Faust#define convert_uchar2_sat convert_uchar2_sat
6389*c217d954SCole Faust#define convert_uchar3_sat convert_uchar3_sat
6390*c217d954SCole Faust#define convert_uchar4_sat convert_uchar4_sat
6391*c217d954SCole Faust#define convert_uchar8_sat convert_uchar8_sat
6392*c217d954SCole Faust#define convert_uchar16_sat convert_uchar16_sat
6393*c217d954SCole Faust#define convert_short1_sat convert_short_sat
6394*c217d954SCole Faust#define convert_ushort1_sat convert_ushort_sat
6395*c217d954SCole Faust#define convert_int1_sat convert_int_sat
6396*c217d954SCole Faust#define convert_uint1_sat convert_uint_sat
6397*c217d954SCole Faust#define convert_long1_sat convert_long_sat
6398*c217d954SCole Faust#define convert_ulong1_sat convert_ulong_sat
6399*c217d954SCole Faust#define convert_double1_sat convert_double_sat
6400*c217d954SCole Faust
// VEC_DATA_TYPE(type, size): pastes type and size into one type name (e.g. float + 4 gives
// float4). Each macro in this group has a _STR level that does the pasting, so that macro
// arguments are expanded before they are pasted.
6401*c217d954SCole Faust#define VEC_DATA_TYPE_STR(type, size) type##size
6402*c217d954SCole Faust#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
6403*c217d954SCole Faust
// CONVERT(x, type): expands to a parenthesised call of convert_<type> on x.
6404*c217d954SCole Faust#define CONVERT_STR(x, type) (convert_##type((x)))
6405*c217d954SCole Faust#define CONVERT(x, type) CONVERT_STR(x, type)
6406*c217d954SCole Faust
// CONVERT_SAT(x, type): expands to a parenthesised call of convert_<type>_sat on x.
6407*c217d954SCole Faust#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
6408*c217d954SCole Faust#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
6409*c217d954SCole Faust
// CONVERT_SAT_ROUND(x, type, round): expands to a parenthesised call of
// convert_<type>_sat_<round> on x.
6410*c217d954SCole Faust#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
6411*c217d954SCole Faust#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
6412*c217d954SCole Faust
// select_vec_dt_<type>(size): maps an element type to a type name of the given size.
// Integer types map to themselves; half maps to short<size> and float maps to int<size>.
// NOTE(review): presumably this is the type used for the condition operand of select -
// confirm at the call sites.
6413*c217d954SCole Faust#define select_vec_dt_uchar(size) uchar##size
6414*c217d954SCole Faust#define select_vec_dt_char(size) char##size
6415*c217d954SCole Faust#define select_vec_dt_ushort(size) ushort##size
6416*c217d954SCole Faust#define select_vec_dt_short(size) short##size
6417*c217d954SCole Faust#define select_vec_dt_half(size) short##size
6418*c217d954SCole Faust#define select_vec_dt_uint(size) uint##size
6419*c217d954SCole Faust#define select_vec_dt_int(size) int##size
6420*c217d954SCole Faust#define select_vec_dt_float(size) int##size
6421*c217d954SCole Faust#define select_vec_dt_ulong(size) ulong##size
6422*c217d954SCole Faust#define select_vec_dt_long(size) long##size
6423*c217d954SCole Faust
// SELECT_VEC_DATA_TYPE(type, size) dispatches to select_vec_dt_<type>(size);
// SELECT_DATA_TYPE(type) is the same lookup with size fixed to 1.
6424*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
6425*c217d954SCole Faust#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
6426*c217d954SCole Faust#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
6427*c217d954SCole Faust
// signed_int_vec_dt_<type>(size): maps an element type to the signed integer type name of
// the same element width and the given size (uchar/char to char, ushort/short/half to
// short, uint/int/float to int, ulong/long to long).
6428*c217d954SCole Faust#define signed_int_vec_dt_uchar(size) char##size
6429*c217d954SCole Faust#define signed_int_vec_dt_char(size) char##size
6430*c217d954SCole Faust#define signed_int_vec_dt_ushort(size) short##size
6431*c217d954SCole Faust#define signed_int_vec_dt_short(size) short##size
6432*c217d954SCole Faust#define signed_int_vec_dt_half(size) short##size
6433*c217d954SCole Faust#define signed_int_vec_dt_uint(size) int##size
6434*c217d954SCole Faust#define signed_int_vec_dt_int(size) int##size
6435*c217d954SCole Faust#define signed_int_vec_dt_float(size) int##size
6436*c217d954SCole Faust#define signed_int_vec_dt_ulong(size) long##size
6437*c217d954SCole Faust#define signed_int_vec_dt_long(size) long##size
6438*c217d954SCole Faust
// SIGNED_INT_VEC_DATA_TYPE(type, size) dispatches to signed_int_vec_dt_<type>(size);
// SIGNED_INT_DATA_TYPE(type) is the same lookup with size fixed to 1.
6439*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
6440*c217d954SCole Faust#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
6441*c217d954SCole Faust#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
6442*c217d954SCole Faust
// sum_reduce_<size>(x): sum of the first <size> components of the vector x
// (sum_reduce_1 returns x itself).
//
// Every expansion is wrapped in parentheses so it can be used safely as an operand of any
// operator, e.g. a - SUM_REDUCE(v, 2) or 2 * SUM_REDUCE(v, 4). The terms are added left
// to right in component order (s0, s1, ...), which is the same order an unparenthesised
// pairwise expansion evaluates in. x is evaluated once per component, so it must not have
// side effects.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (((x).s0) + ((x).s1) + ((x).s2))
#define sum_reduce_4(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3))
#define sum_reduce_8(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7))
#define sum_reduce_16(x) (((x).s0) + ((x).s1) + ((x).s2) + ((x).s3) + ((x).s4) + ((x).s5) + ((x).s6) + ((x).s7) + ((x).s8) + ((x).s9) + ((x).sA) + ((x).sB) + ((x).sC) + ((x).sD) + ((x).sE) + ((x).sF))

// SUM_REDUCE(x, size): dispatches to sum_reduce_<size>(x). The _STR level does the
// pasting so that a macro passed as size is expanded first.
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
6452*c217d954SCole Faust
// prod_reduce_<size>(x): product of the first <size> components of the vector x
// (prod_reduce_1 returns x itself).
//
// Every expansion is wrapped in parentheses so it can be used safely as an operand of any
// operator, e.g. a / PROD_REDUCE(v, 2) or a % PROD_REDUCE(v, 4). The factors are
// multiplied left to right in component order (s0, s1, ...), which is the same order an
// unparenthesised pairwise expansion evaluates in. x is evaluated once per component, so
// it must not have side effects.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (((x).s0) * ((x).s1) * ((x).s2))
#define prod_reduce_4(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3))
#define prod_reduce_8(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7))
#define prod_reduce_16(x) (((x).s0) * ((x).s1) * ((x).s2) * ((x).s3) * ((x).s4) * ((x).s5) * ((x).s6) * ((x).s7) * ((x).s8) * ((x).s9) * ((x).sA) * ((x).sB) * ((x).sC) * ((x).sD) * ((x).sE) * ((x).sF))

// PROD_REDUCE(x, size): dispatches to prod_reduce_<size>(x). The _STR level does the
// pasting so that a macro passed as size is expanded first.
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
6462*c217d954SCole Faust
// max_reduce_<size>(x): maximum of the first <size> components of the vector x, built from
// nested two-argument max calls over halves of the vector (max_reduce_1 returns x itself).
// x is evaluated once per component.
6463*c217d954SCole Faust#define max_reduce_1(x) (x)
6464*c217d954SCole Faust#define max_reduce_2(x) max(((x).s0), ((x).s1))
6465*c217d954SCole Faust#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
6466*c217d954SCole Faust#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
6467*c217d954SCole Faust#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
6468*c217d954SCole Faust#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
6469*c217d954SCole Faust
// MAX_REDUCE(x, size): dispatches to max_reduce_<size>(x). The _STR level does the
// pasting so that a macro passed as size is expanded first.
6470*c217d954SCole Faust#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
6471*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
6472*c217d954SCole Faust
// Parameter-list generators. Each macro expands to a comma-separated list of parameter
// declarations whose names are prefixed with `name`: a __global uchar pointer, then a
// uint stride and a uint step for each dimension, then a uint
// <name>_offset_first_element_in_bytes. The expansion has no trailing comma.
//
// VECTOR_DECLARATION: one dimension (x).
6473*c217d954SCole Faust#define VECTOR_DECLARATION(name)     \
6474*c217d954SCole Faust    __global uchar *name##_ptr,      \
6475*c217d954SCole Faust    uint        name##_stride_x, \
6476*c217d954SCole Faust    uint        name##_step_x,   \
6477*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
6478*c217d954SCole Faust
// IMAGE_DECLARATION: two dimensions (x, y).
6479*c217d954SCole Faust#define IMAGE_DECLARATION(name)      \
6480*c217d954SCole Faust    __global uchar *name##_ptr,      \
6481*c217d954SCole Faust    uint        name##_stride_x, \
6482*c217d954SCole Faust    uint        name##_step_x,   \
6483*c217d954SCole Faust    uint        name##_stride_y, \
6484*c217d954SCole Faust    uint        name##_step_y,   \
6485*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
6486*c217d954SCole Faust
// TENSOR3D_DECLARATION: three dimensions (x, y, z).
6487*c217d954SCole Faust#define TENSOR3D_DECLARATION(name)   \
6488*c217d954SCole Faust    __global uchar *name##_ptr,      \
6489*c217d954SCole Faust    uint        name##_stride_x, \
6490*c217d954SCole Faust    uint        name##_step_x,   \
6491*c217d954SCole Faust    uint        name##_stride_y, \
6492*c217d954SCole Faust    uint        name##_step_y,   \
6493*c217d954SCole Faust    uint        name##_stride_z, \
6494*c217d954SCole Faust    uint        name##_step_z,   \
6495*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
6496*c217d954SCole Faust
// TENSOR4D_DECLARATION: four dimensions (x, y, z, w).
6497*c217d954SCole Faust#define TENSOR4D_DECLARATION(name)   \
6498*c217d954SCole Faust    __global uchar *name##_ptr,      \
6499*c217d954SCole Faust    uint        name##_stride_x, \
6500*c217d954SCole Faust    uint        name##_step_x,   \
6501*c217d954SCole Faust    uint        name##_stride_y, \
6502*c217d954SCole Faust    uint        name##_step_y,   \
6503*c217d954SCole Faust    uint        name##_stride_z, \
6504*c217d954SCole Faust    uint        name##_step_z,   \
6505*c217d954SCole Faust    uint        name##_stride_w, \
6506*c217d954SCole Faust    uint        name##_step_w,   \
6507*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
6508*c217d954SCole Faust
// TENSOR5D_DECLARATION: five dimensions (x, y, z, w, v).
6509*c217d954SCole Faust#define TENSOR5D_DECLARATION(name)   \
6510*c217d954SCole Faust    __global uchar *name##_ptr,      \
6511*c217d954SCole Faust    uint        name##_stride_x, \
6512*c217d954SCole Faust    uint        name##_step_x,   \
6513*c217d954SCole Faust    uint        name##_stride_y, \
6514*c217d954SCole Faust    uint        name##_step_y,   \
6515*c217d954SCole Faust    uint        name##_stride_z, \
6516*c217d954SCole Faust    uint        name##_step_z,   \
6517*c217d954SCole Faust    uint        name##_stride_w, \
6518*c217d954SCole Faust    uint        name##_step_w,   \
6519*c217d954SCole Faust    uint        name##_stride_v, \
6520*c217d954SCole Faust    uint        name##_step_v,   \
6521*c217d954SCole Faust    uint        name##_offset_first_element_in_bytes
6522*c217d954SCole Faust
// CONVERT_TO_VECTOR_STRUCT(name): calls update_vector_workitem_ptr (defined outside this
// excerpt) with the <name>_ptr, <name>_offset_first_element_in_bytes, <name>_stride_x and
// <name>_step_x identifiers, i.e. the names produced by VECTOR_DECLARATION(name).
6523*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \
6524*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
6525*c217d954SCole Faust
// Same call with 0 passed in place of the x step.
6526*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
6527*c217d954SCole Faust    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
6528*c217d954SCole Faust
// CONVERT_TO_IMAGE_STRUCT(name): calls update_image_workitem_ptr (defined outside this
// excerpt) with the pointer, offset and x/y stride and step identifiers produced by
// IMAGE_DECLARATION(name).
6529*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \
6530*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
6531*c217d954SCole Faust
// Same call with 0 passed in place of both the x and y steps.
6532*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
6533*c217d954SCole Faust    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
6534*c217d954SCole Faust
// CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name): calls update_image_from_tensor3D_workitem_ptr
// (defined outside this excerpt) with the pointer, offset and x/y/z stride and step
// identifiers produced by TENSOR3D_DECLARATION(name).
// The macro is defined exactly once.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// Same call with 0 passed in place of the x and y steps; the z step is still forwarded.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
6543*c217d954SCole Faust
6544*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
6545*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
6546*c217d954SCole Faust                                 name##_stride_z, name##_step_z)
6547*c217d954SCole Faust
6548*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
6549*c217d954SCole Faust    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
6550*c217d954SCole Faust
/** Build a Tensor4D from the kernel arguments prefixed with `name`.
 *
 * `mod_size` is forwarded as the last argument of update_tensor4D_workitem_ptr,
 * which uses it to split get_global_id(2) into a z part and a w part.
 */
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, name##_step_x,                   \
                                 name##_stride_y, name##_step_y,                   \
                                 name##_stride_z, name##_step_z,                   \
                                 name##_stride_w, name##_step_w,                   \
                                 mod_size)

/** Same as CONVERT_TO_TENSOR4D_STRUCT, but every step argument is passed as 0. */
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                         \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \
                                 name##_stride_x, 0,                               \
                                 name##_stride_y, 0,                               \
                                 name##_stride_z, 0,                               \
                                 name##_stride_w, 0,                               \
                                 mod_size)
6557*c217d954SCole Faust
/** Build a Tensor3D from the kernel arguments prefixed with `name` through
 *  tensor3D_ptr_no_update, i.e. without moving the pointer.
 */
#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, \
                           name##_stride_x, name##_step_x,                   \
                           name##_stride_y, name##_step_y,                   \
                           name##_stride_z, name##_step_z)
6561*c217d954SCole Faust
6562*c217d954SCole Faust
/** Descriptor of a 1D buffer. */
typedef struct Vector
{
    /** Byte pointer into __global memory. */
    __global uchar *ptr;
    /** Offset of the first element, in bytes (added to ptr by update_vector_workitem_ptr). */
    int offset_first_element_in_bytes;
    /** Stride multiplied by the x index in vector_offset. */
    int stride_x;
} Vector;
6569*c217d954SCole Faust
6570*c217d954SCole Faust
/** Descriptor of a 2D buffer. */
typedef struct Image
{
    /** Byte pointer into __global memory. */
    __global uchar *ptr;
    /** Offset of the first element, in bytes (added to ptr by the update_image_* helpers). */
    int offset_first_element_in_bytes;
    /** Stride multiplied by the x index in offset(). */
    int stride_x;
    /** Stride multiplied by the y index in offset(). */
    int stride_y;
} Image;
6578*c217d954SCole Faust
6579*c217d954SCole Faust
/** Descriptor of a 3D buffer. */
typedef struct Tensor3D
{
    /** Byte pointer into __global memory. */
    __global uchar *ptr;
    /** Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /** Stride multiplied by the x index in tensor3D_offset. */
    int stride_x;
    /** Stride multiplied by the y index in tensor3D_offset. */
    int stride_y;
    /** Stride multiplied by the z index in tensor3D_offset. */
    int stride_z;
} Tensor3D;
6588*c217d954SCole Faust
6589*c217d954SCole Faust
/** Descriptor of a 4D buffer. */
typedef struct Tensor4D
{
    /** Byte pointer into __global memory. */
    __global uchar *ptr;
    /** Offset of the first element, in bytes. */
    int offset_first_element_in_bytes;
    /** Stride multiplied by the x index in tensor4D_offset. */
    int stride_x;
    /** Stride multiplied by the y index in tensor4D_offset. */
    int stride_y;
    /** Stride multiplied by the z index in tensor4D_offset. */
    int stride_z;
    /** Stride multiplied by the w index in tensor4D_offset. */
    int stride_w;
} Tensor4D;
6599*c217d954SCole Faust
6600*c217d954SCole Faust
/** Wrap 1D buffer arguments in a Vector and move its pointer to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored in the returned Vector
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 *
 * @return A Vector whose ptr is advanced by
 *         offset_first_element_in_bytes + get_global_id(0) * step_x.
 */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vec;
    vec.ptr                           = ptr;
    vec.offset_first_element_in_bytes = offset_first_element_in_bytes;
    vec.stride_x                      = stride_x;

    // The offset is deliberately read back from the (int) struct field so the
    // arithmetic matches adding the stored value.
    const size_t x_shift = get_global_id(0) * step_x;
    vec.ptr += vec.offset_first_element_in_bytes + x_shift;
    return vec;
}
6612*c217d954SCole Faust
6613*c217d954SCole Faust
/** Wrap 2D buffer arguments in an Image and move its pointer to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored as Image::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stride stored as Image::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 *
 * @return An Image whose ptr is advanced by the offset plus both per-dimension shifts.
 */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image image;
    image.ptr                           = ptr;
    image.offset_first_element_in_bytes = offset_first_element_in_bytes;
    image.stride_x                      = stride_x;
    image.stride_y                      = stride_y;

    const size_t x_shift = get_global_id(0) * step_x;
    const size_t y_shift = get_global_id(1) * step_y;
    image.ptr += image.offset_first_element_in_bytes + x_shift + y_shift;
    return image;
}
6626*c217d954SCole Faust
6627*c217d954SCole Faust
/** Wrap 3D buffer arguments in an Image and move its pointer to the current work-item.
 *
 * The z dimension only contributes to the pointer shift: stride_z is accepted but
 * not stored, because Image has no z stride.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored as Image::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stride stored as Image::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Unused
 * @param[in] step_z                        Amount multiplied by get_global_id(2)
 *
 * @return An Image whose ptr is advanced by the offset plus the three per-dimension shifts.
 */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image plane;
    plane.ptr                           = ptr;
    plane.offset_first_element_in_bytes = offset_first_element_in_bytes;
    plane.stride_x                      = stride_x;
    plane.stride_y                      = stride_y;

    const size_t x_shift = get_global_id(0) * step_x;
    const size_t y_shift = get_global_id(1) * step_y;
    const size_t z_shift = get_global_id(2) * step_z;
    plane.ptr += plane.offset_first_element_in_bytes + x_shift + y_shift + z_shift;
    return plane;
}
6640*c217d954SCole Faust
6641*c217d954SCole Faust
/** Wrap 3D buffer arguments in a Tensor3D and move its pointer to the current work-item.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored as Tensor3D::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stride stored as Tensor3D::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Stride stored as Tensor3D::stride_z
 * @param[in] step_z                        Amount multiplied by get_global_id(2)
 *
 * @return A Tensor3D whose ptr is advanced by the offset plus the three per-dimension shifts.
 */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D volume;
    volume.ptr                           = ptr;
    volume.offset_first_element_in_bytes = offset_first_element_in_bytes;
    volume.stride_x                      = stride_x;
    volume.stride_y                      = stride_y;
    volume.stride_z                      = stride_z;

    const size_t x_shift = get_global_id(0) * step_x;
    const size_t y_shift = get_global_id(1) * step_y;
    const size_t z_shift = get_global_id(2) * step_z;
    volume.ptr += volume.offset_first_element_in_bytes + x_shift + y_shift + z_shift;
    return volume;
}
6655*c217d954SCole Faust
6656*c217d954SCole Faust
/** Wrap 3D buffer arguments in a Tensor3D without moving the pointer.
 *
 * The step arguments are accepted but not used.
 *
 * @param[in] ptr                           Base pointer of the buffer (stored as-is)
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes (stored, not applied)
 * @param[in] stride_x                      Stride stored as Tensor3D::stride_x
 * @param[in] step_x                        Unused
 * @param[in] stride_y                      Stride stored as Tensor3D::stride_y
 * @param[in] step_y                        Unused
 * @param[in] stride_z                      Stride stored as Tensor3D::stride_z
 * @param[in] step_z                        Unused
 *
 * @return The populated Tensor3D.
 */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D volume;
    volume.ptr                           = ptr;
    volume.offset_first_element_in_bytes = offset_first_element_in_bytes;
    volume.stride_x                      = stride_x;
    volume.stride_y                      = stride_y;
    volume.stride_z                      = stride_z;
    return volume;
}
6669*c217d954SCole Faust
/** Wrap 4D buffer arguments in a Tensor4D and move its pointer to the current work-item.
 *
 * get_global_id(2) is split with mod_size: the remainder is scaled by step_z and
 * the quotient is scaled by step_w.
 *
 * @param[in] ptr                           Base pointer of the buffer
 * @param[in] offset_first_element_in_bytes Offset of the first element, in bytes
 * @param[in] stride_x                      Stride stored as Tensor4D::stride_x
 * @param[in] step_x                        Amount multiplied by get_global_id(0)
 * @param[in] stride_y                      Stride stored as Tensor4D::stride_y
 * @param[in] step_y                        Amount multiplied by get_global_id(1)
 * @param[in] stride_z                      Stride stored as Tensor4D::stride_z
 * @param[in] step_z                        Amount multiplied by get_global_id(2) % mod_size
 * @param[in] stride_w                      Stride stored as Tensor4D::stride_w
 * @param[in] step_w                        Amount multiplied by get_global_id(2) / mod_size
 * @param[in] mod_size                      Divisor used to split get_global_id(2)
 *
 * @return A Tensor4D whose ptr is advanced by the offset plus the four shifts.
 */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D hyper;
    hyper.ptr                           = ptr;
    hyper.offset_first_element_in_bytes = offset_first_element_in_bytes;
    hyper.stride_x                      = stride_x;
    hyper.stride_y                      = stride_y;
    hyper.stride_z                      = stride_z;
    hyper.stride_w                      = stride_w;

    const size_t x_shift = get_global_id(0) * step_x;
    const size_t y_shift = get_global_id(1) * step_y;
    const size_t z_shift = (get_global_id(2) % mod_size) * step_z;
    const size_t w_shift = (get_global_id(2) / mod_size) * step_w;
    hyper.ptr += hyper.offset_first_element_in_bytes + x_shift + y_shift + z_shift + w_shift;
    return hyper;
}
6687*c217d954SCole Faust
6688*c217d954SCole Faust
/** Address of element x relative to a Vector's current pointer.
 *
 * @param[in] vec Vector descriptor
 * @param[in] x   Index multiplied by vec->stride_x
 *
 * @return vec->ptr advanced by x * vec->stride_x.
 */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    const int x_shift = x * vec->stride_x;
    return vec->ptr + x_shift;
}
6693*c217d954SCole Faust
6694*c217d954SCole Faust
/** Address of element (x, y) relative to an Image's current pointer.
 *
 * @param[in] img Image descriptor
 * @param[in] x   Index multiplied by img->stride_x
 * @param[in] y   Index multiplied by img->stride_y
 *
 * @return img->ptr advanced by x * img->stride_x and then by y * img->stride_y.
 */
inline __global uchar *offset(const Image *img, int x, int y)
{
    // Two separate pointer steps, matching a left-to-right chain of additions.
    __global uchar *column = img->ptr + x * img->stride_x;
    return column + y * img->stride_y;
}
6699*c217d954SCole Faust
6700*c217d954SCole Faust
/** Address of element (x, y, z) relative to a Tensor3D's current pointer.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 *
 * @return tensor->ptr advanced by the three index * stride products, in x, y, z order.
 */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *cursor = tensor->ptr;
    cursor = cursor + x * tensor->stride_x;
    cursor = cursor + y * tensor->stride_y;
    cursor = cursor + z * tensor->stride_z;
    return cursor;
}
6705*c217d954SCole Faust
6706*c217d954SCole Faust
/** Address of element (x, y, z, w) relative to a Tensor4D's current pointer.
 *
 * @param[in] tensor Tensor4D descriptor
 * @param[in] x      Index multiplied by tensor->stride_x
 * @param[in] y      Index multiplied by tensor->stride_y
 * @param[in] z      Index multiplied by tensor->stride_z
 * @param[in] w      Index multiplied by tensor->stride_w
 *
 * @return tensor->ptr advanced by the four index * stride products, in x, y, z, w order.
 */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *cursor = tensor->ptr;
    cursor = cursor + x * tensor->stride_x;
    cursor = cursor + y * tensor->stride_y;
    cursor = cursor + z * tensor->stride_z;
    cursor = cursor + w * tensor->stride_w;
    return cursor;
}
6711*c217d954SCole Faust
6712*c217d954SCole Faust
/** Convert a linear element index into an address inside a Tensor3D.
 *
 * The index is decomposed as z = index / (width * height),
 * y = (index % (width * height)) / width and x = (index % (width * height)) % width.
 * Unlike the *_offset helpers, offset_first_element_in_bytes is added here.
 *
 * @param[in] tensor Tensor3D descriptor
 * @param[in] width  Extent used for the x/y split
 * @param[in] height Extent used, together with width, for the z split
 * @param[in] depth  Unused
 * @param[in] index  Linear index to convert
 *
 * @return tensor->ptr advanced by the x/y/z stride products and by the first-element offset.
 */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    __global const uchar *cursor = tensor->ptr;
    cursor = cursor + x * tensor->stride_x;
    cursor = cursor + y * tensor->stride_y;
    cursor = cursor + z * tensor->stride_z;
    cursor = cursor + tensor->offset_first_element_in_bytes;
    return cursor;
}
6729*c217d954SCole Faust
6730*c217d954SCole Faust#endif
6731*c217d954SCole Faust
6732*c217d954SCole Faust
6733*c217d954SCole Faust
/** REPEAT_3_<k>(P_X, P_A, P_B, P_C) expands P_X##_DEF(ID, P_A, P_B, P_C) once for every
 *  ID from k-1 down to 0, separated by ';'. IDs above 9 are spelled as the hexadecimal
 *  digits A..F. No ';' is emitted after the final (ID 0) expansion.
 */
#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
#define REPEAT_3_2(P_X, P_A, P_B, P_C) P_X##_DEF(1, P_A, P_B, P_C); REPEAT_3_1(P_X, P_A, P_B, P_C)
#define REPEAT_3_3(P_X, P_A, P_B, P_C) P_X##_DEF(2, P_A, P_B, P_C); REPEAT_3_2(P_X, P_A, P_B, P_C)
#define REPEAT_3_4(P_X, P_A, P_B, P_C) P_X##_DEF(3, P_A, P_B, P_C); REPEAT_3_3(P_X, P_A, P_B, P_C)
#define REPEAT_3_5(P_X, P_A, P_B, P_C) P_X##_DEF(4, P_A, P_B, P_C); REPEAT_3_4(P_X, P_A, P_B, P_C)
#define REPEAT_3_6(P_X, P_A, P_B, P_C) P_X##_DEF(5, P_A, P_B, P_C); REPEAT_3_5(P_X, P_A, P_B, P_C)
#define REPEAT_3_7(P_X, P_A, P_B, P_C) P_X##_DEF(6, P_A, P_B, P_C); REPEAT_3_6(P_X, P_A, P_B, P_C)
#define REPEAT_3_8(P_X, P_A, P_B, P_C) P_X##_DEF(7, P_A, P_B, P_C); REPEAT_3_7(P_X, P_A, P_B, P_C)
#define REPEAT_3_9(P_X, P_A, P_B, P_C) P_X##_DEF(8, P_A, P_B, P_C); REPEAT_3_8(P_X, P_A, P_B, P_C)
#define REPEAT_3_10(P_X, P_A, P_B, P_C) P_X##_DEF(9, P_A, P_B, P_C); REPEAT_3_9(P_X, P_A, P_B, P_C)
#define REPEAT_3_11(P_X, P_A, P_B, P_C) P_X##_DEF(A, P_A, P_B, P_C); REPEAT_3_10(P_X, P_A, P_B, P_C)
#define REPEAT_3_12(P_X, P_A, P_B, P_C) P_X##_DEF(B, P_A, P_B, P_C); REPEAT_3_11(P_X, P_A, P_B, P_C)
#define REPEAT_3_13(P_X, P_A, P_B, P_C) P_X##_DEF(C, P_A, P_B, P_C); REPEAT_3_12(P_X, P_A, P_B, P_C)
#define REPEAT_3_14(P_X, P_A, P_B, P_C) P_X##_DEF(D, P_A, P_B, P_C); REPEAT_3_13(P_X, P_A, P_B, P_C)
#define REPEAT_3_15(P_X, P_A, P_B, P_C) P_X##_DEF(E, P_A, P_B, P_C); REPEAT_3_14(P_X, P_A, P_B, P_C)
#define REPEAT_3_16(P_X, P_A, P_B, P_C) P_X##_DEF(F, P_A, P_B, P_C); REPEAT_3_15(P_X, P_A, P_B, P_C)

/** REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) dispatches to REPEAT_3_<P_NUM>.
 *  The extra REPEAT_DEF_3_N level lets P_NUM be macro-expanded before it is pasted.
 */
#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
    REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C)
#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
    REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
6783*c217d954SCole Faust
6784*c217d954SCole Faust
/** REPEAT_4_<k>(P_X, P_A, P_B, P_C, P_D) expands P_X##_DEF(ID, P_A, P_B, P_C, P_D) once for
 *  every ID from k-1 down to 0, separated by ';'. IDs above 9 are spelled as the hexadecimal
 *  digits A..F. No ';' is emitted after the final (ID 0) expansion.
 */
#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(1, P_A, P_B, P_C, P_D); REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(2, P_A, P_B, P_C, P_D); REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(3, P_A, P_B, P_C, P_D); REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(4, P_A, P_B, P_C, P_D); REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(5, P_A, P_B, P_C, P_D); REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(6, P_A, P_B, P_C, P_D); REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(7, P_A, P_B, P_C, P_D); REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(8, P_A, P_B, P_C, P_D); REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(9, P_A, P_B, P_C, P_D); REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(A, P_A, P_B, P_C, P_D); REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(B, P_A, P_B, P_C, P_D); REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(C, P_A, P_B, P_C, P_D); REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(D, P_A, P_B, P_C, P_D); REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(E, P_A, P_B, P_C, P_D); REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(F, P_A, P_B, P_C, P_D); REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)

/** REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) dispatches to REPEAT_4_<P_NUM>.
 *  The extra REPEAT_DEF_4_N level lets P_NUM be macro-expanded before it is pasted.
 */
#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
    REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D)
#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
    REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
6834*c217d954SCole Faust
6835*c217d954SCole Faust
/** Declare N variables VAR0..VAR<N-1> of type TYPE, each initialised to VAL. */
#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) \
    TYPE VAR##ID = VAL
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) \
    REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)


/** Declare N variables VAR_OUT<ID> of type TYPE_OUT, each initialised with
 *  CONVERT(VAR_IN<ID>, TYPE_OUT).
 */
#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \
    TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
    REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)


/** Declare N variables VAR_OUT<ID> of type TYPE_OUT, each initialised with
 *  CONVERT_SAT(VAR_IN<ID>, TYPE_OUT).
 */
#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \
    TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
    REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
6846*c217d954SCole Faust
6847*c217d954SCole Faust
/** For each of the N variables VAR<ID>: VAR<ID> += (TYPE)VAL. */
#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) \
    VAR##ID += (TYPE)VAL
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) \
    REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)


/** For each ID: VAR_A<ID> += VAR_B<ID> * VAL. */
#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) \
    VAR_A##ID += VAR_B##ID * VAL
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \
    REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)


/** For each ID: VAR<ID> += VEC. The TYPE slot is unused, so the repeat wrapper fills it with "". */
#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) \
    VAR##ID += VEC
#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) \
    REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)


/** For each ID: VAR_A<ID> += VAR_B<ID>. The TYPE slot is unused, so the repeat wrapper fills it with "". */
#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) \
    VAR_A##ID += VAR_B##ID
#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) \
    REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
6862*c217d954SCole Faust
6863*c217d954SCole Faust
/** For each ID: VAR<ID> = max(VAR<ID>, (TYPE)VAL). */
#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) \
    VAR##ID = max(VAR##ID, (TYPE)VAL)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) \
    REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)


/** For each ID: VAR<ID> = min(VAR<ID>, (TYPE)VAL). */
#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) \
    VAR##ID = min(VAR##ID, (TYPE)VAL)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) \
    REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
6870*c217d954SCole Faust
6871*c217d954SCole Faust
/** For each ID: VAR<ID> = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR<ID>, RES_MUL, RES_SHIFT, SIZE). */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
    VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
    REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)


/** For each ID: VAR<ID> = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR<ID>, RES_MUL, RES_SHIFT, SIZE). */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
    VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
    REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
6878*c217d954SCole Faust
6879*c217d954SCole Faust
/** Per-channel requantisation of one row variable VAR<ID>.
 *
 * Both the GREATER_THAN_ONE and the LESS_THAN_ONE results are computed, then select()
 * keeps the LESS_THAN_ONE result in the lanes where RES_SHIFT >= 0 and the
 * GREATER_THAN_ONE result elsewhere.
 *
 * The vector width is taken from the SIZE parameter, as in the sibling
 * ASYMM_MULT_BY_QUANT_MULTIPLIER_*_DEF macros, instead of from a global N0 define.
 * Expansion is unchanged when SIZE is N0.
 *
 * The temporaries are pasted as VAR##ID##_shift_* so the row ID is part of their
 * name; they stay local to the statement expression.
 *
 * @param[in]      ID        Row suffix appended to VAR
 * @param[in]      SIZE      Vector width of VAR<ID>
 * @param[in, out] VAR       Basename of the variable to requantise
 * @param[in]      RES_MUL   Multiplier(s) forwarded to the ASYMM helpers
 * @param[in]      RES_SHIFT Shift(s) forwarded to the ASYMM helpers and tested against 0
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                         \
    ({                                                                                                            \
        VEC_DATA_TYPE(int, SIZE)                                                                                  \
        VAR##ID##_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE); \
        VEC_DATA_TYPE(int, SIZE)                                                                                  \
        VAR##ID##_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE);    \
        VAR##ID             = select(VAR##ID##_shift_lt0, VAR##ID##_shift_gt0, RES_SHIFT >= 0);                   \
    })
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
6889*c217d954SCole Faust
6890*c217d954SCole Faust#endif
6891*c217d954SCole Faust
6892*c217d954SCole Faust#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6893*c217d954SCole Faust#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS
6894*c217d954SCole Faust
6895*c217d954SCole Faust
6896*c217d954SCole Faust
6897*c217d954SCole Faust
/** TILE_VECTOR_SIZE<W>: number of scalar slots reserved for a tile row of width W.
 *  Widths 1..4 map to themselves, 5..8 map to 8 and 9..16 map to 16.
 */
#define TILE_VECTOR_SIZE1  1
#define TILE_VECTOR_SIZE2  2
#define TILE_VECTOR_SIZE3  3
#define TILE_VECTOR_SIZE4  4
#define TILE_VECTOR_SIZE5  8
#define TILE_VECTOR_SIZE6  8
#define TILE_VECTOR_SIZE7  8
#define TILE_VECTOR_SIZE8  8
#define TILE_VECTOR_SIZE9  16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16
6914*c217d954SCole Faust
/** TILE_VECTOR_TYPE<W>(DATA_TYPE): type name formed by pasting a width suffix onto
 *  DATA_TYPE. The suffix follows the same mapping as TILE_VECTOR_SIZE<W>:
 *  1..4 keep their width, 5..8 use 8 and 9..16 use 16.
 */
#define TILE_VECTOR_TYPE1(DATA_TYPE)  DATA_TYPE##1
#define TILE_VECTOR_TYPE2(DATA_TYPE)  DATA_TYPE##2
#define TILE_VECTOR_TYPE3(DATA_TYPE)  DATA_TYPE##3
#define TILE_VECTOR_TYPE4(DATA_TYPE)  DATA_TYPE##4
#define TILE_VECTOR_TYPE5(DATA_TYPE)  DATA_TYPE##8
#define TILE_VECTOR_TYPE6(DATA_TYPE)  DATA_TYPE##8
#define TILE_VECTOR_TYPE7(DATA_TYPE)  DATA_TYPE##8
#define TILE_VECTOR_TYPE8(DATA_TYPE)  DATA_TYPE##8
#define TILE_VECTOR_TYPE9(DATA_TYPE)  DATA_TYPE##16
#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16
6931*c217d954SCole Faust
6932*c217d954SCole Faust
/** Declare a tile: an array BASENAME of H unions, one per row.
 *
 * Each union overlays a scalar array `s` with TILE_VECTOR_SIZE<W> elements and a
 * vector `v` of type TILE_VECTOR_TYPE<W>(DATA_TYPE). TILE forwards to TILE_STR so
 * that H and W are macro-expanded before W is pasted.
 */
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME)                    \
    union {                                                    \
        DATA_TYPE                      s[TILE_VECTOR_SIZE##W]; \
        TILE_VECTOR_TYPE##W(DATA_TYPE) v;                      \
    } BASENAME[H]
6939*c217d954SCole Faust
/** Kernel parameter list for a 4D tensor that is also bound as a read-only image:
 *  the image handle comes first, followed by the same parameters as TENSOR4D_BUFFER.
 */
#define TENSOR4D_IMAGE(name)          \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_x,             \
    uint name##_step_x,               \
    uint name##_stride_y,             \
    uint name##_step_y,               \
    uint name##_stride_z,             \
    uint name##_step_z,               \
    uint name##_stride_w,             \
    uint name##_step_w,               \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 4D tensor backed by a buffer: pointer, stride/step
 *  pairs for x, y, z and w, then the first-element offset.
 */
#define TENSOR4D_BUFFER(name)   \
    __global uchar *name##_ptr, \
    uint name##_stride_x,       \
    uint name##_step_x,         \
    uint name##_stride_y,       \
    uint name##_step_y,         \
    uint name##_stride_z,       \
    uint name##_step_z,         \
    uint name##_stride_w,       \
    uint name##_step_w,         \
    uint name##_offset_first_element_in_bytes

/** TENSOR4D(name, type) selects TENSOR4D_<type>(name); the extra TENSOR4D_STR level
 *  lets `type` be macro-expanded before it is pasted.
 */
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
6967*c217d954SCole Faust
/** Kernel parameter list for a 4D tensor that is also bound as a read-only image:
 *  the image handle comes first, followed by the same parameters as TENSOR4D_T_BUFFER.
 */
#define TENSOR4D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_y,             \
    uint name##_stride_z,             \
    uint name##_stride_w,             \
    uint name##_c,                    \
    uint name##_w,                    \
    uint name##_h,                    \
    uint name##_n,                    \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 4D tensor backed by a buffer: pointer, the y/z/w
 *  strides, the four uint parameters _c, _w, _h and _n, then the first-element offset.
 */
#define TENSOR4D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y,       \
    uint name##_stride_z,       \
    uint name##_stride_w,       \
    uint name##_c,              \
    uint name##_w,              \
    uint name##_h,              \
    uint name##_n,              \
    uint name##_offset_first_element_in_bytes

/** Pastes `type` onto TENSOR4D_T_ to pick the IMAGE or BUFFER parameter list. */
#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)


/** TENSOR4D_T(name, type): indirection so `type` is macro-expanded before pasting. */
#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
6995*c217d954SCole Faust
/** Read-only 4D tensor bound as an image: a __read_only image handle followed by
 *  the TENSOR4D_T_BUFFER parameters.
 */
#define TENSOR4D_RO_T_IMAGE(name)     \
    __read_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

/** Read-only 4D tensor backed by a buffer: identical to TENSOR4D_T_BUFFER. */
#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

/** Pastes `type` onto TENSOR4D_RO_T_ to pick the IMAGE or BUFFER parameter list. */
#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name)


/** TENSOR4D_RO_T(name, type): indirection so `type` is macro-expanded before pasting. */
#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type)
7006*c217d954SCole Faust
/** Write-only 4D tensor bound as an image: a __write_only image handle followed by
 *  the TENSOR4D_T_BUFFER parameters.
 */
#define TENSOR4D_WO_T_IMAGE(name)      \
    __write_only image2d_t name##_img, \
    TENSOR4D_T_BUFFER(name)

/** Write-only 4D tensor backed by a buffer: identical to TENSOR4D_T_BUFFER. */
#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name)

/** Pastes `type` onto TENSOR4D_WO_T_ to pick the IMAGE or BUFFER parameter list. */
#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name)


/** TENSOR4D_WO_T(name, type): indirection so `type` is macro-expanded before pasting. */
#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type)
7017*c217d954SCole Faust
/** Kernel parameter list for a 3D tensor that is also bound as a read-only image:
 *  the image handle comes first, followed by the same parameters as TENSOR3D_T_BUFFER.
 */
#define TENSOR3D_T_IMAGE(name)        \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr,       \
    uint name##_stride_y,             \
    uint name##_stride_z,             \
    uint name##_w,                    \
    uint name##_h,                    \
    uint name##_n,                    \
    uint name##_offset_first_element_in_bytes

/** Kernel parameter list for a 3D tensor backed by a buffer: pointer, the y/z
 *  strides, the three uint parameters _w, _h and _n, then the first-element offset.
 */
#define TENSOR3D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y,       \
    uint name##_stride_z,       \
    uint name##_w,              \
    uint name##_h,              \
    uint name##_n,              \
    uint name##_offset_first_element_in_bytes

/** TENSOR3D_T(name, type) selects TENSOR3D_T_<type>(name); the extra TENSOR3D_T_STR
 *  level lets `type` be macro-expanded before it is pasted.
 */
#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)
7039*c217d954SCole Faust
7040*c217d954SCole Faust#if !defined(UNROLL_WITH_PRAGMA)
7041*c217d954SCole Faust#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
7042*c217d954SCole Faust
7043*c217d954SCole Faust#define LOOP_UNROLLING_1(idx, step, macro) (macro)
/**
 * LOOP_UNROLLING_N(idx, step, macro) for N = 2..128.
 * Each level expands to LOOP_UNROLLING_(N-1) followed by one UNROLL_INCR, so the full
 * expansion evaluates `macro` N times and adds `step` to `idx` between consecutive
 * evaluations (N-1 increments in total). 128 is the largest trip count available.
 */
7044*c217d954SCole Faust#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
7045*c217d954SCole Faust#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
7046*c217d954SCole Faust#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
7047*c217d954SCole Faust#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
7048*c217d954SCole Faust#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
7049*c217d954SCole Faust#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
7050*c217d954SCole Faust#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
7051*c217d954SCole Faust#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
7052*c217d954SCole Faust#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
7053*c217d954SCole Faust#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
7054*c217d954SCole Faust#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
7055*c217d954SCole Faust#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
7056*c217d954SCole Faust#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
7057*c217d954SCole Faust#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
7058*c217d954SCole Faust#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
7059*c217d954SCole Faust#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
7060*c217d954SCole Faust#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
7061*c217d954SCole Faust#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
7062*c217d954SCole Faust#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
7063*c217d954SCole Faust#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
7064*c217d954SCole Faust#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
7065*c217d954SCole Faust#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
7066*c217d954SCole Faust#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
7067*c217d954SCole Faust#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
7068*c217d954SCole Faust#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
7069*c217d954SCole Faust#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
7070*c217d954SCole Faust#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
7071*c217d954SCole Faust#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
7072*c217d954SCole Faust#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
7073*c217d954SCole Faust#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
7074*c217d954SCole Faust#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
7075*c217d954SCole Faust#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
7076*c217d954SCole Faust#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
7077*c217d954SCole Faust#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
7078*c217d954SCole Faust#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
7079*c217d954SCole Faust#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
7080*c217d954SCole Faust#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
7081*c217d954SCole Faust#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
7082*c217d954SCole Faust#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
7083*c217d954SCole Faust#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
7084*c217d954SCole Faust#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
7085*c217d954SCole Faust#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
7086*c217d954SCole Faust#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
7087*c217d954SCole Faust#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
7088*c217d954SCole Faust#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
7089*c217d954SCole Faust#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
7090*c217d954SCole Faust#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
7091*c217d954SCole Faust#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
7092*c217d954SCole Faust#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
7093*c217d954SCole Faust#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
7094*c217d954SCole Faust#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
7095*c217d954SCole Faust#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
7096*c217d954SCole Faust#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
7097*c217d954SCole Faust#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
7098*c217d954SCole Faust#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
7099*c217d954SCole Faust#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
7100*c217d954SCole Faust#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
7101*c217d954SCole Faust#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
7102*c217d954SCole Faust#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
7103*c217d954SCole Faust#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
7104*c217d954SCole Faust#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
7105*c217d954SCole Faust#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
7106*c217d954SCole Faust#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
7107*c217d954SCole Faust#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
7108*c217d954SCole Faust#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
7109*c217d954SCole Faust#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
7110*c217d954SCole Faust#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
7111*c217d954SCole Faust#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
7112*c217d954SCole Faust#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
7113*c217d954SCole Faust#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
7114*c217d954SCole Faust#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
7115*c217d954SCole Faust#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
7116*c217d954SCole Faust#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
7117*c217d954SCole Faust#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
7118*c217d954SCole Faust#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
7119*c217d954SCole Faust#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
7120*c217d954SCole Faust#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
7121*c217d954SCole Faust#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
7122*c217d954SCole Faust#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
7123*c217d954SCole Faust#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
7124*c217d954SCole Faust#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
7125*c217d954SCole Faust#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
7126*c217d954SCole Faust#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
7127*c217d954SCole Faust#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
7128*c217d954SCole Faust#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
7129*c217d954SCole Faust#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
7130*c217d954SCole Faust#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
7131*c217d954SCole Faust#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
7132*c217d954SCole Faust#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
7133*c217d954SCole Faust#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
7134*c217d954SCole Faust#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
7135*c217d954SCole Faust#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
7136*c217d954SCole Faust#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
7137*c217d954SCole Faust#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
7138*c217d954SCole Faust#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
7139*c217d954SCole Faust#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
7140*c217d954SCole Faust#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
7141*c217d954SCole Faust#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
7142*c217d954SCole Faust#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
7143*c217d954SCole Faust#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
7144*c217d954SCole Faust#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
7145*c217d954SCole Faust#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
7146*c217d954SCole Faust#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
7147*c217d954SCole Faust#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
7148*c217d954SCole Faust#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
7149*c217d954SCole Faust#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
7150*c217d954SCole Faust#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
7151*c217d954SCole Faust#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
7152*c217d954SCole Faust#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
7153*c217d954SCole Faust#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
7154*c217d954SCole Faust#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
7155*c217d954SCole Faust#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
7156*c217d954SCole Faust#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
7157*c217d954SCole Faust#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
7158*c217d954SCole Faust#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
7159*c217d954SCole Faust#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
7160*c217d954SCole Faust#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
7161*c217d954SCole Faust#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
7162*c217d954SCole Faust#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
7163*c217d954SCole Faust#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
7164*c217d954SCole Faust#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
7165*c217d954SCole Faust#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
7166*c217d954SCole Faust#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
7167*c217d954SCole Faust#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
7168*c217d954SCole Faust#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
7169*c217d954SCole Faust#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
7170*c217d954SCole Faust#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)
7171*c217d954SCole Faust
7171*c217d954SCole Faust
/**
 * LOOP_UNROLLING_STR, hand-unrolled variant (selected when UNROLL_WITH_PRAGMA is not defined).
 * Opens a new block, declares `type idx = start`, and expands LOOP_UNROLLING_<num>, which
 * evaluates `macro` exactly `num` times with `idx` advanced by `step` between evaluations.
 * `num` is token-pasted here, so by this point it must already be a literal for which a
 * LOOP_UNROLLING_<num> macro exists.
 */
7172*c217d954SCole Faust#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
7173*c217d954SCole Faust    {                                                          \
7174*c217d954SCole Faust        type idx = start;                                      \
7175*c217d954SCole Faust        LOOP_UNROLLING_##num(idx, step, macro);                \
7176*c217d954SCole Faust    }
7177*c217d954SCole Faust#else
/**
 * LOOP_UNROLLING_STR, compiler-unrolled variant (selected when UNROLL_WITH_PRAGMA is defined).
 *
 * Emits a `for` loop preceded by _Pragma("unroll") that evaluates `macro` once per
 * iteration, with `idx` declared as `type`, starting at `start` and advanced by `step`.
 *
 * The upper bound is `start + num * step`, so the loop performs `num` iterations for any
 * `start` (assuming a positive `step`), matching the hand-unrolled variant. The previous
 * bound `num * step` ignored `start` and therefore ran too few (or zero) iterations
 * whenever `start` was non-zero. All arguments are parenthesised so that expression
 * arguments such as `M0 + 1` expand with the intended precedence.
 *
 * @param type   Type of the induction variable.
 * @param idx    Name of the induction variable (visible inside `macro`).
 * @param start  Initial value of `idx`.
 * @param step   Increment applied after every iteration; expected to be positive.
 * @param num    Number of iterations.
 * @param macro  Code evaluated on every iteration.
 */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro)                   \
    {                                                                            \
        _Pragma("unroll")                                                        \
        for(type idx = (start); idx < (start) + (num) * (step); idx += (step))   \
        {                                                                        \
            (macro);                                                             \
        }                                                                        \
    }
7186*c217d954SCole Faust#endif
/**
 * Public entry point of the loop unroller. Forwards every argument to LOOP_UNROLLING_STR;
 * the indirection lets macro arguments (for example a -D defined trip count passed as `num`)
 * be expanded before LOOP_UNROLLING_STR uses them.
 */
7187*c217d954SCole Faust#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)
7188*c217d954SCole Faust
7189*c217d954SCole Faust
/**
 * Computes max(0, get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0) as an int.
 * NOTE(review): this looks like the starting element index of a work-item that processes
 * N0 elements, shifted back by the shortfall of a partial block and clamped at zero -
 * confirm against the kernels that use it.
 *
 * N0 and PARTIAL_N0 are parenthesised so that expression arguments (e.g. `2 + 2`) keep the
 * intended precedence; for plain constants the expansion is unchanged.
 *
 * @param IDX         Dimension passed to get_global_id().
 * @param N0          Multiplier applied to the global id; also the modulus.
 * @param PARTIAL_N0  Value subtracted from N0 before taking the remainder.
 */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))
7191*c217d954SCole Faust
7192*c217d954SCole Faust
/**
 * DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) dispatches to
 * DOT_PRODUCT<K0>_INTEGER8 by token pasting; the _STR level lets K0 be macro-expanded first.
 * Each DOT_PRODUCT<K0>_INTEGER8 accumulates into `c`, which must therefore be an lvalue.
 *
 * DOT_PRODUCT1_INTEGER8 is the scalar case: c += (C_DATA_TYPE)a * (C_DATA_TYPE)b.
 */
7193*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
7194*c217d954SCole Faust#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
7195*c217d954SCole Faust#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7196*c217d954SCole Faust    ({                                                \
7197*c217d954SCole Faust        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b);     \
7198*c217d954SCole Faust    })
/**
 * 2-, 3- and 4-element dot products accumulated into `c`, with four alternative
 * implementations chosen at preprocessing time:
 *   1. dot()          when ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_khr_integer_dot_product are defined;
 *   2. arm_dot_acc()  when ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED and cl_arm_integer_dot_product_accumulate_int8 are defined
 *                     (the accumulator is passed in and the result assigned back to `c`);
 *   3. arm_dot()      when ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_arm_integer_dot_product_int8 are defined;
 *   4. otherwise a scalar fallback that multiplies component by component after casting to C_DATA_TYPE.
 * In variants 1-3 the 2- and 3-element forms build 4-element operands by appending zeros, so the
 * padding lanes contribute nothing to the sum. Those variants also end in a `;` of their own.
 */
7199*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
7200*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
7201*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
7202*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
7203*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
7204*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
7205*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
7206*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
7207*c217d954SCole Faust#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
7208*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
7209*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
7210*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
7211*c217d954SCole Faust#else
7212*c217d954SCole Faust#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
7213*c217d954SCole Faust    ({                                                  \
7214*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
7215*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
7216*c217d954SCole Faust    })
7217*c217d954SCole Faust#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)   \
7218*c217d954SCole Faust    ({                                                  \
7219*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c);  \
7220*c217d954SCole Faust        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
7221*c217d954SCole Faust    })
7222*c217d954SCole Faust#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val)   \
7223*c217d954SCole Faust    ({                                                    \
7224*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
7225*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
7226*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
7227*c217d954SCole Faust        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
7228*c217d954SCole Faust    })
7229*c217d954SCole Faust#endif
/**
 * Dot products of 5 to 12 elements, each built from two smaller ones that accumulate into
 * the same `c`:
 *   5..7  = 4-element product on components 0-3 plus a 1/2/3-element product on the rest;
 *   8     = two 4-element products on the .lo and .hi halves;
 *   9..12 = 8-element product on components 0-7 plus a 1/2/3/4-element product on the rest.
 * The operands must be vectors wide enough for the components that are selected
 * (e.g. component 4 for the 5-element form, component B for the 12-element form).
 */
7230*c217d954SCole Faust#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7231*c217d954SCole Faust    ({                                                \
7232*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
7233*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c);     \
7234*c217d954SCole Faust    })
7235*c217d954SCole Faust#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7236*c217d954SCole Faust    ({                                                \
7237*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
7238*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c);     \
7239*c217d954SCole Faust    })
7240*c217d954SCole Faust#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7241*c217d954SCole Faust    ({                                                \
7242*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c);     \
7243*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c);     \
7244*c217d954SCole Faust    })
7245*c217d954SCole Faust#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7246*c217d954SCole Faust    ({                                                \
7247*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);     \
7248*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);     \
7249*c217d954SCole Faust    })
7250*c217d954SCole Faust#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7251*c217d954SCole Faust    ({                                                \
7252*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
7253*c217d954SCole Faust        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c);     \
7254*c217d954SCole Faust    })
7255*c217d954SCole Faust#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7256*c217d954SCole Faust    ({                                                \
7257*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
7258*c217d954SCole Faust        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c);     \
7259*c217d954SCole Faust    })
7260*c217d954SCole Faust#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7261*c217d954SCole Faust    ({                                                \
7262*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
7263*c217d954SCole Faust        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c);     \
7264*c217d954SCole Faust    })
7265*c217d954SCole Faust#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7266*c217d954SCole Faust    ({                                                \
7267*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
7268*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
7269*c217d954SCole Faust    })
/**
 * Dot products of 13, 14 and 15 elements accumulated into `c`.
 *
 * Each is split as 8 + 4 + {1, 2, 3}: an 8-element product on components 0-7, a 4-element
 * product on components 8-B, and a 1/2/3-element product on the remaining components.
 * OpenCL C only allows component selections that yield 1, 2, 3, 4, 8 or 16 elements, so the
 * tail cannot be taken as a single 5-, 6- or 7-component swizzle (.s89ABC, .s89ABCD,
 * .s89ABCDE); doing so is rejected by the compiler as soon as the macro is expanded.
 * The sum computed is the same as the intended 8 + {5, 6, 7} split.
 *
 * @param A_DATA_TYPE  Element type of `a`.
 * @param B_DATA_TYPE  Element type of `b`.
 * @param C_DATA_TYPE  Accumulator type.
 * @param a, b         16-element vector operands.
 * @param c            Accumulator lvalue, updated in place.
 */
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sC), ((b).sC), c);     \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCD), ((b).sCD), c);     \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({                                                \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c);     \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c);     \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).sCDE), ((b).sCDE), c);     \
    })
/**
 * 16-element dot product accumulated into `c`: two 8-element products, one on the .lo
 * halves of `a` and `b` and one on the .hi halves.
 */
7285*c217d954SCole Faust#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
7286*c217d954SCole Faust    ({                                                 \
7287*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c);      \
7288*c217d954SCole Faust        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c);      \
7289*c217d954SCole Faust    })
7290*c217d954SCole Faust
7291*c217d954SCole Faust
/**
 * REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) calls DOT_PRODUCT_INTEGER8
 * with `a` as the first operand and the constant 1 cast to TILE_VECTOR_TYPE<K0>(B_DATA_TYPE)
 * as the second, accumulating into `c`. With an all-ones second operand this adds the K0
 * elements of `a` to `c`.
 * NOTE(review): TILE_VECTOR_TYPE<K0> is defined elsewhere; presumably it names the K0-wide
 * vector type of B_DATA_TYPE so that the cast broadcasts 1 to every lane - confirm.
 */
7292*c217d954SCole Faust#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
7293*c217d954SCole Faust#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)
7294*c217d954SCole Faust
7295*c217d954SCole Faust
/**
 * V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) evaluates to a WIDTH-element
 * vector read from TENSOR, dispatching on TENSOR_TYPE to V_LOAD_BUFFER or V_LOAD_IMAGE
 * (the _STR level lets TENSOR_TYPE be macro-expanded before pasting).
 *
 * V_LOAD_BUFFER reads with VLOAD(WIDTH) at offset 0 from the address
 *   <TENSOR>_ptr + <TENSOR>_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y,
 * i.e. X is scaled by the element size while Y is scaled by STRIDE_Y.
 * NOTE(review): this arithmetic assumes <TENSOR>_ptr is a byte pointer - confirm.
 *
 * V_LOAD_IMAGE reads through READ_IMAGE2D from <TENSOR>_img at coordinates (X / 4, Y);
 * STRIDE_Y is ignored in this variant.
 */
7296*c217d954SCole Faust#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
7297*c217d954SCole Faust#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
7298*c217d954SCole Faust#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
7299*c217d954SCole Faust    VLOAD(WIDTH)                                                \
7300*c217d954SCole Faust    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
7301*c217d954SCole Faust#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
7302*c217d954SCole Faust
7303*c217d954SCole Faust
/**
 * V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) writes the
 * WIDTH-element vector VALUES to TENSOR, dispatching on TENSOR_TYPE to V_STORE_BUFFER or
 * V_STORE_IMAGE (the _STR level lets TENSOR_TYPE be macro-expanded before pasting).
 *
 * V_STORE_BUFFER writes with VSTORE(WIDTH) at offset 0 to the address
 *   <TENSOR>_ptr + <TENSOR>_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + Y * STRIDE_Y.
 * NOTE(review): this arithmetic assumes <TENSOR>_ptr is a byte pointer - confirm.
 *
 * V_STORE_IMAGE writes through WRITE_IMAGE2D to <TENSOR>_img at coordinates (X / 4, Y);
 * STRIDE_Y is ignored in this variant.
 */
7304*c217d954SCole Faust#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
7305*c217d954SCole Faust#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
7306*c217d954SCole Faust#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
7307*c217d954SCole Faust    VSTORE(WIDTH)                                                \
7308*c217d954SCole Faust    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
7309*c217d954SCole Faust#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)
7310*c217d954SCole Faust
7311*c217d954SCole Faust
/**
 * Loads HEIGHT rows of WIDTH elements from TENSOR into the tile `dst`.
 * For _i = 0 .. HEIGHT-1 (fully unrolled through LOOP_UNROLLING), row _i is read with
 * V_LOAD at column X and row Y + _i * (int)YI_MULTIPLIER and stored in dst[_i].v.
 * HEIGHT is used as the unroll count, so it must be a value LOOP_UNROLLING accepts.
 */
7312*c217d954SCole Faust#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst)                      \
7313*c217d954SCole Faust    ({                                                                                                                 \
7314*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                                          \
7315*c217d954SCole Faust        {                                                                                                              \
7316*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
7317*c217d954SCole Faust        })                                                                                                             \
7318*c217d954SCole Faust    })
7319*c217d954SCole Faust
7320*c217d954SCole Faust
/**
 * Loads HEIGHT rows of WIDTH elements from TENSOR into the tile `dst`, taking the row
 * coordinate of each load from a second tile instead of computing it.
 * For _i = 0 .. HEIGHT-1 (fully unrolled through LOOP_UNROLLING), row _i is read with
 * V_LOAD at column X and row indirect_y[_i].v and stored in dst[_i].v.
 */
7321*c217d954SCole Faust#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst)    \
7322*c217d954SCole Faust    ({                                                                                                  \
7323*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT,                                                           \
7324*c217d954SCole Faust        {                                                                                               \
7325*c217d954SCole Faust            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
7326*c217d954SCole Faust        })                                                                                              \
7327*c217d954SCole Faust    })
7328*c217d954SCole Faust
7329*c217d954SCole Faust
/** Indirect tile load that switches between a partial-width and a full-width row load.
 *
 * Rows are visited from the last tile row to the first (row = HEIGHT - 1 - i).
 * - If WIDTH1_CONDITION is true, each row is read with VLOAD_PARTIAL(WIDTH0, WIDTH1) from
 *   TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + indirect_y[row].v * STRIDE_Y
 * - Otherwise each row is read with V_LOAD(DATA_TYPE, WIDTH0, ...).
 *
 * @param[in]  DATA_TYPE        Element type
 * @param[in]  HEIGHT           Number of tile rows
 * @param[in]  WIDTH0           Full vector width of a row
 * @param[in]  WIDTH1           Width argument passed to VLOAD_PARTIAL in the partial branch
 * @param[in]  TENSOR_TYPE      Tensor storage selector (only forwarded to V_LOAD)
 * @param[in]  TENSOR           Tensor base name; TENSOR##_ptr and TENSOR##_offset_first_element_in_bytes must exist
 * @param[in]  X                Starting X position in elements
 * @param[in]  STRIDE_Y         Row stride in bytes
 * @param[in]  WIDTH1_CONDITION Expression selecting the partial branch when true
 * @param[out] dst              Destination tile
 * @param[in]  indirect_y       Tile whose element [row].v is the source row for tile row `row`
 */
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                /* Rows are processed in reverse order */ \
                const int _row = (HEIGHT) - 1 - _i; \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[_row].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[_row].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                const int _row = (HEIGHT) - 1 - _i; \
                dst[_row].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[_row].v), STRIDE_Y); \
            }) \
        } \
    })
7348*c217d954SCole Faust
/** Load a TILE_HEIGHT x TILE_WIDTH spatial patch, skipping positions outside the tensor.
 *
 * For every (_yk, _xk) in the patch the source coordinate is (X + _xk, Y + _yk). When that coordinate lies
 * inside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT), the macro performs
 *   dst[_xk + _yk * TILE_WIDTH].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, row, STRIDE_Y)
 * with row = x + y * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * Out-of-range positions leave the corresponding dst entry untouched.
 *
 * @param[in]     DATA_TYPE     Element type, forwarded to V_LOAD
 * @param[in]     TILE_HEIGHT   Patch height
 * @param[in]     TILE_WIDTH    Patch width
 * @param[in]     TILE_CHANNELS Vector width of each load, forwarded to V_LOAD
 * @param[in]     TENSOR_TYPE   Tensor storage selector, forwarded to V_LOAD
 * @param[in]     TENSOR        Tensor base name, forwarded to V_LOAD
 * @param[in]     B             Batch index
 * @param[in]     Y             Starting Y coordinate (may be negative)
 * @param[in]     X             Starting X coordinate (may be negative)
 * @param[in]     C             Starting channel, forwarded to V_LOAD
 * @param[in]     TENSOR_WIDTH  Tensor width
 * @param[in]     TENSOR_HEIGHT Tensor height
 * @param[in]     STRIDE_Y      Row stride, forwarded to V_LOAD
 * @param[in,out] dst           Destination tile with TILE_HEIGHT * TILE_WIDTH rows
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                /* Evaluate each coordinate once and reuse it for both the bounds test and the row index */ \
                const int _coord_x = (X) + _xk; \
                const int _coord_y = (Y) + _yk; \
                if(_coord_x >= 0 && _coord_x < (int)(TENSOR_WIDTH) && _coord_y >= 0 && _coord_y < (int)(TENSOR_HEIGHT)) \
                { \
                    const int _src_y = _coord_x + _coord_y * (int)(TENSOR_WIDTH) + (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })
7365*c217d954SCole Faust
7366*c217d954SCole Faust
/** Load a dilated TILE_HEIGHT x TILE_WIDTH spatial patch with an optional bounds check.
 *
 * For every (_yk, _xk) the source coordinate is (X + _xk * DILATION_X, Y + _yk * DILATION_Y) in batch B.
 * The vector is read with VLOAD(TILE_CHANNELS) from
 *   TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + C * sizeof(DATA_TYPE)
 *     + x * TENSOR##_stride_y + y * TENSOR##_stride_z + B * TENSOR##_stride_w
 * and stored in dst[_xk + _yk * TILE_WIDTH].v.
 *
 * When BOUNDARY_CHECK is false every position is loaded unconditionally. When it is true, positions outside
 * [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are skipped and the matching dst entry is left untouched.
 *
 * @param[in]     DATA_TYPE      Element type
 * @param[in]     TILE_HEIGHT    Patch height
 * @param[in]     TILE_WIDTH     Patch width
 * @param[in]     TILE_CHANNELS  Vector width of each load
 * @param[in]     TENSOR_TYPE    Not used by this macro
 * @param[in]     TENSOR         Tensor base name; the _ptr, _offset_first_element_in_bytes, _stride_y, _stride_z
 *                               and _stride_w identifiers derived from it must exist
 * @param[in]     B              Batch index
 * @param[in]     Y              Starting Y coordinate
 * @param[in]     X              Starting X coordinate
 * @param[in]     C              Starting channel in elements
 * @param[in]     TENSOR_WIDTH   Tensor width
 * @param[in]     TENSOR_HEIGHT  Tensor height
 * @param[in]     DILATION_X     Step between consecutive patch columns
 * @param[in]     DILATION_Y     Step between consecutive patch rows
 * @param[in]     BOUNDARY_CHECK Enables the bounds test when true
 * @param[in,out] dst            Destination tile with TILE_HEIGHT * TILE_WIDTH rows
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                /* _src_y / _src_z / _src_w are the indices multiplied by stride_y / stride_z / stride_w */ \
                const int _src_y = (X) + _xk * (DILATION_X); \
                const int _src_z = (Y) + _yk * (DILATION_Y); \
                const int _src_w = (B); \
                const bool _src_valid_y = (_src_y >= 0) && (_src_y < (int)(TENSOR_WIDTH)) && (_src_z >= 0) && (_src_z < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK) || _src_valid_y) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
            }) \
        }) \
    })
7393*c217d954SCole Faust
7394*c217d954SCole Faust
/** Load TILE_AREA vectors whose spatial offsets come from two indirection tiles.
 *
 * For every i in [0, TILE_AREA) the source coordinate is (X + xi[i].v, Y + yi[i].v). When it lies inside
 * [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT), the macro performs
 *   dst[i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, row, STRIDE_Y)
 * with row = x + y * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * Out-of-range positions leave dst[i] untouched.
 *
 * @param[in]     DATA_TYPE     Element type, forwarded to V_LOAD
 * @param[in]     TILE_AREA     Number of positions to load
 * @param[in]     TILE_CHANNELS Vector width of each load, forwarded to V_LOAD
 * @param[in]     TENSOR_TYPE   Tensor storage selector, forwarded to V_LOAD
 * @param[in]     TENSOR        Tensor base name, forwarded to V_LOAD
 * @param[in]     B             Batch index
 * @param[in]     Y             Base Y coordinate
 * @param[in]     X             Base X coordinate
 * @param[in]     C             Starting channel, forwarded to V_LOAD
 * @param[in]     TENSOR_WIDTH  Tensor width
 * @param[in]     TENSOR_HEIGHT Tensor height
 * @param[in]     STRIDE_Y      Row stride, forwarded to V_LOAD
 * @param[in]     xi            Tile of X offsets, one per position
 * @param[in]     yi            Tile of Y offsets, one per position
 * @param[in,out] dst           Destination tile with TILE_AREA rows
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            /* Evaluate each coordinate once and reuse it for both the bounds test and the row index */ \
            const int _coord_x = (X) + xi[_i].v; \
            const int _coord_y = (Y) + yi[_i].v; \
            if(_coord_x >= 0 && _coord_x < (int)(TENSOR_WIDTH) && _coord_y >= 0 && _coord_y < (int)(TENSOR_HEIGHT)) \
            { \
                const int _src_y = _coord_x + _coord_y * (int)(TENSOR_WIDTH) + (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })
7408*c217d954SCole Faust
7409*c217d954SCole Faust
/** Indirect 2D tile load, dispatched on TENSOR_TYPE.
 *
 * T_LOAD2D_INDIRECT forwards all of its arguments to T_LOAD2D_INDIRECT_STR, which pastes TENSOR_TYPE onto
 * "T_LOAD2D_INDIRECT_" to select the implementation (e.g. T_LOAD2D_INDIRECT_BUFFER or T_LOAD2D_INDIRECT_IMAGE).
 * The extra macro level lets TENSOR_TYPE be macro-expanded before the token paste.
 */
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
/** T_LOAD2D_INDIRECT implementation selected when TENSOR_TYPE is BUFFER.
 *
 * For every k in [0, TILE_AREA) the row index is yi[0].s[k]. When that index is non-negative the macro performs
 *   dst[k].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[k], STRIDE_Y)
 * A negative index leaves dst[k] untouched.
 *
 * @param[in]     DATA_TYPE     Element type, forwarded to V_LOAD
 * @param[in]     TILE_AREA     Number of rows to load
 * @param[in]     TILE_CHANNELS Vector width of each load, forwarded to V_LOAD
 * @param[in]     TENSOR_TYPE   Tensor storage selector, forwarded to V_LOAD
 * @param[in]     TENSOR        Tensor base name, forwarded to V_LOAD
 * @param[in]     C             Starting channel, forwarded to V_LOAD
 * @param[in]     STRIDE_Y      Row stride, forwarded to V_LOAD
 * @param[in]     yi            Tile whose first row (yi[0].s[]) holds the TILE_AREA row indices
 * @param[in,out] dst           Destination tile with TILE_AREA rows
 */
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({                                                                                                       \
        LOOP_UNROLLING(int, _k, 0, 1, TILE_AREA,                                                             \
        {                                                                                                    \
            if(yi[0].s[_k] >= 0)                                                                             \
            {                                                                                                \
                dst[_k].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_k], STRIDE_Y); \
            }                                                                                                \
        })                                                                                                   \
    })
7422*c217d954SCole Faust
/** T_LOAD2D_INDIRECT implementation selected when TENSOR_TYPE is IMAGE.
 *
 * For every k in [0, TILE_AREA) the macro performs
 *   dst[k].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[k], STRIDE_Y)
 * No test is applied to the row index in this variant; every row is loaded.
 *
 * @param[in]  DATA_TYPE     Element type, forwarded to V_LOAD
 * @param[in]  TILE_AREA     Number of rows to load
 * @param[in]  TILE_CHANNELS Vector width of each load, forwarded to V_LOAD
 * @param[in]  TENSOR_TYPE   Tensor storage selector, forwarded to V_LOAD
 * @param[in]  TENSOR        Tensor base name, forwarded to V_LOAD
 * @param[in]  C             Starting channel, forwarded to V_LOAD
 * @param[in]  STRIDE_Y      Row stride, forwarded to V_LOAD
 * @param[in]  yi            Tile whose first row (yi[0].s[]) holds the TILE_AREA row indices
 * @param[out] dst           Destination tile with TILE_AREA rows
 */
#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({                                                                                                   \
        LOOP_UNROLLING(int, _k, 0, 1, TILE_AREA,                                                         \
        {                                                                                                \
            dst[_k].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_k], STRIDE_Y); \
        })                                                                                               \
    })
7430*c217d954SCole Faust
7431*c217d954SCole Faust
/** Load TILE_AREA vectors whose 3D spatial offsets come from three indirection tiles.
 *
 * For every i in [0, TILE_AREA) the source coordinate is (X + xi[i].v, Y + yi[i].v, Z + zi[i].v). When it lies
 * inside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) x [0, TENSOR_DEPTH), the macro performs
 *   dst[i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, row, STRIDE_Y)
 * with row = x + y * TENSOR_WIDTH + z * TENSOR_WIDTH * TENSOR_HEIGHT
 *            + B * TENSOR_WIDTH * TENSOR_HEIGHT * TENSOR_DEPTH.
 * Out-of-range positions leave dst[i] untouched.
 *
 * @param[in]     DATA_TYPE     Element type, forwarded to V_LOAD
 * @param[in]     TILE_AREA     Number of positions to load
 * @param[in]     TILE_CHANNELS Vector width of each load, forwarded to V_LOAD
 * @param[in]     TENSOR_TYPE   Tensor storage selector, forwarded to V_LOAD
 * @param[in]     TENSOR        Tensor base name, forwarded to V_LOAD
 * @param[in]     B             Batch index
 * @param[in]     Z             Base Z coordinate
 * @param[in]     Y             Base Y coordinate
 * @param[in]     X             Base X coordinate
 * @param[in]     C             Starting channel, forwarded to V_LOAD
 * @param[in]     TENSOR_WIDTH  Tensor width
 * @param[in]     TENSOR_HEIGHT Tensor height
 * @param[in]     TENSOR_DEPTH  Tensor depth
 * @param[in]     STRIDE_Y      Row stride, forwarded to V_LOAD
 * @param[in]     xi            Tile of X offsets, one per position
 * @param[in]     yi            Tile of Y offsets, one per position
 * @param[in]     zi            Tile of Z offsets, one per position
 * @param[in,out] dst           Destination tile with TILE_AREA rows
 */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            /* Evaluate each coordinate once and reuse it for both the bounds test and the row index */ \
            const int _coord_x = (X) + xi[_i].v; \
            const int _coord_y = (Y) + yi[_i].v; \
            const int _coord_z = (Z) + zi[_i].v; \
            if(_coord_x >= 0 && _coord_x < (int)(TENSOR_WIDTH) && _coord_y >= 0 && _coord_y < (int)(TENSOR_HEIGHT) \
               && _coord_z >= 0 && _coord_z < (int)(TENSOR_DEPTH)) \
            { \
                /* Each dimension argument is parenthesised on its own so compound expressions keep their value */ \
                const int _src_y = _coord_x + _coord_y * (int)(TENSOR_WIDTH) + _coord_z * ((int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT)) \
                                   + (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })
7446*c217d954SCole Faust
7447*c217d954SCole Faust
/** Indirect tile store that switches between a partial-width and a full-width row store.
 *
 * Rows are visited from the last tile row to the first (row = HEIGHT - 1 - i). Each row is converted with
 * CONVERT(src[row].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)) and written to
 *   TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + X * sizeof(DATA_TYPE) + indirect_y[row].v * STRIDE_Y
 * using VSTORE_PARTIAL(WIDTH0, WIDTH1) when WIDTH1_CONDITION is true and VSTORE(WIDTH0) otherwise.
 *
 * @param[in] DATA_TYPE        Element type of the destination
 * @param[in] HEIGHT           Number of tile rows
 * @param[in] WIDTH0           Full vector width of a row
 * @param[in] WIDTH1           Width argument passed to VSTORE_PARTIAL in the partial branch
 * @param[in] TENSOR_TYPE      Not used by this macro
 * @param[in] TENSOR           Tensor base name; TENSOR##_ptr and TENSOR##_offset_first_element_in_bytes must exist
 * @param[in] X                Starting X position in elements
 * @param[in] STRIDE_Y         Row stride in bytes
 * @param[in] WIDTH1_CONDITION Expression selecting the partial branch when true
 * @param[in] src              Source tile
 * @param[in] indirect_y       Tile whose element [row].v is the destination row for tile row `row`
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                /* Rows are processed in reverse order */ \
                const int _row = (HEIGHT) - 1 - _i; \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[_row].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[_row].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                const int _row = (HEIGHT) - 1 - _i; \
                VSTORE(WIDTH0) \
                (CONVERT(src[_row].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[_row].v) * (STRIDE_Y))); \
            }) \
        } \
    })
7467*c217d954SCole Faust
7468*c217d954SCole Faust
/** Add the quantization offset-correction terms to an accumulator tile.
 *
 * For every row m in [0, M0) and column n in [0, N0):
 *   dst[m].s[n] += WEI_OFFSET * sum_k(lhs[m].s[k]) + SRC_OFFSET * sum_k(rhs[n].s[k]),   k in [0, K0)
 * All products are computed after casting both operands to ACC_DATA_TYPE.
 *
 * @param[in]     ACC_DATA_TYPE Accumulator type used for the products and sums
 * @param[in]     M0            Number of rows of lhs and dst
 * @param[in]     N0            Number of rows of rhs and number of columns of dst
 * @param[in]     K0            Number of columns of lhs and rhs
 * @param[in]     SRC_OFFSET    Offset multiplied with every rhs element
 * @param[in]     WEI_OFFSET    Offset multiplied with every lhs element
 * @param[in]     lhs           Left-hand-side tile (M0 x K0)
 * @param[in]     rhs           Right-hand-side tile (N0 x K0)
 * @param[in,out] dst           Accumulator tile (M0 x N0)
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            /* Row term: identical for every column of this row, so it is accumulated once */ \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })
7488*c217d954SCole Faust
7489*c217d954SCole Faust
/** 8-bit requantization of a tile, dispatched on QUANTIZATION_TYPE.
 *
 * T_QUANTIZE8 forwards all of its arguments to T_QUANTIZE8_STR, which pastes QUANTIZATION_TYPE onto
 * "T_QUANTIZE8_" to select the implementation and drops QUANTIZATION_TYPE from the forwarded list.
 * The selected macro therefore receives eleven arguments, which matches T_QUANTIZE8_PER_TENSOR and
 * T_QUANTIZE8_PER_CHANNEL. The extra macro level lets QUANTIZATION_TYPE be macro-expanded before the token paste.
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
7492*c217d954SCole Faust
7493*c217d954SCole Faust
/** Requantize every element of a tile with a single multiplier / shift / offset.
 *
 * Per element, in order:
 *  1. If DST_SHIFT is negative, the input is multiplied by 2^(-DST_SHIFT).
 *  2. The value is multiplied by DST_MULTIPLIER in 64 bits, a rounding nudge (+2^30 for non-negative products,
 *     1 - 2^30 otherwise) is added and the sum is divided by 2^31. If the input and the multiplier are both
 *     INT_MIN the result is forced to INT_MAX.
 *  3. If DST_SHIFT is non-negative, the value is shifted right by DST_SHIFT and incremented by one when the
 *     discarded bits exceed the rounding threshold.
 *  4. DST_OFFSET is added and the result is stored with CONVERT_SAT(..., DST_DATA_TYPE).
 *
 * @param[in]  SRC_DATA_TYPE   Type of the source elements and of the intermediate value
 * @param[in]  DST_DATA_TYPE   Type of the destination elements
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of tile columns
 * @param[in]  DST_OFFSET      Offset added after rescaling
 * @param[in]  DST_SHIFT       Shift amount; negative values select the pre-multiplication in step 1
 * @param[in]  DST_MULTIPLIER  Fixed-point multiplier
 * @param[in]  src             Source tile
 * @param[in]  dst_multipliers Not used by this variant
 * @param[in]  dst_shifts      Not used by this variant
 * @param[out] dst             Destination tile
 */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    /* Rounding right shift: the threshold is one higher for negative values */ \
                    long mask = ((((int)1) << (DST_SHIFT)) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
7524*c217d954SCole Faust
7525*c217d954SCole Faust
/** Requantize every element of a tile with a per-column multiplier and shift.
 *
 * The multiplier and shift for column n are read from dst_multipliers[0].s[n] and dst_shifts[0].s[n].
 * Per element, in order:
 *  1. If the shift is negative, the input is multiplied by 2^(-shift).
 *  2. The value is multiplied by the multiplier in 64 bits, a rounding nudge (+2^30 for non-negative products,
 *     1 - 2^30 otherwise) is added and the sum is divided by 2^31. If the input and the multiplier are both
 *     INT_MIN the result is forced to INT_MAX.
 *  3. A rounding right shift by the shift amount is computed and kept only when the shift is non-negative.
 *  4. DST_OFFSET is added and the result is stored with CONVERT_SAT(..., DST_DATA_TYPE).
 *
 * @param[in]  SRC_DATA_TYPE   Type of the source elements and of the intermediate value
 * @param[in]  DST_DATA_TYPE   Type of the destination elements
 * @param[in]  M0              Number of tile rows
 * @param[in]  N0              Number of tile columns
 * @param[in]  DST_OFFSET      Offset added after rescaling
 * @param[in]  DST_SHIFT       Not used by this variant
 * @param[in]  DST_MULTIPLIER  Not used by this variant
 * @param[in]  src             Source tile
 * @param[in]  dst_multipliers Tile whose first row holds one multiplier per column
 * @param[in]  dst_shifts      Tile whose first row holds one shift per column
 * @param[out] dst             Destination tile
 */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _in         = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _shift      = dst_shifts[0].s[_n0]; \
                /* A negative shift is applied up front as a left shift of the input */ \
                _in *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_shift)), ((SRC_DATA_TYPE)_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE _is_overflow = _in == _multiplier && _in == INT_MIN; \
                long _in64       = (long)(_in); \
                long _mul64      = (long)(_multiplier); \
                long _product    = _in64 * _mul64; \
                long _nudge_pos  = 1 << 30; \
                long _nudge_neg  = 1 - (1 << 30); \
                long _is_non_neg = _product >= 0; \
                long _nudge      = select(_nudge_neg, _nudge_pos, _is_non_neg); \
                SRC_DATA_TYPE _high32 = CONVERT((_product + _nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                SRC_DATA_TYPE _val    = select(_high32, (SRC_DATA_TYPE)INT_MAX, _is_overflow); \
                /* Rounding right shift, computed unconditionally and selected below */ \
                long _rem_mask  = ((((int)1) << _shift) - (int)1); \
                long _threshold = (_rem_mask >> 1) + any(_val); \
                SRC_DATA_TYPE _val_shifted = _val >> _shift; \
                _val_shifted += select(0, 1, (_val & _rem_mask) > _threshold); \
                _val = select(_val, _val_shifted, _shift >= 0); \
                _val += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_val, DST_DATA_TYPE); \
            }) \
        }) \
    })
7558*c217d954SCole Faust
7559*c217d954SCole Faust
/** Requantize every element of a tile with a single multiplier / shift / offset.
 *
 * Per element, in order:
 *  1. If DST_SHIFT is negative, the input is multiplied by 2^(-DST_SHIFT).
 *  2. The value is multiplied by DST_MULTIPLIER in 64 bits, a rounding nudge (+2^30 for non-negative products,
 *     1 - 2^30 otherwise) is added and the sum is divided by 2^31. If the input and the multiplier are both
 *     INT_MIN the result is forced to INT_MAX.
 *  3. If DST_SHIFT is non-negative, the value is shifted right by DST_SHIFT and incremented by one when the
 *     discarded bits exceed the rounding threshold.
 *  4. DST_OFFSET is added and the result is stored with CONVERT_SAT(..., DST_DATA_TYPE).
 *
 * This macro takes nine parameters (no dst_multipliers / dst_shifts), so it is invoked directly rather than
 * through the eleven-argument T_QUANTIZE8 dispatcher.
 *
 * @param[in]  SRC_DATA_TYPE  Type of the source elements and of the intermediate value
 * @param[in]  DST_DATA_TYPE  Type of the destination elements
 * @param[in]  M0             Number of tile rows
 * @param[in]  N0             Number of tile columns
 * @param[in]  DST_OFFSET     Offset added after rescaling
 * @param[in]  DST_SHIFT      Shift amount; negative values select the pre-multiplication in step 1
 * @param[in]  DST_MULTIPLIER Fixed-point multiplier
 * @param[in]  src            Source tile
 * @param[out] dst            Destination tile
 */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-(DST_SHIFT))), ((SRC_DATA_TYPE)(DST_SHIFT) < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == (DST_MULTIPLIER) && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if((DST_SHIFT) >= 0) \
                { \
                    /* Rounding right shift: the threshold is one higher for negative values */ \
                    long mask = ((((int)1) << (DST_SHIFT)) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> (DST_SHIFT)) + (int)1 : (_tmp >> (DST_SHIFT)); \
                } \
                _tmp += (DST_OFFSET); \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })
7590*c217d954SCole Faust
7591*c217d954SCole Faust
/** Overwrite the rows of a tile whose mask entry is zero.
 *
 * For every row r in [0, M0) and column c in [0, N0):
 *   a[r].s[c] = select(a[r].s[c], VALUE_TO_SET, mask[r].v == 0)
 * i.e. the element is replaced by VALUE_TO_SET when mask[r].v compares equal to zero and kept otherwise.
 * NOTE(review): the comparison uses mask[r].v as a whole, so the mask tile is presumably one element wide - confirm.
 *
 * @param[in]     DATA_TYPE    Element type of the tile
 * @param[in]     M0           Number of tile rows
 * @param[in]     N0           Number of tile columns
 * @param[in]     VALUE_TO_SET Value written into masked rows
 * @param[in,out] a            Tile to update
 * @param[in]     mask         Tile holding one mask value per row
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _row, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _col, 0, 1, N0, \
            { \
                a[_row].s[_col] = select((DATA_TYPE)(a[_row].s[_col]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_row].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })
7602*c217d954SCole Faust
7603*c217d954SCole Faust
/** Apply an activation to a tile, one row vector at a time.
 *
 * Each dst[_m0].v is assigned the result of ACTIVATION invoked with ACTIVATION_TYPE,
 * DATA_TYPE, N0, the source row src[_m0].v, A_VAL and B_VAL.
 *
 * NOTE(review): ACTIVATION and LOOP_UNROLLING are defined outside this excerpt; the loop
 * presumably visits M0 rows starting at 0 with step 1 - confirm.
 *
 * DATA_TYPE       - element type forwarded to ACTIVATION
 * M0              - row count handed to the loop
 * N0              - vector width forwarded to ACTIVATION
 * ACTIVATION_TYPE - activation selector forwarded to ACTIVATION
 * A_VAL, B_VAL    - activation parameters forwarded unchanged
 * src             - input tile
 * dst             - output tile
 */
7604*c217d954SCole Faust#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst)               \
7605*c217d954SCole Faust    ({                                                                                         \
7606*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                                                     \
7607*c217d954SCole Faust        {                                                                                      \
7608*c217d954SCole Faust            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
7609*c217d954SCole Faust        })                                                                                     \
7610*c217d954SCole Faust    })
7611*c217d954SCole Faust
7612*c217d954SCole Faust
/* Quantized activation expressions. All five share one parameter list so that they can be
 * selected by name through ACT_OP_QUANTIZED; parameters a given expression does not need
 * are simply not referenced in its expansion. */

/* relu: max(ZERO_VALUE, x). VEC_SIZE, A_VAL and B_VAL are unused. */
7613*c217d954SCole Faust#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
7614*c217d954SCole Faust
/* brelu: x limited to the range from ZERO_VALUE (lower) to A_VAL (upper). B_VAL is unused. */
7615*c217d954SCole Faust#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
7616*c217d954SCole Faust
/* lu_brelu: x limited to the range from B_VAL (lower) to A_VAL (upper). ZERO_VALUE is unused. */
7617*c217d954SCole Faust#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
7618*c217d954SCole Faust
/* hard_swish: x * (min(max(x + 3, 0), 6) * 0.166666667), with each literal cast to DATA_TYPE.
 * ZERO_VALUE, A_VAL and B_VAL are unused. */
7619*c217d954SCole Faust#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
7620*c217d954SCole Faust
/* identity: yields x unchanged. */
7621*c217d954SCole Faust#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)
7622*c217d954SCole Faust
/* ACT_OP_QUANTIZED pastes op onto the suffix _op_quantized and invokes the resulting macro
 * with the remaining arguments. ACTIVATION_QUANTIZED is the entry point: it forwards to
 * ACT_OP_QUANTIZED so that op, if it is itself a macro, is expanded before the paste
 * (an operand of ## is not macro-expanded, an ordinary argument is). */
7623*c217d954SCole Faust#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7624*c217d954SCole Faust#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
7625*c217d954SCole Faust
/* Binary operator wrappers (add, subtract, divide, multiply). Both operands and the whole
 * result are parenthesized. They are passed by name as the T_ELWISE_OP argument of the
 * T_ELTWISE* macros in this file. */
7626*c217d954SCole Faust#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
7627*c217d954SCole Faust#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
7628*c217d954SCole Faust#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
7629*c217d954SCole Faust#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))
7630*c217d954SCole Faust
7631*c217d954SCole Faust
/** Apply a quantized activation to a tile, one row vector at a time.
 *
 * Each dst[_m0].v is assigned ACTIVATION_QUANTIZED evaluated with ACTIVATION_TYPE, DATA_TYPE,
 * N0, ZERO_VALUE, A_VAL, B_VAL and the source row src[_m0].v. ACTIVATION_TYPE therefore has
 * to name one of the <name>_op_quantized expressions (possibly via a macro that expands to it).
 *
 * NOTE(review): LOOP_UNROLLING is defined outside this excerpt; the loop presumably visits
 * M0 rows starting at 0 with step 1 - confirm.
 *
 * DATA_TYPE       - type forwarded to the activation expression
 * M0              - row count handed to the loop
 * N0              - forwarded as VEC_SIZE
 * ACTIVATION_TYPE - activation name prefix
 * ZERO_VALUE      - forwarded unchanged
 * A_VAL, B_VAL    - forwarded unchanged
 * src             - input tile
 * dst             - output tile
 */
7632*c217d954SCole Faust#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst)               \
7633*c217d954SCole Faust    ({ \
7634*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
7635*c217d954SCole Faust        { \
7636*c217d954SCole Faust            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
7637*c217d954SCole Faust        })                                                                                          \
7638*c217d954SCole Faust    })
7639*c217d954SCole Faust
7640*c217d954SCole Faust
/** Row-wise sum of two tiles: dst[_m0].v = lhs[_m0].v + rhs[_m0].v for each row _m0.
 *
 * DATA_TYPE and N0 are part of the signature but do not appear in the expansion.
 * NOTE(review): LOOP_UNROLLING is defined outside this excerpt; presumably M0 iterations
 * starting at 0 with step 1 - confirm.
 *
 * M0       - row count handed to the loop
 * lhs, rhs - input tiles
 * dst      - output tile
 */
7641*c217d954SCole Faust#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
7642*c217d954SCole Faust    ({                                                            \
7643*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7644*c217d954SCole Faust        {                                                         \
7645*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
7646*c217d954SCole Faust        })                                                        \
7647*c217d954SCole Faust    })
7648*c217d954SCole Faust
7649*c217d954SCole Faust
/** Add a constant to every row of a tile: dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant.
 *
 * The cast is written without parentheses around rhs_constant, so it binds to the first
 * operand only if an unparenthesized expression is passed. N0 does not appear in the expansion.
 * NOTE(review): LOOP_UNROLLING is defined outside this excerpt; presumably M0 iterations
 * starting at 0 with step 1 - confirm.
 *
 * DATA_TYPE    - type the constant is cast to
 * M0           - row count handed to the loop
 * lhs          - input tile
 * rhs_constant - value added to each row vector
 * dst          - output tile
 */
7650*c217d954SCole Faust#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7651*c217d954SCole Faust    ({                                                            \
7652*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7653*c217d954SCole Faust        {                                                         \
7654*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant;               \
7655*c217d954SCole Faust        })                                                        \
7656*c217d954SCole Faust    })
7657*c217d954SCole Faust
/* Named shortcuts for the broadcasting element-wise macros. Each one only fixes the operator:
 *   - names containing LHS_X forward to T_ELTWISE_BROADCAST_LHS_X (row 0 of lhs is reused);
 *   - all others forward to T_ELTWISE_BROADCAST_X (row 0 of rhs is reused).
 * T_ELTWISE_BROADCAST_ADD_X and T_ELTWISE_BROADCAST_RHS_X_ADD have identical expansions. */
7658*c217d954SCole Faust#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7659*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7660*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7661*c217d954SCole Faust
7662*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7663*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7664*c217d954SCole Faust
7665*c217d954SCole Faust#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7666*c217d954SCole Faust
7667*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7668*c217d954SCole Faust#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7669*c217d954SCole Faust
7670*c217d954SCole Faust
/** Multiply every row of a tile by a constant: dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant.
 *
 * The cast is written without parentheses around rhs_constant, so it binds to the first
 * operand only if an unparenthesized expression is passed. N0 does not appear in the expansion.
 * NOTE(review): LOOP_UNROLLING is defined outside this excerpt; presumably M0 iterations
 * starting at 0 with step 1 - confirm.
 *
 * DATA_TYPE    - type the constant is cast to
 * M0           - row count handed to the loop
 * lhs          - input tile
 * rhs_constant - scale factor
 * dst          - output tile
 */
7671*c217d954SCole Faust#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
7672*c217d954SCole Faust    ({                                                            \
7673*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                        \
7674*c217d954SCole Faust        {                                                         \
7675*c217d954SCole Faust            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
7676*c217d954SCole Faust        })                                                        \
7677*c217d954SCole Faust    })
7678*c217d954SCole Faust
7679*c217d954SCole Faust
/** Element-wise binary operation where the right-hand operand is a single row.
 *
 * For each row _m0: dst[_m0].v = T_ELWISE_OP(lhs[_m0].v, rhs[0].v), with both operands first
 * passed through CONVERT to VEC_DATA_TYPE(DST_DATA_TYPE, N0). Only row 0 of rhs is ever read.
 *
 * NOTE(review): LOOP_UNROLLING, CONVERT and VEC_DATA_TYPE are defined outside this excerpt;
 * presumably an N0-wide vector of DST_DATA_TYPE and M0 iterations from 0 with step 1 - confirm.
 *
 * T_ELWISE_OP   - two-argument operator macro (for example V_ADD)
 * DST_DATA_TYPE - element type both operands are converted to
 * M0            - row count handed to the loop
 * N0            - vector width used for the conversion type
 * lhs           - left-hand tile, read row by row
 * rhs           - right-hand tile, row 0 only
 * dst           - output tile
 */
7680*c217d954SCole Faust#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7681*c217d954SCole Faust    ({                                                      \
7682*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7683*c217d954SCole Faust        {                                                   \
7684*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7685*c217d954SCole Faust        })                                                  \
7686*c217d954SCole Faust    })
7687*c217d954SCole Faust
7688*c217d954SCole Faust
/** Element-wise binary operation where the left-hand operand is a single row.
 *
 * For each row _m0: dst[_m0].v = T_ELWISE_OP(lhs[0].v, rhs[_m0].v), with both operands first
 * passed through CONVERT to VEC_DATA_TYPE(DST_DATA_TYPE, N0). Only row 0 of lhs is ever read;
 * operand order is preserved, which matters for the non-commutative operators.
 *
 * NOTE(review): LOOP_UNROLLING, CONVERT and VEC_DATA_TYPE are defined outside this excerpt;
 * presumably an N0-wide vector of DST_DATA_TYPE and M0 iterations from 0 with step 1 - confirm.
 *
 * T_ELWISE_OP   - two-argument operator macro (for example V_SUB)
 * DST_DATA_TYPE - element type both operands are converted to
 * M0            - row count handed to the loop
 * N0            - vector width used for the conversion type
 * lhs           - left-hand tile, row 0 only
 * rhs           - right-hand tile, read row by row
 * dst           - output tile
 */
7689*c217d954SCole Faust#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7690*c217d954SCole Faust    ({                                                      \
7691*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7692*c217d954SCole Faust        {                                                   \
7693*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7694*c217d954SCole Faust        })                                                  \
7695*c217d954SCole Faust    })
7696*c217d954SCole Faust
/* Named shortcuts for T_ELTWISE: each forwards its arguments unchanged and only fixes the
 * operator macro (V_ADD, V_SUB, V_DIV or V_MUL). */
7697*c217d954SCole Faust#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7698*c217d954SCole Faust#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7699*c217d954SCole Faust#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7700*c217d954SCole Faust#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
7701*c217d954SCole Faust
7702*c217d954SCole Faust
/** Element-wise binary operation between two tiles, row by row.
 *
 * For each row _m0: dst[_m0].v = T_ELWISE_OP(lhs[_m0].v, rhs[_m0].v), with both operands first
 * passed through CONVERT to VEC_DATA_TYPE(DST_DATA_TYPE, N0).
 *
 * NOTE(review): LOOP_UNROLLING, CONVERT and VEC_DATA_TYPE are defined outside this excerpt;
 * presumably an N0-wide vector of DST_DATA_TYPE and M0 iterations from 0 with step 1 - confirm.
 *
 * T_ELWISE_OP   - two-argument operator macro (for example V_MUL)
 * DST_DATA_TYPE - element type both operands are converted to
 * M0            - row count handed to the loop
 * N0            - vector width used for the conversion type
 * lhs, rhs      - input tiles
 * dst           - output tile
 */
7703*c217d954SCole Faust#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
7704*c217d954SCole Faust    ({                                                      \
7705*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7706*c217d954SCole Faust        {                                                   \
7707*c217d954SCole Faust            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7708*c217d954SCole Faust        })                                                  \
7709*c217d954SCole Faust    })
7710*c217d954SCole Faust
7711*c217d954SCole Faust
/** Row-wise floor of a tile.
 *
 * For each row _m0 the source vector src[_m0].v is passed through CONVERT to
 * VEC_DATA_TYPE(DST_DATA_TYPE, N0) and the floor of the converted vector is stored
 * in dst[_m0].v.
 *
 * NOTE(review): LOOP_UNROLLING, CONVERT and VEC_DATA_TYPE are defined outside this excerpt;
 * presumably an N0-wide vector of DST_DATA_TYPE and M0 iterations from 0 with step 1 - confirm.
 *
 * DST_DATA_TYPE - element type the source is converted to before floor is applied
 * M0            - row count handed to the loop
 * N0            - vector width used for the conversion type
 * src           - input tile
 * dst           - output tile
 */
7712*c217d954SCole Faust#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
7713*c217d954SCole Faust    ({                                                      \
7714*c217d954SCole Faust        LOOP_UNROLLING(int, _m0, 0, 1, M0,                  \
7715*c217d954SCole Faust        {                                                   \
7716*c217d954SCole Faust            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)));             \
7717*c217d954SCole Faust        })                                                  \
7718*c217d954SCole Faust    })
7719*c217d954SCole Faust
7720*c217d954SCole Faust
/* Tile matrix-multiply dispatch, resolved entirely by token pasting:
 *   1. T_MMUL pastes LHS_LAYOUT and RHS_LAYOUT into T_MMUL_<lhs layout>_<rhs layout>.
 *      Only the NT/T combination (T_MMUL_NT_T) is defined in this part of the file.
 *      The layout arguments are operands of ## and are therefore pasted without macro
 *      expansion; they must be given as the literal tokens.
 *   2. T_MMUL_NT_T pastes the three data types into
 *      T_MMUL_NT_T_<lhs type>_<rhs type>_<dst type>.
 *   3. The type combinations listed below map to an implementation:
 *        float/float/float, half/half/float, half/half/half -> T_MMUL_NT_T_FLOAT
 *        char/char/int, uchar/uchar/uint, uchar/uchar/int   -> T_MMUL_NT_T_INTEGER8
 *      Any other combination pastes to a name that is not defined here. */
7721*c217d954SCole Faust#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7722*c217d954SCole Faust#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7723*c217d954SCole Faust#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7724*c217d954SCole Faust#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7725*c217d954SCole Faust#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7726*c217d954SCole Faust#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7727*c217d954SCole Faust#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
7728*c217d954SCole Faust#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
/** Floating-point tile multiply-accumulate.
 *
 * For every (_m, _n, _k) the product lhs[_m].s[_k] * rhs[_n].s[_k] is fused-multiply-added
 * onto dst[_m].s[_n]. Both factors are cast to DST_DATA_TYPE before the fma. dst is read as
 * the addend, so the macro accumulates onto whatever dst already holds rather than
 * overwriting it. rhs is indexed by the output column (_n) first and by _k second.
 *
 * Unlike the neighbouring tile macros, the expansion is a plain brace block and not a
 * parenthesized statement expression.
 *
 * NOTE(review): LOOP_UNROLLING is defined outside this excerpt; the three loops presumably
 * run M0, N0 and K0 iterations from 0 with step 1 - confirm. LHS_DATA_TYPE and
 * RHS_DATA_TYPE do not appear in the expansion.
 *
 * DST_DATA_TYPE - type both factors are cast to
 * M0, N0, K0    - iteration counts for the _m, _n and _k loops
 * lhs           - tile indexed [_m].s[_k]
 * rhs           - tile indexed [_n].s[_k]
 * dst           - accumulator tile indexed [_m].s[_n]
 */
7729*c217d954SCole Faust#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                       \
7730*c217d954SCole Faust    {                                                                                     \
7731*c217d954SCole Faust        LOOP_UNROLLING(int, _m, 0, 1, M0,                                                 \
7732*c217d954SCole Faust        {                                                                                 \
7733*c217d954SCole Faust            LOOP_UNROLLING(int, _n, 0, 1, N0,                                             \
7734*c217d954SCole Faust            {                                                                             \
7735*c217d954SCole Faust                LOOP_UNROLLING(int, _k, 0, 1, K0,                                         \
7736*c217d954SCole Faust                {                                                                         \
7737*c217d954SCole Faust                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
7738*c217d954SCole Faust                })                                                                        \
7739*c217d954SCole Faust            })                                                                            \
7740*c217d954SCole Faust        })                                                                                \
7741*c217d954SCole Faust    }
7742*c217d954SCole Faust
/** 8-bit integer tile multiply.
 *
 * For every (_m, _n) pair, DOT_PRODUCT_INTEGER8 is invoked with the three data types, K0,
 * the whole lhs row vector lhs[_m].v, the whole rhs row vector rhs[_n].v and the destination
 * element dst[_m].s[_n].
 *
 * NOTE(review): DOT_PRODUCT_INTEGER8 and LOOP_UNROLLING are defined outside this excerpt;
 * presumably the dot product is accumulated into the destination element and the loops run
 * M0 and N0 iterations from 0 with step 1 - confirm.
 *
 * LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE - forwarded to DOT_PRODUCT_INTEGER8
 * M0, N0 - iteration counts for the _m and _n loops
 * K0     - forwarded to DOT_PRODUCT_INTEGER8
 * lhs    - tile read as row vectors indexed by _m
 * rhs    - tile read as row vectors indexed by _n
 * dst    - output tile indexed [_m].s[_n]
 */
7743*c217d954SCole Faust#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)                            \
7744*c217d954SCole Faust    ({ \
7745*c217d954SCole Faust        LOOP_UNROLLING(int, _m, 0, 1, M0, \
7746*c217d954SCole Faust        { \
7747*c217d954SCole Faust            LOOP_UNROLLING(int, _n, 0, 1, N0, \
7748*c217d954SCole Faust            { \
7749*c217d954SCole Faust                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
7750*c217d954SCole Faust            })                                                                                             \
7751*c217d954SCole Faust        })                                                                                             \
7752*c217d954SCole Faust    })
7753*c217d954SCole Faust
7754*c217d954SCole Faust#endif
7755*c217d954SCole Faust
7756*c217d954SCole Faust#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
7757*c217d954SCole Faust
/* ARM_DOT(x, y, val): add the dot product of x and y to val.
 * Defined only when ARM_COMPUTE_OPENCL_DOT8_ENABLED and cl_arm_integer_dot_product_int8 are
 * both defined. When the accumulate variant is also enabled
 * (ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED and cl_arm_integer_dot_product_accumulate_int8),
 * val is assigned arm_dot_acc(x, y, val); otherwise arm_dot(x, y) is added to val with +=.
 * Both expansions already end in a semicolon.
 * NOTE(review): arm_dot and arm_dot_acc are presumably built-ins supplied by the extensions
 * named in the guards - confirm against the extension specification. */
7758*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
7759*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
7760*c217d954SCole Faust#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
7761*c217d954SCole Faust#else
7762*c217d954SCole Faust#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
7763*c217d954SCole Faust#endif
7764*c217d954SCole Faust#endif
7765*c217d954SCole Faust
7766*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
7767*c217d954SCole Faust
/* ARM_DOT<k>(a, b, c), extension path: accumulate the k-element dot product of a and b into c
 * using ARM_DOT, which is always fed 4-wide operands.
 *   k = 1, 2, 3 : a and b are widened to VEC_DATA_TYPE(DATA_TYPE, 4) by appending 3, 2 or 1
 *                 zero elements, so the padding contributes nothing to the sum.
 *   k = 4       : operands are forwarded to ARM_DOT as they are.
 *   k = 8       : two ARM_DOT4 calls, on the .lo and .hi halves.
 *   k = 16      : two ARM_DOT8 calls, on the .lo and .hi halves.
 * NOTE(review): VEC_DATA_TYPE is defined outside this excerpt; presumably it names the
 * vector type of the given element type and width - confirm. */
7768*c217d954SCole Faust#define ARM_DOT1(a, b, c)                                                                                                                               \
7769*c217d954SCole Faust    ({                                                                                                                                                  \
7770*c217d954SCole Faust        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
7771*c217d954SCole Faust    })
7772*c217d954SCole Faust#define ARM_DOT2(a, b, c)                                                                                                                               \
7773*c217d954SCole Faust    ({                                                                                                                                                  \
7774*c217d954SCole Faust        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
7775*c217d954SCole Faust    })
7776*c217d954SCole Faust#define ARM_DOT3(a, b, c)                                                                                           \
7777*c217d954SCole Faust    ({                                                                                                              \
7778*c217d954SCole Faust        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
7779*c217d954SCole Faust    })
7780*c217d954SCole Faust#define ARM_DOT4(a, b, c) \
7781*c217d954SCole Faust    ({                    \
7782*c217d954SCole Faust        ARM_DOT(a, b, c); \
7783*c217d954SCole Faust    })
7784*c217d954SCole Faust#define ARM_DOT8(a, b, c)            \
7785*c217d954SCole Faust    ({                               \
7786*c217d954SCole Faust        ARM_DOT4((a.lo), (b.lo), c); \
7787*c217d954SCole Faust        ARM_DOT4((a.hi), (b.hi), c); \
7788*c217d954SCole Faust    })
7789*c217d954SCole Faust#define ARM_DOT16(a, b, c)           \
7790*c217d954SCole Faust    ({                               \
7791*c217d954SCole Faust        ARM_DOT8((a.lo), (b.lo), c); \
7792*c217d954SCole Faust        ARM_DOT8((a.hi), (b.hi), c); \
7793*c217d954SCole Faust    })
7794*c217d954SCole Faust
7795*c217d954SCole Faust#else
7796*c217d954SCole Faust
7797*c217d954SCole Faust
/* ARM_DOT<k>(a, b, c), fallback path used when the dot-product extension guard above is not
 * satisfied: accumulate the k-element dot product of a and b into c with plain arithmetic.
 *   k = 1  : c += (ACC_DATA_TYPE)a * b, the cast applies to a.
 *   k = 2  : adds the products of components s0 and s1.
 *   k = 3  : ARM_DOT2 plus the product of component s2.
 *   k = 4  : ARM_DOT3 plus the product of component s3.
 *   k = 8  : two ARM_DOT4 calls, on the .lo and .hi halves.
 *   k = 16 : two ARM_DOT8 calls, on the .lo and .hi halves.
 * In every product only the a operand is cast to ACC_DATA_TYPE before multiplying.
 * The a and b arguments are used without added parentheses, so callers pass them
 * already parenthesized (as ARM_DOT_K0 does). */
7798*c217d954SCole Faust#define ARM_DOT1(a, b, c)          \
7799*c217d954SCole Faust    ({                             \
7800*c217d954SCole Faust        c += (ACC_DATA_TYPE)a * b; \
7801*c217d954SCole Faust    })
7802*c217d954SCole Faust#define ARM_DOT2(a, b, c)                \
7803*c217d954SCole Faust    ({                                   \
7804*c217d954SCole Faust        c += (ACC_DATA_TYPE)a.s0 * b.s0; \
7805*c217d954SCole Faust        c += (ACC_DATA_TYPE)a.s1 * b.s1; \
7806*c217d954SCole Faust    })
7807*c217d954SCole Faust#define ARM_DOT3(a, b, c)                \
7808*c217d954SCole Faust    ({                                   \
7809*c217d954SCole Faust        ARM_DOT2(a, b, c);               \
7810*c217d954SCole Faust        c += (ACC_DATA_TYPE)a.s2 * b.s2; \
7811*c217d954SCole Faust    })
7812*c217d954SCole Faust#define ARM_DOT4(a, b, c)                \
7813*c217d954SCole Faust    ({                                   \
7814*c217d954SCole Faust        ARM_DOT3(a, b, c);               \
7815*c217d954SCole Faust        c += (ACC_DATA_TYPE)a.s3 * b.s3; \
7816*c217d954SCole Faust    })
7817*c217d954SCole Faust#define ARM_DOT8(a, b, c)            \
7818*c217d954SCole Faust    ({                               \
7819*c217d954SCole Faust        ARM_DOT4((a.lo), (b.lo), c); \
7820*c217d954SCole Faust        ARM_DOT4((a.hi), (b.hi), c); \
7821*c217d954SCole Faust    })
7822*c217d954SCole Faust#define ARM_DOT16(a, b, c)           \
7823*c217d954SCole Faust    ({                               \
7824*c217d954SCole Faust        ARM_DOT8((a.lo), (b.lo), c); \
7825*c217d954SCole Faust        ARM_DOT8((a.hi), (b.hi), c); \
7826*c217d954SCole Faust    })
7827*c217d954SCole Faust#endif
7828*c217d954SCole Faust
7829*c217d954SCole Faust
/* ARM_DOT_K0X<n>(k0, a, b, c): n dot products of the same vector a against the n variables
 * b0, b1, ... (b is a basename; the index is pasted on as a hex digit 0..F).
 *   n = 1          : the single result is accumulated into c itself.
 *   n = 2,3,4,8,16 : the result for b<i> is accumulated into component c.s<i>.
 * Larger sizes reuse the next smaller one and then add the remaining components
 * (3 builds on 2, 4 on 3, 8 on 4, 16 on 8). Each product is delegated to
 * ARM_DOT_K0, which selects the k0-wide dot-product macro. */
7830*c217d954SCole Faust#define ARM_DOT_K0X1(k0, a, b, c)         \
7831*c217d954SCole Faust    ({                                    \
7832*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##0), (c)); \
7833*c217d954SCole Faust    })
7834*c217d954SCole Faust#define ARM_DOT_K0X2(k0, a, b, c)            \
7835*c217d954SCole Faust    ({                                       \
7836*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
7837*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
7838*c217d954SCole Faust    })
7839*c217d954SCole Faust#define ARM_DOT_K0X3(k0, a, b, c)            \
7840*c217d954SCole Faust    ({                                       \
7841*c217d954SCole Faust        ARM_DOT_K0X2(k0, a, b, c);           \
7842*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
7843*c217d954SCole Faust    })
7844*c217d954SCole Faust#define ARM_DOT_K0X4(k0, a, b, c)            \
7845*c217d954SCole Faust    ({                                       \
7846*c217d954SCole Faust        ARM_DOT_K0X3(k0, a, b, c);           \
7847*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
7848*c217d954SCole Faust    })
7849*c217d954SCole Faust#define ARM_DOT_K0X8(k0, a, b, c)            \
7850*c217d954SCole Faust    ({                                       \
7851*c217d954SCole Faust        ARM_DOT_K0X4(k0, a, b, c);           \
7852*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
7853*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
7854*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
7855*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
7856*c217d954SCole Faust    })
7857*c217d954SCole Faust#define ARM_DOT_K0X16(k0, a, b, c)           \
7858*c217d954SCole Faust    ({                                       \
7859*c217d954SCole Faust        ARM_DOT_K0X8(k0, a, b, c);           \
7860*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
7861*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
7862*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
7863*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
7864*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
7865*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
7866*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
7867*c217d954SCole Faust        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
7868*c217d954SCole Faust    })
7869*c217d954SCole Faust
7870*c217d954SCole Faust
/* ARM_MM_K0XN0X<m>(n0, k0, a, b, c): block multiply over m rows, for m = 1..8.
 * a and c are basenames: row i uses the variables a<i> and c<i> (index pasted on), while the
 * basename b is passed through untouched. Row i is computed by
 * ARM_DOT_K0XN0(n0, k0, a<i>, b, c<i>). Each size first expands the next smaller size and
 * then handles its own last row, so rows are processed in ascending order. */
7871*c217d954SCole Faust#define ARM_MM_K0XN0X1(n0, k0, a, b, c)           \
7872*c217d954SCole Faust    ({                                            \
7873*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
7874*c217d954SCole Faust    })
7875*c217d954SCole Faust#define ARM_MM_K0XN0X2(n0, k0, a, b, c)           \
7876*c217d954SCole Faust    ({                                            \
7877*c217d954SCole Faust        ARM_MM_K0XN0X1(n0, k0, a, b, c);          \
7878*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
7879*c217d954SCole Faust    })
7880*c217d954SCole Faust#define ARM_MM_K0XN0X3(n0, k0, a, b, c)           \
7881*c217d954SCole Faust    ({                                            \
7882*c217d954SCole Faust        ARM_MM_K0XN0X2(n0, k0, a, b, c);          \
7883*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
7884*c217d954SCole Faust    })
7885*c217d954SCole Faust#define ARM_MM_K0XN0X4(n0, k0, a, b, c)           \
7886*c217d954SCole Faust    ({                                            \
7887*c217d954SCole Faust        ARM_MM_K0XN0X3(n0, k0, a, b, c);          \
7888*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
7889*c217d954SCole Faust    })
7890*c217d954SCole Faust#define ARM_MM_K0XN0X5(n0, k0, a, b, c)           \
7891*c217d954SCole Faust    ({                                            \
7892*c217d954SCole Faust        ARM_MM_K0XN0X4(n0, k0, a, b, c);          \
7893*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
7894*c217d954SCole Faust    })
7895*c217d954SCole Faust#define ARM_MM_K0XN0X6(n0, k0, a, b, c)           \
7896*c217d954SCole Faust    ({                                            \
7897*c217d954SCole Faust        ARM_MM_K0XN0X5(n0, k0, a, b, c);          \
7898*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
7899*c217d954SCole Faust    })
7900*c217d954SCole Faust#define ARM_MM_K0XN0X7(n0, k0, a, b, c)           \
7901*c217d954SCole Faust    ({                                            \
7902*c217d954SCole Faust        ARM_MM_K0XN0X6(n0, k0, a, b, c);          \
7903*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
7904*c217d954SCole Faust    })
7905*c217d954SCole Faust#define ARM_MM_K0XN0X8(n0, k0, a, b, c)           \
7906*c217d954SCole Faust    ({                                            \
7907*c217d954SCole Faust        ARM_MM_K0XN0X7(n0, k0, a, b, c);          \
7908*c217d954SCole Faust        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
7909*c217d954SCole Faust    })
7910*c217d954SCole Faust
/* ARM_DOT_K0(k0, a, b, c): invoke the macro named by CONCAT(ARM_DOT, k0) with the
 * parenthesized arguments a, b and c. With the ARM_DOT<k> macros in this file, k0 can be
 * 1, 2, 3, 4, 8 or 16.
 * NOTE(review): CONCAT is defined outside this excerpt; presumably it expands its arguments
 * and pastes them into one identifier (for example ARM_DOT4) - confirm. */
7911*c217d954SCole Faust#define ARM_DOT_K0(k0, a, b, c) \
7912*c217d954SCole Faust    ({                          \
7913*c217d954SCole Faust        CONCAT(ARM_DOT, k0)     \
7914*c217d954SCole Faust        ((a), (b), (c));        \
7915*c217d954SCole Faust    })
7916*c217d954SCole Faust
/* ARM_DOT_K0XN0(n0, k0, a, b, c): invoke the macro named by CONCAT(ARM_DOT_K0X, n0) with
 * k0, a, b and c. b is forwarded without parentheses because the callee pastes an index
 * onto it. With the ARM_DOT_K0X<n> macros in this file, n0 can be 1, 2, 3, 4, 8 or 16.
 * NOTE(review): CONCAT is defined outside this excerpt; presumably it expands its arguments
 * and pastes them into one identifier - confirm. */
7917*c217d954SCole Faust#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
7918*c217d954SCole Faust    ({                                 \
7919*c217d954SCole Faust        CONCAT(ARM_DOT_K0X, n0)        \
7920*c217d954SCole Faust        (k0, (a), b, (c));             \
7921*c217d954SCole Faust    })
7922*c217d954SCole Faust
/* ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c): invoke the macro named by CONCAT(ARM_MM_K0XN0X, m0)
 * with n0, k0, a, b and c. a, b and c are forwarded without parentheses because the callees
 * paste row or column indices onto them. With the ARM_MM_K0XN0X<m> macros in this file,
 * m0 can be 1 to 8.
 * NOTE(review): CONCAT is defined outside this excerpt; presumably it expands its arguments
 * and pastes them into one identifier - confirm. */
7923*c217d954SCole Faust#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
7924*c217d954SCole Faust    ({                                       \
7925*c217d954SCole Faust        CONCAT(ARM_MM_K0XN0X, m0)            \
7926*c217d954SCole Faust        (n0, k0, a, b, c);                   \
7927*c217d954SCole Faust    })
7928*c217d954SCole Faust
7929*c217d954SCole Faust
/* ARM_MUL_N0X<k>(VECTOR_ACC_TYPE, a, b, c): accumulate k scaled vectors into c.
 * b is a basename: for each index i the variable b<i> is passed through CONVERT to
 * VECTOR_ACC_TYPE, multiplied by component a.s<i> and added to c.
 *   k = 1          : a is used as it is, without a component selector.
 *   k = 2,3,4,8,16 : components a.s0 .. a.s<k-1> are used (hex digits for 10..15).
 * Larger sizes reuse the next smaller one and then add the remaining terms
 * (3 builds on 2, 4 on 3, 8 on 4, 16 on 8).
 * NOTE(review): CONVERT is defined outside this excerpt; presumably a type conversion of
 * its first argument to the type given second - confirm. */
7930*c217d954SCole Faust#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c)   \
7931*c217d954SCole Faust    ({                                           \
7932*c217d954SCole Faust        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
7933*c217d954SCole Faust    })
7934*c217d954SCole Faust#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c)        \
7935*c217d954SCole Faust    ({                                                \
7936*c217d954SCole Faust        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
7937*c217d954SCole Faust        c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
7938*c217d954SCole Faust    })
7939*c217d954SCole Faust#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c)        \
7940*c217d954SCole Faust    ({                                                \
7941*c217d954SCole Faust        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c);       \
7942*c217d954SCole Faust        c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
7943*c217d954SCole Faust    })
7944*c217d954SCole Faust#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c)        \
7945*c217d954SCole Faust    ({                                                \
7946*c217d954SCole Faust        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c);       \
7947*c217d954SCole Faust        c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
7948*c217d954SCole Faust    })
7949*c217d954SCole Faust#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c)        \
7950*c217d954SCole Faust    ({                                                \
7951*c217d954SCole Faust        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c);       \
7952*c217d954SCole Faust        c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
7953*c217d954SCole Faust        c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
7954*c217d954SCole Faust        c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
7955*c217d954SCole Faust        c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
7956*c217d954SCole Faust    })
7957*c217d954SCole Faust#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c)       \
7958*c217d954SCole Faust    ({                                                \
7959*c217d954SCole Faust        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c);       \
7960*c217d954SCole Faust        c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
7961*c217d954SCole Faust        c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
7962*c217d954SCole Faust        c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
7963*c217d954SCole Faust        c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
7964*c217d954SCole Faust        c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
7965*c217d954SCole Faust        c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
7966*c217d954SCole Faust        c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
7967*c217d954SCole Faust        c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
7968*c217d954SCole Faust    })
7969*c217d954SCole Faust
/* ARM_MM_NATIVE_N0XK0X<m>(VECTOR_ACC_TYPE, k0, a, b, c): block multiply over m rows,
 * for m = 1..8. a and c are basenames: row i uses the variables a<i> and c<i> (index pasted
 * on), while the basename b is passed through untouched. Row i is computed by
 * ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a<i>, b, c<i>). Each size first expands the next smaller
 * size and then handles its own last row, so rows are processed in ascending order. */
7970*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c)    \
7971*c217d954SCole Faust    ({                                                         \
7972*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
7973*c217d954SCole Faust    })
7974*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c)    \
7975*c217d954SCole Faust    ({                                                         \
7976*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c);   \
7977*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
7978*c217d954SCole Faust    })
7979*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c)    \
7980*c217d954SCole Faust    ({                                                         \
7981*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c);   \
7982*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
7983*c217d954SCole Faust    })
7984*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c)    \
7985*c217d954SCole Faust    ({                                                         \
7986*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c);   \
7987*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
7988*c217d954SCole Faust    })
7989*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c)    \
7990*c217d954SCole Faust    ({                                                         \
7991*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c);   \
7992*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
7993*c217d954SCole Faust    })
7994*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c)    \
7995*c217d954SCole Faust    ({                                                         \
7996*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c);   \
7997*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
7998*c217d954SCole Faust    })
7999*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c)    \
8000*c217d954SCole Faust    ({                                                         \
8001*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c);   \
8002*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
8003*c217d954SCole Faust    })
8004*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c)    \
8005*c217d954SCole Faust    ({                                                         \
8006*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c);   \
8007*c217d954SCole Faust        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
8008*c217d954SCole Faust    })
/* ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c): invoke the macro named by
 * CONCAT(ARM_MUL_N0X, k0) with VECTOR_ACC_TYPE, a, b and c. b is forwarded without
 * parentheses because the callee pastes an index onto it. With the ARM_MUL_N0X<k> macros in
 * this file, k0 can be 1, 2, 3, 4, 8 or 16.
 * NOTE(review): CONCAT is defined outside this excerpt; presumably it expands its arguments
 * and pastes them into one identifier - confirm. */
8009*c217d954SCole Faust#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
8010*c217d954SCole Faust    ({                                              \
8011*c217d954SCole Faust        CONCAT(ARM_MUL_N0X, k0)                     \
8012*c217d954SCole Faust        (VECTOR_ACC_TYPE, (a), b, (c));             \
8013*c217d954SCole Faust    })
/* ARM_MM_NATIVE_N0XK0XM0: dispatcher that pastes m0 onto the name
 * ARM_MM_NATIVE_N0XK0X (via CONCAT) and invokes the resulting macro with
 * (VECTOR_ACC_TYPE, k0, a, b, c). */
8014*c217d954SCole Faust#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
8015*c217d954SCole Faust    ({                                                           \
8016*c217d954SCole Faust        CONCAT(ARM_MM_NATIVE_N0XK0X, m0)                         \
8017*c217d954SCole Faust        (VECTOR_ACC_TYPE, k0, a, b, c);                          \
8018*c217d954SCole Faust    })
8019*c217d954SCole Faust
/*
 * gemmlowp_mm_reshaped_lhs_nt_rhs_t
 *
 * Compiled only when GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T is defined. Each work-item
 * accumulates an M0 x N0 block `c` of ACC_DATA_TYPE values over the reduction
 * dimension in steps of K0 and stores the block to dst as int.
 *
 * Parameters:
 *   lhs, rhs, dst  - declared through IMAGE_DECLARATION (macro defined outside this
 *                    view); the body uses the resulting <name>_ptr,
 *                    <name>_offset_first_element_in_bytes and <name>_stride_y.
 *   k              - upper bound of the accumulation loop (advanced by K0 per iteration).
 *   lhs_stride_z, rhs_stride_z, dst_stride_z - strides multiplied by the z work-item id.
 *   dst_cross_plane_pad - only present when REINTERPRET_OUTPUT_AS_3D is defined;
 *                    forwarded to CALCULATE_Z_OFFSET.
 *
 * Build-time macros read by the body: M0, N0, K0, V0, H0, M, N, DATA_TYPE,
 * ACC_DATA_TYPE, PARTIAL_STORE_M0, PARTIAL_STORE_N0, and optionally LHS_INTERLEAVE,
 * RHS_INTERLEAVE, DUMMY_WORK_ITEMS, MATRIX_B_DEPTH, REINTERPRET_OUTPUT_AS_3D
 * (with HEIGHT_GEMM3D and DEPTH_GEMM3D).
 */
8020*c217d954SCole Faust#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
8021*c217d954SCole Faust
8022*c217d954SCole Faust__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
8023*c217d954SCole Faust                                                IMAGE_DECLARATION(rhs),
8024*c217d954SCole Faust                                                IMAGE_DECLARATION(dst),
8025*c217d954SCole Faust                                                uint k,
8026*c217d954SCole Faust                                                uint lhs_stride_z,
8027*c217d954SCole Faust                                                uint rhs_stride_z,
8028*c217d954SCole Faust                                                uint dst_stride_z
8029*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8030*c217d954SCole Faust                                                ,
8031*c217d954SCole Faust                                                uint dst_cross_plane_pad
8032*c217d954SCole Faust#endif
8033*c217d954SCole Faust                                               )
8034*c217d954SCole Faust{
8035*c217d954SCole Faust
    /* Number of elements in one K0 x M0 LHS block. */
8036*c217d954SCole Faust#define LHS_BLOCK_SIZE ((K0) * (M0))
8037*c217d954SCole Faust
    /* LHS addressing constants: with LHS_INTERLEAVE the V0 blocks sharing a row are
     * K0 apart and a block's rows are K0 * V0 apart; otherwise each block is
     * contiguous (rows K0 apart, blocks LHS_BLOCK_SIZE apart). */
8038*c217d954SCole Faust#if defined(LHS_INTERLEAVE)
8039*c217d954SCole Faust#define LHS_OFFSET_X (K0)
8040*c217d954SCole Faust#define LHS_STEP_X ((K0) * (V0))
8041*c217d954SCole Faust#define LHS_STEP_LOOP (1)
8042*c217d954SCole Faust#else
8043*c217d954SCole Faust#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
8044*c217d954SCole Faust#define LHS_STEP_X (K0)
8045*c217d954SCole Faust#define LHS_STEP_LOOP (V0)
8046*c217d954SCole Faust#endif
8047*c217d954SCole Faust
8048*c217d954SCole Faust
    /* Number of elements in one K0 x N0 RHS block. */
8049*c217d954SCole Faust#define RHS_BLOCK_SIZE ((K0) * (N0))
8050*c217d954SCole Faust
8051*c217d954SCole Faust
    /* RHS addressing constants, mirroring the LHS ones with H0 in place of V0. */
8052*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
8053*c217d954SCole Faust#define RHS_OFFSET_X (K0)
8054*c217d954SCole Faust#define RHS_STEP_X ((K0) * (H0))
8055*c217d954SCole Faust#define RHS_STEP_LOOP (1)
8056*c217d954SCole Faust#else
8057*c217d954SCole Faust#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
8058*c217d954SCole Faust#define RHS_STEP_X (K0)
8059*c217d954SCole Faust#define RHS_STEP_LOOP (H0)
8060*c217d954SCole Faust#endif
8061*c217d954SCole Faust
    /* Work-item coordinates: x selects the N0-wide column block, y the M0-high row
     * block, z the batch/plane. */
8062*c217d954SCole Faust    uint x = get_global_id(0);
8063*c217d954SCole Faust    uint y = get_global_id(1);
8064*c217d954SCole Faust    uint z = get_global_id(2);
8065*c217d954SCole Faust
    /* Work-items whose block starts at or beyond N columns / M rows do nothing. */
8066*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
8067*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
8068*c217d954SCole Faust    {
8069*c217d954SCole Faust        return;
8070*c217d954SCole Faust    }
8071*c217d954SCole Faust#endif
8072*c217d954SCole Faust
8073*c217d954SCole Faust
    /* Start of this work-item's LHS data: (y % V0) picks the block inside a reshaped
     * row, (y / V0) picks the reshaped row, z picks the plane. */
8074*c217d954SCole Faust    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
8075*c217d954SCole Faust
8076*c217d954SCole Faust
    /* Start of this work-item's RHS data, addressed with x and H0 the same way. */
8077*c217d954SCole Faust    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
8078*c217d954SCole Faust
    /* With MATRIX_B_DEPTH the RHS plane index wraps modulo MATRIX_B_DEPTH.
     * NOTE(review): rhs_addr is a DATA_TYPE pointer, so this adds rhs_stride_z
     * elements rather than bytes; the two coincide only for a 1-byte DATA_TYPE,
     * which is presumably always the case here - confirm. */
8079*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
8080*c217d954SCole Faust
8081*c217d954SCole Faust    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
8082*c217d954SCole Faust#else
8083*c217d954SCole Faust    rhs_addr += z * rhs_stride_z;
8084*c217d954SCole Faust#endif
8085*c217d954SCole Faust
    /* Zero-valued per-row z offsets handed to LOAD_BLOCK below (presumably declared
     * as zlhs0..7 and zrhs0..15 by the macro, which is defined outside this view). */
8086*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
8087*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
8088*c217d954SCole Faust
8089*c217d954SCole Faust
    /* Accumulators: M0 vectors of N0 ACC_DATA_TYPE lanes, initialised to 0. */
8090*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);
8091*c217d954SCole Faust
    /* Accumulation loop: one K0-wide slice of both operands per iteration. */
8092*c217d954SCole Faust    for(int i = 0; i < k; i += K0)
8093*c217d954SCole Faust    {
8094*c217d954SCole Faust
        /* M0 rows of K0 LHS values, rows LHS_STEP_X elements apart. */
8095*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
8096*c217d954SCole Faust
8097*c217d954SCole Faust
        /* N0 rows of K0 RHS values, rows RHS_STEP_X elements apart. */
8098*c217d954SCole Faust        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
8099*c217d954SCole Faust
8100*c217d954SCole Faust
        /* Multiply-accumulate of a and b into c (macro defined outside this view). */
8101*c217d954SCole Faust        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
8102*c217d954SCole Faust
8103*c217d954SCole Faust
        /* Advance both operands to the next K0 slice. */
8104*c217d954SCole Faust        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
8105*c217d954SCole Faust        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
8106*c217d954SCole Faust    }
8107*c217d954SCole Faust
    /* Byte address of the destination block: N0 ints per x step, M0 rows per y step. */
8108*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
8109*c217d954SCole Faust
8110*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);
8111*c217d954SCole Faust
8112*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8113*c217d954SCole Faust
    /* Output viewed as 3D: zout receives per-row offsets from CALCULATE_Z_OFFSET and
     * each z step spans DEPTH_GEMM3D planes. */
8114*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
8115*c217d954SCole Faust
8116*c217d954SCole Faust
8117*c217d954SCole Faust
8118*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
8119*c217d954SCole Faust
8120*c217d954SCole Faust#else
8121*c217d954SCole Faust
8122*c217d954SCole Faust
8123*c217d954SCole Faust    dst_addr += z * dst_stride_z;
8124*c217d954SCole Faust
8125*c217d954SCole Faust#endif
8126*c217d954SCole Faust
8127*c217d954SCole Faust
    /* True when this block reaches or passes the last row / column of the output;
     * passed to the boundary-aware store together with the partial block sizes. */
8128*c217d954SCole Faust    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
8129*c217d954SCole Faust    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
8130*c217d954SCole Faust
8131*c217d954SCole Faust
    /* Saturating conversion of the accumulators to int vectors, then the store. */
8132*c217d954SCole Faust    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
8133*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
8134*c217d954SCole Faust
    /* Drop every helper macro defined by this kernel, including the two *_STEP_LOOP
     * macros, so that none of them leaks into kernels defined later in the file. */
8135*c217d954SCole Faust#undef LHS_BLOCK_SIZE
8136*c217d954SCole Faust#undef LHS_OFFSET_X
8137*c217d954SCole Faust#undef LHS_STEP_X
#undef LHS_STEP_LOOP
8138*c217d954SCole Faust#undef RHS_BLOCK_SIZE
8139*c217d954SCole Faust#undef RHS_OFFSET_X
8140*c217d954SCole Faust#undef RHS_STEP_X
#undef RHS_STEP_LOOP
8141*c217d954SCole Faust}
8142*c217d954SCole Faust#endif
8143*c217d954SCole Faust
/*
 * gemmlowp_mm_reshaped_only_rhs_t / gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
 *
 * One kernel body compiled under one of two names: the fused name is used when
 * GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT is defined (it takes
 * precedence), otherwise the plain name. Each work-item accumulates an M0 x N0 tile
 * `c` over K in steps of K0 (plus a one-element-at-a-time tail when K % K0 != 0).
 *
 * When RESULT_OFFSET, RESULT_MULTIPLIER and RESULT_SHIFT are all defined,
 * FUSED_OUTPUT_STAGE_FIXED_POINT is set and the tile is offset-corrected, passed to
 * T_QUANTIZE8, optionally clamped, and stored as DATA_TYPE. Otherwise the tile is
 * converted (saturating) to int and stored as int.
 *
 * Parameters:
 *   lhs, rhs, dst        - IMAGE_DECLARATION arguments (macro defined outside this view).
 *   lhs_stride_z, rhs_stride_z, dst_stride_z - z strides.
 *   lhs_cross_plane_pad  - present only with REINTERPRET_INPUT_AS_3D; not referenced in this body.
 *   dst_cross_plane_pad  - present only with REINTERPRET_OUTPUT_AS_3D; not referenced in this body.
 *   sum_col              - present only with A_OFFSET.
 *   sum_row              - present only with B_OFFSET.
 *   biases               - present only with ADD_BIAS.
 *   result_multipliers, result_shifts - present only with PER_CHANNEL_QUANTIZATION.
 *
 * NOTE(review): FULL_LHS_HEIGHT, FULL_DST_HEIGHT and FUSED_OUTPUT_STAGE_FIXED_POINT are
 * not undefined at the end of this block, unlike the RHS_* helpers.
 */
8144*c217d954SCole Faust#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
8145*c217d954SCole Faust#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
8146*c217d954SCole Faust#define FUSED_OUTPUT_STAGE_FIXED_POINT
8147*c217d954SCole Faust#endif
8148*c217d954SCole Faust
8149*c217d954SCole Faust
8150*c217d954SCole Faust#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
8151*c217d954SCole Faust__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
8152*c217d954SCole Faust#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
8153*c217d954SCole Faust__kernel void gemmlowp_mm_reshaped_only_rhs_t
8154*c217d954SCole Faust#endif
8155*c217d954SCole Faust(IMAGE_DECLARATION(lhs),
8156*c217d954SCole Faust IMAGE_DECLARATION(rhs),
8157*c217d954SCole Faust IMAGE_DECLARATION(dst),
8158*c217d954SCole Faust uint lhs_stride_z,
8159*c217d954SCole Faust uint rhs_stride_z,
8160*c217d954SCole Faust uint dst_stride_z
8161*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
8162*c217d954SCole Faust ,
8163*c217d954SCole Faust uint lhs_cross_plane_pad
8164*c217d954SCole Faust#endif
8165*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8166*c217d954SCole Faust ,
8167*c217d954SCole Faust uint dst_cross_plane_pad
8168*c217d954SCole Faust#endif
8169*c217d954SCole Faust#if defined(A_OFFSET)
8170*c217d954SCole Faust ,
8171*c217d954SCole Faust IMAGE_DECLARATION(sum_col)
8172*c217d954SCole Faust#endif
8173*c217d954SCole Faust#if defined(B_OFFSET)
8174*c217d954SCole Faust ,
8175*c217d954SCole Faust IMAGE_DECLARATION(sum_row)
8176*c217d954SCole Faust#endif
8177*c217d954SCole Faust#if defined(ADD_BIAS)
8178*c217d954SCole Faust ,
8179*c217d954SCole Faust VECTOR_DECLARATION(biases)
8180*c217d954SCole Faust#endif
8181*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
8182*c217d954SCole Faust ,
8183*c217d954SCole Faust VECTOR_DECLARATION(result_multipliers),
8184*c217d954SCole Faust VECTOR_DECLARATION(result_shifts)
8185*c217d954SCole Faust#endif
8186*c217d954SCole Faust)
8187*c217d954SCole Faust{
8188*c217d954SCole Faust
    /* Rows per z-plane, derived from the z and y strides. */
8189*c217d954SCole Faust#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
8190*c217d954SCole Faust#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)
8191*c217d954SCole Faust
8192*c217d954SCole Faust
    /* RHS addressing constants: interleaved blocks are K0 apart with rows K0 * H0
     * apart; otherwise blocks are K0 * N0 apart with rows K0 apart. Each K0 step of
     * the main loop advances by N0 * K0 * H0 in either layout. */
8193*c217d954SCole Faust#if defined(RHS_INTERLEAVE)
8194*c217d954SCole Faust#define RHS_OFFSET_X (K0)
8195*c217d954SCole Faust#define RHS_STEP_X (K0 * H0)
8196*c217d954SCole Faust#else
8197*c217d954SCole Faust#define RHS_OFFSET_X (K0 * N0)
8198*c217d954SCole Faust#define RHS_STEP_X (K0)
8199*c217d954SCole Faust#endif
8200*c217d954SCole Faust#define RHS_STEP_LOOP (N0 * K0 * H0)
8201*c217d954SCole Faust
    /* Work-item coordinates via GET_SPATIAL_IDX (defined outside this view).
     * y is later used directly as a row index and xo is the first output column. */
8202*c217d954SCole Faust    uint x  = GET_SPATIAL_IDX(0, 1, 1);
8203*c217d954SCole Faust    uint y  = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
8204*c217d954SCole Faust    uint z  = GET_SPATIAL_IDX(2, 1, 1);
8205*c217d954SCole Faust    int  xo = (x * N0);
8206*c217d954SCole Faust
    /* Work-items that start outside the N x M output do nothing. */
8207*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
8208*c217d954SCole Faust    if((xo >= N) || (y >= M))
8209*c217d954SCole Faust    {
8210*c217d954SCole Faust        return;
8211*c217d954SCole Faust    }
8212*c217d954SCole Faust#endif
8213*c217d954SCole Faust
8214*c217d954SCole Faust
    /* First LHS row for this work-item, counted across all z-planes. */
8215*c217d954SCole Faust    uint lhs_y = y + z * FULL_LHS_HEIGHT;
8216*c217d954SCole Faust
8217*c217d954SCole Faust
    /* RHS start: (x % H0) selects the block within a reshaped row, (x / H0) the row. */
8218*c217d954SCole Faust    uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
8219*c217d954SCole Faust    uint rhs_offset_y = (x / H0) * rhs_stride_y;
8220*c217d954SCole Faust
    /* With MATRIX_B_DEPTH the RHS plane index wraps modulo MATRIX_B_DEPTH. */
8221*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
8222*c217d954SCole Faust
8223*c217d954SCole Faust    rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
8224*c217d954SCole Faust#else
8225*c217d954SCole Faust    rhs_offset_y += z * rhs_stride_z;
8226*c217d954SCole Faust#endif
8227*c217d954SCole Faust
8228*c217d954SCole Faust
    /* Accumulator tile, M0 rows of N0 lanes, cleared row by row. */
8229*c217d954SCole Faust    TILE(ACC_DATA_TYPE, M0, N0, c);
8230*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8231*c217d954SCole Faust    {
8232*c217d954SCole Faust        c[i].v = 0;
8233*c217d954SCole Faust    })
8234*c217d954SCole Faust
    /* Main loop over full K0-wide slices; i is kept after the loop for the tail. */
8235*c217d954SCole Faust    int i = 0;
8236*c217d954SCole Faust    for(; i <= (K - K0); i += K0)
8237*c217d954SCole Faust    {
8238*c217d954SCole Faust        TILE(DATA_TYPE, M0, K0, a);
8239*c217d954SCole Faust        TILE(DATA_TYPE, N0, K0, b);
8240*c217d954SCole Faust
8241*c217d954SCole Faust
        /* LHS tile read at column i, starting at row lhs_y. */
8242*c217d954SCole Faust        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
8243*c217d954SCole Faust
8244*c217d954SCole Faust
        /* RHS tile: row _i is a K0-wide vector load RHS_STEP_X past the previous row. */
8245*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, N0,
8246*c217d954SCole Faust        {
8247*c217d954SCole Faust            b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
8248*c217d954SCole Faust        })
8249*c217d954SCole Faust
8250*c217d954SCole Faust
        /* Multiply-accumulate of a (NT) and b (T) into c; macro defined outside this view. */
8251*c217d954SCole Faust        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
8252*c217d954SCole Faust
8253*c217d954SCole Faust        rhs_offset_x += RHS_STEP_LOOP;
8254*c217d954SCole Faust    }
8255*c217d954SCole Faust
    /* Tail, compiled only when K is not a multiple of K0: the remaining columns are
     * processed one at a time with 1-wide tiles and scalar RHS loads. */
8256*c217d954SCole Faust#if((K % K0) != 0)
8257*c217d954SCole Faust
8258*c217d954SCole Faust
8259*c217d954SCole Faust    for(; i < K; ++i)
8260*c217d954SCole Faust    {
8261*c217d954SCole Faust        TILE(DATA_TYPE, M0, 1, a);
8262*c217d954SCole Faust        TILE(DATA_TYPE, N0, 1, b);
8263*c217d954SCole Faust
8264*c217d954SCole Faust
8265*c217d954SCole Faust        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
8266*c217d954SCole Faust
8267*c217d954SCole Faust        LOOP_UNROLLING(int, _i, 0, 1, N0,
8268*c217d954SCole Faust        {
8269*c217d954SCole Faust            b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
8270*c217d954SCole Faust        })
8271*c217d954SCole Faust
8272*c217d954SCole Faust        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
8273*c217d954SCole Faust
8274*c217d954SCole Faust        rhs_offset_x += 1;
8275*c217d954SCole Faust    }
8276*c217d954SCole Faust#endif
8277*c217d954SCole Faust
8278*c217d954SCole Faust#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
8279*c217d954SCole Faust
    /* Fused output stage: c_int holds the int accumulators, offset_s32 collects every
     * additive correction and starts at the constant K_OFFSET. */
8280*c217d954SCole Faust    TILE(int, M0, N0, c_int);
8281*c217d954SCole Faust    TILE(int, M0, N0, offset_s32);
8282*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8283*c217d954SCole Faust    {
8284*c217d954SCole Faust        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
8285*c217d954SCole Faust    })
8286*c217d954SCole Faust
8287*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8288*c217d954SCole Faust    {
8289*c217d954SCole Faust        c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
8290*c217d954SCole Faust    })
8291*c217d954SCole Faust
8292*c217d954SCole Faust#if defined(A_OFFSET)
8293*c217d954SCole Faust
    /* A_OFFSET term: one row of N0 values from sum_col (row z when
     * SUM_COL_HAS_BATCHES is defined, row 0 otherwise), scaled by A_OFFSET and
     * added to every row of offset_s32. */
8294*c217d954SCole Faust#if defined(SUM_COL_HAS_BATCHES)
8295*c217d954SCole Faust    int sum_col_y = z;
8296*c217d954SCole Faust#else
8297*c217d954SCole Faust    int sum_col_y = 0;
8298*c217d954SCole Faust#endif
8299*c217d954SCole Faust    TILE(int, 1, N0, a_offset_s32);
8300*c217d954SCole Faust
8301*c217d954SCole Faust    T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);
8302*c217d954SCole Faust
8303*c217d954SCole Faust    a_offset_s32[0].v *= A_OFFSET;
8304*c217d954SCole Faust
8305*c217d954SCole Faust    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
8306*c217d954SCole Faust#endif
8307*c217d954SCole Faust
8308*c217d954SCole Faust#if defined(B_OFFSET)
8309*c217d954SCole Faust
8310*c217d954SCole Faust
8311*c217d954SCole Faust
8312*c217d954SCole Faust
    /* B_OFFSET term: M0 values loaded from sum_row (one per output row), each scaled
     * by B_OFFSET and added to the matching row of offset_s32.
     * NOTE(review): the tile is declared M0 x N0 but loaded as M0 x 1; the add below
     * presumably relies on scalar-to-vector broadcasting - confirm against TILE/T_LOAD. */
8313*c217d954SCole Faust    TILE(int, M0, N0, b_offset_s32);
8314*c217d954SCole Faust
8315*c217d954SCole Faust    T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);
8316*c217d954SCole Faust
8317*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8318*c217d954SCole Faust    {
8319*c217d954SCole Faust        offset_s32[i].v += b_offset_s32[i].v *B_OFFSET;
8320*c217d954SCole Faust    })
8321*c217d954SCole Faust
8322*c217d954SCole Faust#endif
8323*c217d954SCole Faust
8324*c217d954SCole Faust#if defined(ADD_BIAS)
8325*c217d954SCole Faust
    /* Bias term: N0 ints read from biases at column xo, added to every row. */
8326*c217d954SCole Faust    TILE(int, 1, N0, bias);
8327*c217d954SCole Faust
8328*c217d954SCole Faust    T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
8329*c217d954SCole Faust
8330*c217d954SCole Faust    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
8331*c217d954SCole Faust#endif
8332*c217d954SCole Faust
    /* Apply the collected corrections to the accumulators. */
8333*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8334*c217d954SCole Faust    {
8335*c217d954SCole Faust        c_int[i].v += offset_s32[i].v;
8336*c217d954SCole Faust    })
8337*c217d954SCole Faust
    /* Output tile in DATA_TYPE, filled by T_QUANTIZE8 (defined outside this view). */
8338*c217d954SCole Faust    TILE(DATA_TYPE, M0, N0, c_lp);
8339*c217d954SCole Faust
8340*c217d954SCole Faust
    /* Per-channel mode loads N0 multipliers and shifts at column xo and passes them
     * to T_QUANTIZE8; per-tensor mode passes 0 in their place. */
8341*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
8342*c217d954SCole Faust    TILE(int, 1, N0, res_mul);
8343*c217d954SCole Faust    TILE(int, 1, N0, res_shift);
8344*c217d954SCole Faust
8345*c217d954SCole Faust    T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
8346*c217d954SCole Faust    T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);
8347*c217d954SCole Faust
8348*c217d954SCole Faust    T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
8349*c217d954SCole Faust#else
8350*c217d954SCole Faust    T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
8351*c217d954SCole Faust#endif
8352*c217d954SCole Faust
    /* Optional clamping of the quantized values to [MIN_BOUND, MAX_BOUND]. */
8353*c217d954SCole Faust#if defined(MIN_BOUND)
8354*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8355*c217d954SCole Faust    {
8356*c217d954SCole Faust        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
8357*c217d954SCole Faust    })
8358*c217d954SCole Faust#endif
8359*c217d954SCole Faust#if defined(MAX_BOUND)
8360*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8361*c217d954SCole Faust    {
8362*c217d954SCole Faust        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
8363*c217d954SCole Faust    })
8364*c217d954SCole Faust#endif
8365*c217d954SCole Faust
8366*c217d954SCole Faust#else
    /* No fused stage: the output tile is the saturating int conversion of c. */
8367*c217d954SCole Faust    TILE(int, M0, N0, c_lp);
8368*c217d954SCole Faust
8369*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8370*c217d954SCole Faust    {
8371*c217d954SCole Faust        c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
8372*c217d954SCole Faust    })
8373*c217d954SCole Faust#endif
8374*c217d954SCole Faust
    /* Destination row index for each of the M0 tile rows. In the 3D case the row is
     * split into (row % HEIGHT_GEMM3D) plus a clamped plane index times
     * FULL_DST_HEIGHT; otherwise rows are clamped to M - 1, so rows past the end
     * map onto the last valid row. z then selects the batch. */
8375*c217d954SCole Faust    TILE(uint, M0, 1, dst_indirect_y);
8376*c217d954SCole Faust
8377*c217d954SCole Faust    LOOP_UNROLLING(int, i, 0, 1, M0,
8378*c217d954SCole Faust    {
8379*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8380*c217d954SCole Faust        dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
8381*c217d954SCole Faust        dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
8382*c217d954SCole Faust        dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D;
8383*c217d954SCole Faust#else
8384*c217d954SCole Faust        dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z *FULL_DST_HEIGHT;
8385*c217d954SCole Faust#endif
8386*c217d954SCole Faust    })
8387*c217d954SCole Faust
    /* True only for a tile that extends past column N and when a partial width
     * (PARTIAL_STORE_N0) is configured; selects the partial-width store. */
8388*c217d954SCole Faust    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);
8389*c217d954SCole Faust
    /* The element type of the store matches the type of c_lp chosen above. */
8390*c217d954SCole Faust#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
8391*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
8392*c217d954SCole Faust#else
8393*c217d954SCole Faust    T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
8394*c217d954SCole Faust#endif
8395*c217d954SCole Faust
8396*c217d954SCole Faust#undef RHS_OFFSET_X
8397*c217d954SCole Faust#undef RHS_STEP_X
8398*c217d954SCole Faust#undef RHS_STEP_LOOP
8399*c217d954SCole Faust}
8400*c217d954SCole Faust#endif
8401*c217d954SCole Faust
/*
 * gemmlowp_mm_native
 *
 * Compiled only when GEMMLOWP_MM_NATIVE is defined. Each work-item accumulates an
 * M0 x N0 block `c` of ACC_DATA_TYPE values from un-reshaped operands: K0 columns
 * of lhs and K0 rows of rhs per main-loop iteration, then one column/row at a time
 * for the remainder. The block is converted to int and stored to dst.
 *
 * Parameters:
 *   lhs, rhs, dst        - IMAGE_DECLARATION arguments (macro defined outside this view).
 *   lhs_stride_z, rhs_stride_z, dst_stride_z - z strides.
 *   lhs_cross_plane_pad  - present only with REINTERPRET_INPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
 *   dst_cross_plane_pad  - present only with REINTERPRET_OUTPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
 *
 * Build-time macros read by the body: M0, N0, K0, K, M, N, DATA_TYPE, ACC_DATA_TYPE,
 * PARTIAL_STORE_M0, PARTIAL_STORE_N0, GPU_ARCH, GPU_ARCH_MIDGARD, and optionally
 * DUMMY_WORK_ITEMS, MATRIX_B_DEPTH, REINTERPRET_INPUT_AS_3D, REINTERPRET_OUTPUT_AS_3D.
 */
8402*c217d954SCole Faust#if defined(GEMMLOWP_MM_NATIVE)
8403*c217d954SCole Faust
8404*c217d954SCole Faust
8405*c217d954SCole Faust__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
8406*c217d954SCole Faust                                 IMAGE_DECLARATION(rhs),
8407*c217d954SCole Faust                                 IMAGE_DECLARATION(dst),
8408*c217d954SCole Faust                                 uint lhs_stride_z,
8409*c217d954SCole Faust                                 uint rhs_stride_z,
8410*c217d954SCole Faust                                 uint dst_stride_z
8411*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
8412*c217d954SCole Faust                                 ,
8413*c217d954SCole Faust                                 uint lhs_cross_plane_pad
8414*c217d954SCole Faust#endif
8415*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8416*c217d954SCole Faust                                 ,
8417*c217d954SCole Faust                                 uint dst_cross_plane_pad
8418*c217d954SCole Faust#endif
8419*c217d954SCole Faust                                )
8420*c217d954SCole Faust{
    /* Work-item coordinates: x = column block, y = row block, z = batch/plane. */
8421*c217d954SCole Faust    uint x = get_global_id(0);
8422*c217d954SCole Faust    uint y = get_global_id(1);
8423*c217d954SCole Faust    uint z = get_global_id(2);
8424*c217d954SCole Faust
    /* Work-items whose block starts at or beyond N columns / M rows do nothing. */
8425*c217d954SCole Faust#if defined(DUMMY_WORK_ITEMS)
8426*c217d954SCole Faust    if((x * N0 >= N) || (y * M0 >= M))
8427*c217d954SCole Faust    {
8428*c217d954SCole Faust        return;
8429*c217d954SCole Faust    }
8430*c217d954SCole Faust#endif
8431*c217d954SCole Faust
8432*c217d954SCole Faust
    /* Byte offset of the first LHS row handled here; the start row comes from
     * COMPUTE_M0_START_ROW (defined outside this view). */
8433*c217d954SCole Faust    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
8434*c217d954SCole Faust
8435*c217d954SCole Faust
    /* Byte offset of the first RHS column handled here. */
8436*c217d954SCole Faust    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
8437*c217d954SCole Faust
    /* With MATRIX_B_DEPTH the RHS plane index wraps modulo MATRIX_B_DEPTH. */
8438*c217d954SCole Faust#if defined(MATRIX_B_DEPTH)
8439*c217d954SCole Faust
8440*c217d954SCole Faust    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
8441*c217d954SCole Faust#else
8442*c217d954SCole Faust    rhs_offset += z * rhs_stride_z;
8443*c217d954SCole Faust#endif
8444*c217d954SCole Faust
    /* Per-row z offsets for LOAD_BLOCK, zero unless overwritten below. */
8445*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
8446*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
8447*c217d954SCole Faust
8448*c217d954SCole Faust#if defined(REINTERPRET_INPUT_AS_3D)
8449*c217d954SCole Faust
    /* Input viewed as 3D: zlhs receives per-row offsets from CALCULATE_Z_OFFSET and
     * each z step spans DEPTH_GEMM3D planes. */
8450*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
8451*c217d954SCole Faust
8452*c217d954SCole Faust
8453*c217d954SCole Faust
8454*c217d954SCole Faust    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
8455*c217d954SCole Faust
8456*c217d954SCole Faust#else
8457*c217d954SCole Faust
8458*c217d954SCole Faust
8459*c217d954SCole Faust    lhs_offset += z * lhs_stride_z;
8460*c217d954SCole Faust
8461*c217d954SCole Faust#endif
8462*c217d954SCole Faust
8463*c217d954SCole Faust
    /* Accumulators: M0 vectors of N0 ACC_DATA_TYPE lanes, initialised to 0. */
8464*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);
8465*c217d954SCole Faust
8466*c217d954SCole Faust    int i = 0;
8467*c217d954SCole Faust
    /* Main loop over full K0-wide slices; i carries over into the tail loop. */
8468*c217d954SCole Faust    for(; i <= (K - K0); i += K0)
8469*c217d954SCole Faust    {
8470*c217d954SCole Faust
        /* M0 rows x K0 columns of LHS. */
8471*c217d954SCole Faust        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
8472*c217d954SCole Faust
8473*c217d954SCole Faust
        /* K0 rows x N0 columns of RHS. */
8474*c217d954SCole Faust        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
8475*c217d954SCole Faust
8476*c217d954SCole Faust
        /* Midgard uses the ARM_MM_NATIVE_* macro family on b as loaded; other
         * architectures transpose b into b_t first and use ARM_MM_K0XN0XM0. */
8477*c217d954SCole Faust#if(GPU_ARCH == GPU_ARCH_MIDGARD)
8478*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
8479*c217d954SCole Faust#else
8480*c217d954SCole Faust
8481*c217d954SCole Faust        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
8482*c217d954SCole Faust
8483*c217d954SCole Faust        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
8484*c217d954SCole Faust#endif
8485*c217d954SCole Faust
8486*c217d954SCole Faust
        /* Advance K0 columns in LHS and K0 rows in RHS.
         * NOTE(review): lhs_offset is a byte offset advanced by K0 without
         * sizeof(DATA_TYPE); presumably DATA_TYPE is 1 byte - confirm. */
8487*c217d954SCole Faust        lhs_offset += K0;
8488*c217d954SCole Faust        rhs_offset += K0 * rhs_stride_y;
8489*c217d954SCole Faust    }
8490*c217d954SCole Faust
8491*c217d954SCole Faust
    /* Tail loop: remaining columns/rows processed one at a time. */
8492*c217d954SCole Faust    for(; i < K; ++i)
8493*c217d954SCole Faust    {
8494*c217d954SCole Faust
8495*c217d954SCole Faust        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
8496*c217d954SCole Faust
8497*c217d954SCole Faust
8498*c217d954SCole Faust        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
8499*c217d954SCole Faust
8500*c217d954SCole Faust
8501*c217d954SCole Faust#if(GPU_ARCH == GPU_ARCH_MIDGARD)
8502*c217d954SCole Faust        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
8503*c217d954SCole Faust#else
8504*c217d954SCole Faust
8505*c217d954SCole Faust        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
8506*c217d954SCole Faust
8507*c217d954SCole Faust        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
8508*c217d954SCole Faust#endif
8509*c217d954SCole Faust
8510*c217d954SCole Faust
8511*c217d954SCole Faust        lhs_offset += 1;
8512*c217d954SCole Faust        rhs_offset += rhs_stride_y;
8513*c217d954SCole Faust    }
8514*c217d954SCole Faust
    /* Byte address of the destination block: N0 ints per x step, rows starting at
     * the same start row used for the LHS. */
8515*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
8516*c217d954SCole Faust
8517*c217d954SCole Faust    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
8518*c217d954SCole Faust
8519*c217d954SCole Faust#if defined(REINTERPRET_OUTPUT_AS_3D)
8520*c217d954SCole Faust
    /* Output viewed as 3D: zout receives per-row offsets from CALCULATE_Z_OFFSET and
     * each z step spans DEPTH_GEMM3D planes. */
8521*c217d954SCole Faust    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
8522*c217d954SCole Faust
8523*c217d954SCole Faust
8524*c217d954SCole Faust
8525*c217d954SCole Faust    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
8526*c217d954SCole Faust
8527*c217d954SCole Faust#else
8528*c217d954SCole Faust
8529*c217d954SCole Faust
8530*c217d954SCole Faust    dst_addr += z * dst_stride_z;
8531*c217d954SCole Faust
8532*c217d954SCole Faust#endif
    /* Store conditions: cond_y is true only for the first row block (y == 0);
     * presumably COMPUTE_M0_START_ROW places the partial block there - confirm.
     * cond_x is true when the block reaches or passes column N. */
8533*c217d954SCole Faust    const bool cond_y = y == 0;
8534*c217d954SCole Faust    const bool cond_x = ((x + 1) * N0 >= N);
8535*c217d954SCole Faust
8536*c217d954SCole Faust
    /* Convert the accumulators to int vectors (REPEAT_VAR_INIT_CONVERT, the
     * non-_SAT variant) and store them. */
8537*c217d954SCole Faust    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res);
8538*c217d954SCole Faust    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
8539*c217d954SCole Faust}
8540*c217d954SCole Faust#endif
8541*c217d954SCole Faust
/*
 * gemmlowp_matrix_a_reduction
 *
 * Compiled only when GEMMLOWP_MATRIX_A_REDUCTION is defined. Each work-item sums the
 * COLS_A elements of one row of src (row selected by get_global_id(0), plane by
 * get_global_id(1)), optionally multiplies the sum by SCALAR, and writes it as a
 * single int to dst.
 *
 * Parameters:
 *   src - TENSOR3D_DECLARATION argument (macro defined outside this view).
 *   dst - IMAGE_DECLARATION argument; dst.ptr is the write location produced by
 *         CONVERT_TO_IMAGE_STRUCT.
 */
8542*c217d954SCole Faust#if defined(GEMMLOWP_MATRIX_A_REDUCTION)
8543*c217d954SCole Faust
8544*c217d954SCole Faust__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
8545*c217d954SCole Faust                                          IMAGE_DECLARATION(dst))
8546*c217d954SCole Faust{
8547*c217d954SCole Faust
8548*c217d954SCole Faust    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
8549*c217d954SCole Faust    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
8550*c217d954SCole Faust
    /* sum_row_32 holds four partial sums for the vectorised loop; sum_row is the
     * scalar total. */
8551*c217d954SCole Faust    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
8552*c217d954SCole Faust    sum_row_32            = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
8553*c217d954SCole Faust    ACC_DATA_TYPE sum_row = 0;
8554*c217d954SCole Faust
    /* Start of the row to reduce. */
8555*c217d954SCole Faust    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
8556*c217d954SCole Faust
8557*c217d954SCole Faust    int i = 0;
8558*c217d954SCole Faust
8559*c217d954SCole Faust
    /* Vectorised part: 16 elements per iteration, folded into the four lanes of
     * sum_row_32 as four widened groups of four. */
8560*c217d954SCole Faust    for(; i <= ((int)COLS_A - 16); i += 16)
8561*c217d954SCole Faust    {
8562*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
8563*c217d954SCole Faust
8564*c217d954SCole Faust        sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
8565*c217d954SCole Faust                      VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
8566*c217d954SCole Faust    }
8567*c217d954SCole Faust
8568*c217d954SCole Faust
    /* Scalar tail for the remaining (fewer than 16) elements. */
8569*c217d954SCole Faust    for(; i < COLS_A; ++i)
8570*c217d954SCole Faust    {
8571*c217d954SCole Faust        sum_row += (ACC_DATA_TYPE)matrix_a[i];
8572*c217d954SCole Faust    }
8573*c217d954SCole Faust
    /* Horizontal add of the four vector lanes into the scalar total. */
8574*c217d954SCole Faust    sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
8575*c217d954SCole Faust
8576*c217d954SCole Faust#if defined(SCALAR)
8577*c217d954SCole Faust    sum_row *= (int)SCALAR;
8578*c217d954SCole Faust#endif
8579*c217d954SCole Faust    *((__global int *)dst.ptr) = (int)sum_row;
8580*c217d954SCole Faust}
8581*c217d954SCole Faust#endif
8582*c217d954SCole Faust
/*
 * gemmlowp_matrix_a_reduction_dot8
 *
 * Compiled only when GEMMLOWP_MATRIX_A_REDUCTION_DOT8 is defined. Same contract as
 * the non-dot8 row reduction: each work-item sums the COLS_A elements of one row of
 * src (row from get_global_id(0), plane from get_global_id(1)), optionally
 * multiplies by SCALAR, and writes one int to dst. The vectorised part accumulates
 * through DOT_PRODUCT4_INTEGER8 against a vector of ones (macro defined outside this
 * view; presumably it adds the 4-element dot product into its last argument).
 *
 * Parameters:
 *   src - TENSOR3D_DECLARATION argument (macro defined outside this view).
 *   dst - IMAGE_DECLARATION argument; dst.ptr is the write location.
 */
8583*c217d954SCole Faust#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
8584*c217d954SCole Faust
8585*c217d954SCole Faust__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
8586*c217d954SCole Faust                                               IMAGE_DECLARATION(dst))
8587*c217d954SCole Faust{
8588*c217d954SCole Faust
8589*c217d954SCole Faust    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
8590*c217d954SCole Faust    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
8591*c217d954SCole Faust
8592*c217d954SCole Faust    ACC_DATA_TYPE sum_row = 0;
8593*c217d954SCole Faust
    /* Start of the row to reduce. */
8594*c217d954SCole Faust    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
8595*c217d954SCole Faust
8596*c217d954SCole Faust    int i = 0;
8597*c217d954SCole Faust
8598*c217d954SCole Faust
    /* Vectorised part: 32 elements per iteration, read as two 16-wide loads
     * (vload16 offsets 0 and 1) and consumed four lanes at a time. */
8599*c217d954SCole Faust    for(; i <= ((int)COLS_A - 32); i += 32)
8600*c217d954SCole Faust    {
8601*c217d954SCole Faust        VEC_DATA_TYPE(DATA_TYPE, 16)
8602*c217d954SCole Faust        a0 = vload16(0, matrix_a + i);
8603*c217d954SCole Faust
8604*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8605*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8606*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8607*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8608*c217d954SCole Faust
        /* Second half of the 32-element slice. */
8609*c217d954SCole Faust        a0 = vload16(1, matrix_a + i);
8610*c217d954SCole Faust
8611*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8612*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8613*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8614*c217d954SCole Faust        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
8615*c217d954SCole Faust    }
8616*c217d954SCole Faust
8617*c217d954SCole Faust
    /* Scalar tail for the remaining (fewer than 32) elements. */
8618*c217d954SCole Faust    for(; i < COLS_A; ++i)
8619*c217d954SCole Faust    {
8620*c217d954SCole Faust        sum_row += (ACC_DATA_TYPE)matrix_a[i];
8621*c217d954SCole Faust    }
8622*c217d954SCole Faust
8623*c217d954SCole Faust#if defined(SCALAR)
8624*c217d954SCole Faust    sum_row *= (int)SCALAR;
8625*c217d954SCole Faust#endif
8626*c217d954SCole Faust    *((__global int *)dst.ptr) = (int)sum_row;
8627*c217d954SCole Faust}
8628*c217d954SCole Faust#endif
8629*c217d954SCole Faust
8630*c217d954SCole Faust#if defined(GEMMLOWP_MATRIX_B_REDUCTION)
8631*c217d954SCole Faust
/**
 * Column-wise reduction kernel: for VEC_SIZE adjacent columns of `src`, adds up
 * ROWS_B values taken src_stride_y bytes apart and stores the sums as int in `dst`.
 *
 * Build-time macros read here: DATA_TYPE, ACC_DATA_TYPE, VEC_SIZE,
 * VEC_SIZE_LEFTOVER, ROWS_B and, optionally, SCALAR (multiplies the sums).
 *
 * Work-item mapping: global id 0 selects the column group, global id 1 selects
 * the source plane/row offset and the destination row.
 */
8632*c217d954SCole Faust__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
8633*c217d954SCole Faust                                          IMAGE_DECLARATION(dst))
8634*c217d954SCole Faust{
8635*c217d954SCole Faust
    // First column handled by this work-item: the regular start (id * VEC_SIZE) is moved
    // back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped at 0.
    // NOTE(review): presumably this keeps every full-width access inside the row when the
    // width is not a multiple of VEC_SIZE - confirm against STORE_VECTOR_SELECT.
8636*c217d954SCole Faust    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
8637*c217d954SCole Faust    const uint y      = get_global_id(1);
8638*c217d954SCole Faust
    // Source pointer: x_offs elements into the row, displaced by y * src_step_y + y * src_stride_z.
    // Destination pointer: x_offs ints into row y of dst.
8639*c217d954SCole Faust    __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
8640*c217d954SCole Faust    __global uchar *dst_addr           = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;
8641*c217d954SCole Faust
    // Per-column accumulator, held in ACC_DATA_TYPE.
8642*c217d954SCole Faust    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
8643*c217d954SCole Faust    sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;
8644*c217d954SCole Faust
8645*c217d954SCole Faust    int i = 0;
8646*c217d954SCole Faust
    // Main loop: four rows per iteration.
8647*c217d954SCole Faust    for(; i <= ((int)ROWS_B - 4); i += 4)
8648*c217d954SCole Faust    {
8649*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
8650*c217d954SCole Faust        b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
8651*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
8652*c217d954SCole Faust        b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
8653*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
8654*c217d954SCole Faust        b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
8655*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
8656*c217d954SCole Faust        b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);
8657*c217d954SCole Faust
        // Each row is converted to the accumulator type before it is added.
8658*c217d954SCole Faust        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3,
8659*c217d954SCole Faust                      VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
8660*c217d954SCole Faust
8661*c217d954SCole Faust        matrix_b += 4 * src_stride_y;
8662*c217d954SCole Faust    }
8663*c217d954SCole Faust
8664*c217d954SCole Faust
    // Tail loop: the remaining (ROWS_B % 4) rows, one at a time.
8665*c217d954SCole Faust    for(; i < (int)ROWS_B; ++i)
8666*c217d954SCole Faust    {
8667*c217d954SCole Faust        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
8668*c217d954SCole Faust        b0 = VLOAD(VEC_SIZE)(0, matrix_b);
8669*c217d954SCole Faust
8670*c217d954SCole Faust        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
8671*c217d954SCole Faust
8672*c217d954SCole Faust        matrix_b += src_stride_y;
8673*c217d954SCole Faust    }
8674*c217d954SCole Faust
    // Optional scaling of the sums by the build-time constant SCALAR.
8675*c217d954SCole Faust#if defined(SCALAR)
8676*c217d954SCole Faust    sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
8677*c217d954SCole Faust#endif
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
8678*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
8679*c217d954SCole Faust    res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE));
8680*c217d954SCole Faust
    // NOTE(review): the last argument is true only for work-item 0 when VEC_SIZE_LEFTOVER != 0;
    // presumably that work-item stores only VEC_SIZE_LEFTOVER lanes - confirm in the macro definition.
8681*c217d954SCole Faust    STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
8682*c217d954SCole Faust}
8683*c217d954SCole Faust#endif
8684*c217d954SCole Faust
8685*c217d954SCole Faust#endif
8686*c217d954SCole Faust
8687*c217d954SCole Faust#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
8688*c217d954SCole Faust
// Shorthand for an int vector with VEC_SIZE lanes; undefined again at the end of this guarded section.
8689*c217d954SCole Faust#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
8690*c217d954SCole Faust
8691*c217d954SCole Faust
/**
 * Builds the VEC_SIZE-wide int term K_OFFSET + a_offset + b_offset for the
 * output position (x, y, z).
 *
 * x: first column of the vector (used to index sum_col and biases).
 * y: row (used to index sum_row).
 * z: third index; it is the batch index, divided by DEPTH_INPUT3D when that
 *    macro is defined.
 *
 * Optional inputs, each present only when its macro is defined:
 *   A_OFFSET -> sum_col, B_OFFSET -> sum_row, ADD_BIAS -> biases.
 * When a macro is absent the corresponding contribution stays zero.
 */
8692*c217d954SCole Faustinline VEC_INT offset_contribution(
8693*c217d954SCole Faust    int x,
8694*c217d954SCole Faust    int y,
8695*c217d954SCole Faust    int z
8696*c217d954SCole Faust#if defined(A_OFFSET)
8697*c217d954SCole Faust    ,
8698*c217d954SCole Faust    IMAGE_DECLARATION(sum_col)
8699*c217d954SCole Faust#endif
8700*c217d954SCole Faust#if defined(B_OFFSET)
8701*c217d954SCole Faust    ,
8702*c217d954SCole Faust    IMAGE_DECLARATION(sum_row)
8703*c217d954SCole Faust#endif
8704*c217d954SCole Faust#if defined(ADD_BIAS)
8705*c217d954SCole Faust    ,
8706*c217d954SCole Faust    VECTOR_DECLARATION(biases)
8707*c217d954SCole Faust#endif
8708*c217d954SCole Faust)
8709*c217d954SCole Faust{
8710*c217d954SCole Faust    VEC_INT a_offset_s32 = (VEC_INT)0;
8711*c217d954SCole Faust    VEC_INT b_offset_s32 = (VEC_INT)0;
8712*c217d954SCole Faust
8713*c217d954SCole Faust    int batch_id = z;
8714*c217d954SCole Faust#if defined(DEPTH_INPUT3D)
8715*c217d954SCole Faust    batch_id /= (int)DEPTH_INPUT3D;
8716*c217d954SCole Faust#endif
8717*c217d954SCole Faust
8718*c217d954SCole Faust#if defined(A_OFFSET)
8719*c217d954SCole Faust
    // a_offset = A_OFFSET * sum_col[x .. x + VEC_SIZE - 1]
8720*c217d954SCole Faust    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
8721*c217d954SCole Faust
8722*c217d954SCole Faust
    // With SUM_COL_HAS_BATCHES the row batch_id of sum_col is used; otherwise always the first row.
8723*c217d954SCole Faust#if defined(SUM_COL_HAS_BATCHES)
8724*c217d954SCole Faust    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
8725*c217d954SCole Faust#else
8726*c217d954SCole Faust    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
8727*c217d954SCole Faust#endif
8728*c217d954SCole Faust
8729*c217d954SCole Faust    a_offset_s32 *= (VEC_INT)A_OFFSET;
8730*c217d954SCole Faust#endif
8731*c217d954SCole Faust
8732*c217d954SCole Faust#if defined(B_OFFSET)
8733*c217d954SCole Faust
    // b_offset = B_OFFSET * (one sum_row value broadcast to every lane)
8734*c217d954SCole Faust    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
8735*c217d954SCole Faust
8736*c217d954SCole Faust
    // When both HEIGHT_INPUT3D and DEPTH_INPUT3D are defined, the element index is advanced by
    // (z % DEPTH_INPUT3D) * HEIGHT_INPUT3D ints in addition to y.
8737*c217d954SCole Faust#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
8738*c217d954SCole Faust    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
8739*c217d954SCole Faust#else
8740*c217d954SCole Faust    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
8741*c217d954SCole Faust#endif
8742*c217d954SCole Faust    b_offset_s32 *= (VEC_INT)B_OFFSET;
8743*c217d954SCole Faust#endif
8744*c217d954SCole Faust
8745*c217d954SCole Faust#if defined(ADD_BIAS)
8746*c217d954SCole Faust
    // The bias vector is read as int at column x and folded into b_offset_s32.
8747*c217d954SCole Faust    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
8748*c217d954SCole Faust
8749*c217d954SCole Faust    VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
8750*c217d954SCole Faust    b_offset_s32 += (VEC_INT)biases_values;
8751*c217d954SCole Faust#endif
8752*c217d954SCole Faust
8753*c217d954SCole Faust    return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
8754*c217d954SCole Faust}
8755*c217d954SCole Faust
8756*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
8757*c217d954SCole Faust
/**
 * Adds the offset term returned by offset_contribution() to `mm_result`, in place.
 *
 * Each work-item reads VEC_SIZE ints of mm_result at (x, y, z), adds the term,
 * and writes the sums back to the same address.
 * sum_col / sum_row / biases are kernel arguments only when A_OFFSET / B_OFFSET /
 * ADD_BIAS are defined, and are forwarded field by field to offset_contribution().
 */
8758*c217d954SCole Faust__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
8759*c217d954SCole Faust#if defined(A_OFFSET)
8760*c217d954SCole Faust                                           ,
8761*c217d954SCole Faust                                           IMAGE_DECLARATION(sum_col)
8762*c217d954SCole Faust#endif
8763*c217d954SCole Faust#if defined(B_OFFSET)
8764*c217d954SCole Faust                                           ,
8765*c217d954SCole Faust                                           IMAGE_DECLARATION(sum_row)
8766*c217d954SCole Faust#endif
8767*c217d954SCole Faust#if defined(ADD_BIAS)
8768*c217d954SCole Faust                                           ,
8769*c217d954SCole Faust                                           VECTOR_DECLARATION(biases)
8770*c217d954SCole Faust#endif
8771*c217d954SCole Faust                                          )
8772*c217d954SCole Faust{
    // x: first column for this work-item, moved back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE
    // and clamped at 0; y and z come straight from global ids 1 and 2.
8773*c217d954SCole Faust    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
8774*c217d954SCole Faust    const int y = get_global_id(1);
8775*c217d954SCole Faust    const int z = get_global_id(2);
8776*c217d954SCole Faust
8777*c217d954SCole Faust
8778*c217d954SCole Faust    VEC_INT offset_term_s32 = offset_contribution(
8779*c217d954SCole Faust                                  x, y, z
8780*c217d954SCole Faust#if defined(A_OFFSET)
8781*c217d954SCole Faust                                  ,
8782*c217d954SCole Faust                                  sum_col_ptr,
8783*c217d954SCole Faust                                  sum_col_stride_x,
8784*c217d954SCole Faust                                  sum_col_step_x,
8785*c217d954SCole Faust                                  sum_col_stride_y,
8786*c217d954SCole Faust                                  sum_col_step_y,
8787*c217d954SCole Faust                                  sum_col_offset_first_element_in_bytes
8788*c217d954SCole Faust#endif
8789*c217d954SCole Faust#if defined(B_OFFSET)
8790*c217d954SCole Faust                                  ,
8791*c217d954SCole Faust                                  sum_row_ptr,
8792*c217d954SCole Faust                                  sum_row_stride_x,
8793*c217d954SCole Faust                                  sum_row_step_x,
8794*c217d954SCole Faust                                  sum_row_stride_y,
8795*c217d954SCole Faust                                  sum_row_step_y,
8796*c217d954SCole Faust                                  sum_row_offset_first_element_in_bytes
8797*c217d954SCole Faust#endif
8798*c217d954SCole Faust#if defined(ADD_BIAS)
8799*c217d954SCole Faust                                  ,
8800*c217d954SCole Faust                                  biases_ptr,
8801*c217d954SCole Faust                                  biases_stride_x,
8802*c217d954SCole Faust                                  biases_step_x,
8803*c217d954SCole Faust                                  biases_offset_first_element_in_bytes
8804*c217d954SCole Faust#endif
8805*c217d954SCole Faust                              );
8806*c217d954SCole Faust
8807*c217d954SCole Faust    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
8808*c217d954SCole Faust
    // Named in_s32_0 because STORE_VECTOR_SELECT below is given the base name `in_s32_`.
8809*c217d954SCole Faust    VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
8810*c217d954SCole Faust
8811*c217d954SCole Faust
8812*c217d954SCole Faust    in_s32_0 += offset_term_s32;
8813*c217d954SCole Faust
8814*c217d954SCole Faust
    // Written back to the address the values were loaded from.
8815*c217d954SCole Faust    STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
8816*c217d954SCole Faust}
8817*c217d954SCole Faust#endif
8818*c217d954SCole Faust
8819*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
8820*c217d954SCole Faust
/**
 * Offset contribution fused with an integer "multiply then shift right" output stage.
 *
 * Per work-item, for VEC_SIZE ints of mm_result at (x, y, z):
 *   1. add the term from offset_contribution();
 *   2. add RESULT_OFFSET;
 *   3. multiply and arithmetic-shift right, using per-column values from
 *      result_multipliers / result_shifts when PER_CHANNEL_QUANTIZATION is defined,
 *      otherwise the constants RESULT_MULTIPLIER / RESULT_SHIFT;
 *   4. convert to OUTPUT_DATA_TYPE with CONVERT_SAT and clamp to MIN_BOUND / MAX_BOUND
 *      when those are defined;
 *   5. store to dst.
 */
8821*c217d954SCole Faust__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
8822*c217d954SCole Faust#if defined(A_OFFSET)
8823*c217d954SCole Faust                                                         ,
8824*c217d954SCole Faust                                                         IMAGE_DECLARATION(sum_col)
8825*c217d954SCole Faust#endif
8826*c217d954SCole Faust#if defined(B_OFFSET)
8827*c217d954SCole Faust                                                         ,
8828*c217d954SCole Faust                                                         IMAGE_DECLARATION(sum_row)
8829*c217d954SCole Faust#endif
8830*c217d954SCole Faust                                                         ,
8831*c217d954SCole Faust#if defined(ADD_BIAS)
8832*c217d954SCole Faust                                                         VECTOR_DECLARATION(biases),
8833*c217d954SCole Faust#endif
8834*c217d954SCole Faust                                                         TENSOR3D_DECLARATION(dst)
8835*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
8836*c217d954SCole Faust                                                         ,
8837*c217d954SCole Faust                                                         VECTOR_DECLARATION(result_multipliers),
8838*c217d954SCole Faust                                                         VECTOR_DECLARATION(result_shifts)
8839*c217d954SCole Faust#endif
8840*c217d954SCole Faust                                                        )
8841*c217d954SCole Faust{
8842*c217d954SCole Faust    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
8843*c217d954SCole Faust    const int y = get_global_id(1);
8844*c217d954SCole Faust    const int z = get_global_id(2);
8845*c217d954SCole Faust
    // x is added as a raw byte offset here.
    // NOTE(review): this assumes OUTPUT_DATA_TYPE is one byte wide - TODO confirm.
8846*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
8847*c217d954SCole Faust
8848*c217d954SCole Faust
8849*c217d954SCole Faust    VEC_INT offset_term_s32 = offset_contribution(
8850*c217d954SCole Faust                                  x, y, z
8851*c217d954SCole Faust#if defined(A_OFFSET)
8852*c217d954SCole Faust                                  ,
8853*c217d954SCole Faust                                  sum_col_ptr,
8854*c217d954SCole Faust                                  sum_col_stride_x,
8855*c217d954SCole Faust                                  sum_col_step_x,
8856*c217d954SCole Faust                                  sum_col_stride_y,
8857*c217d954SCole Faust                                  sum_col_step_y,
8858*c217d954SCole Faust                                  sum_col_offset_first_element_in_bytes
8859*c217d954SCole Faust#endif
8860*c217d954SCole Faust#if defined(B_OFFSET)
8861*c217d954SCole Faust                                  ,
8862*c217d954SCole Faust                                  sum_row_ptr,
8863*c217d954SCole Faust                                  sum_row_stride_x,
8864*c217d954SCole Faust                                  sum_row_step_x,
8865*c217d954SCole Faust                                  sum_row_stride_y,
8866*c217d954SCole Faust                                  sum_row_step_y,
8867*c217d954SCole Faust                                  sum_row_offset_first_element_in_bytes
8868*c217d954SCole Faust#endif
8869*c217d954SCole Faust#if defined(ADD_BIAS)
8870*c217d954SCole Faust                                  ,
8871*c217d954SCole Faust                                  biases_ptr,
8872*c217d954SCole Faust                                  biases_stride_x,
8873*c217d954SCole Faust                                  biases_step_x,
8874*c217d954SCole Faust                                  biases_offset_first_element_in_bytes
8875*c217d954SCole Faust#endif
8876*c217d954SCole Faust                              );
8877*c217d954SCole Faust
8878*c217d954SCole Faust    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
8879*c217d954SCole Faust
8880*c217d954SCole Faust    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
8881*c217d954SCole Faust
8882*c217d954SCole Faust
8883*c217d954SCole Faust    in_s32 += offset_term_s32;
8884*c217d954SCole Faust
8885*c217d954SCole Faust
8886*c217d954SCole Faust
8887*c217d954SCole Faust
    // In this kernel RESULT_OFFSET is added before the multiply/shift.
8888*c217d954SCole Faust    in_s32 += (VEC_INT)RESULT_OFFSET;
8889*c217d954SCole Faust
8890*c217d954SCole Faust
8891*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
    // Per-column multiplier and shift, both read as int vectors at column x.
8892*c217d954SCole Faust    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
8893*c217d954SCole Faust    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
8894*c217d954SCole Faust    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
8895*c217d954SCole Faust    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
8896*c217d954SCole Faust
8897*c217d954SCole Faust    in_s32 *= result_multipliers_values;
8898*c217d954SCole Faust    in_s32 >>= result_shifts_values;
8899*c217d954SCole Faust#else
8900*c217d954SCole Faust    in_s32 *= RESULT_MULTIPLIER;
8901*c217d954SCole Faust
8902*c217d954SCole Faust    in_s32 >>= RESULT_SHIFT;
8903*c217d954SCole Faust#endif
8904*c217d954SCole Faust
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
8905*c217d954SCole Faust    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
8906*c217d954SCole Faust    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
8907*c217d954SCole Faust
    // Optional clamping to [MIN_BOUND, MAX_BOUND]; each bound is applied only if defined.
8908*c217d954SCole Faust#if defined(MIN_BOUND)
8909*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
8910*c217d954SCole Faust#endif
8911*c217d954SCole Faust#if defined(MAX_BOUND)
8912*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
8913*c217d954SCole Faust#endif
8914*c217d954SCole Faust
8915*c217d954SCole Faust
8916*c217d954SCole Faust    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
8917*c217d954SCole Faust}
8918*c217d954SCole Faust#endif
8919*c217d954SCole Faust
8920*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
8921*c217d954SCole Faust
/**
 * Offset contribution fused with a fixed-point requantization output stage.
 *
 * Per work-item, for VEC_SIZE ints of mm_result at (x, y, z):
 *   1. add the term from offset_contribution();
 *   2. rescale through ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE or
 *      ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, chosen by the sign of the shift;
 *   3. add RESULT_OFFSET (after the rescale in this kernel);
 *   4. convert to OUTPUT_DATA_TYPE with CONVERT_SAT, clamp to MIN_BOUND / MAX_BOUND
 *      when defined, and store to dst.
 *
 * NOTE(review): the two ASYMM_* macros are defined outside this section; their exact
 * rounding behavior is not visible here.
 */
8922*c217d954SCole Faust__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
8923*c217d954SCole Faust#if defined(A_OFFSET)
8924*c217d954SCole Faust                                                                    ,
8925*c217d954SCole Faust                                                                    IMAGE_DECLARATION(sum_col)
8926*c217d954SCole Faust#endif
8927*c217d954SCole Faust#if defined(B_OFFSET)
8928*c217d954SCole Faust                                                                    ,
8929*c217d954SCole Faust                                                                    IMAGE_DECLARATION(sum_row)
8930*c217d954SCole Faust#endif
8931*c217d954SCole Faust                                                                    ,
8932*c217d954SCole Faust#if defined(ADD_BIAS)
8933*c217d954SCole Faust                                                                    VECTOR_DECLARATION(biases),
8934*c217d954SCole Faust#endif
8935*c217d954SCole Faust                                                                    TENSOR3D_DECLARATION(dst)
8936*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
8937*c217d954SCole Faust                                                                    ,
8938*c217d954SCole Faust                                                                    VECTOR_DECLARATION(result_multipliers),
8939*c217d954SCole Faust                                                                    VECTOR_DECLARATION(result_shifts)
8940*c217d954SCole Faust#endif
8941*c217d954SCole Faust                                                                   )
8942*c217d954SCole Faust{
8943*c217d954SCole Faust    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
8944*c217d954SCole Faust    const int y = get_global_id(1);
8945*c217d954SCole Faust    const int z = get_global_id(2);
8946*c217d954SCole Faust
8947*c217d954SCole Faust
8948*c217d954SCole Faust    VEC_INT offset_term_s32 = offset_contribution(
8949*c217d954SCole Faust                                  x, y, z
8950*c217d954SCole Faust#if defined(A_OFFSET)
8951*c217d954SCole Faust                                  ,
8952*c217d954SCole Faust                                  sum_col_ptr,
8953*c217d954SCole Faust                                  sum_col_stride_x,
8954*c217d954SCole Faust                                  sum_col_step_x,
8955*c217d954SCole Faust                                  sum_col_stride_y,
8956*c217d954SCole Faust                                  sum_col_step_y,
8957*c217d954SCole Faust                                  sum_col_offset_first_element_in_bytes
8958*c217d954SCole Faust#endif
8959*c217d954SCole Faust#if defined(B_OFFSET)
8960*c217d954SCole Faust                                  ,
8961*c217d954SCole Faust                                  sum_row_ptr,
8962*c217d954SCole Faust                                  sum_row_stride_x,
8963*c217d954SCole Faust                                  sum_row_step_x,
8964*c217d954SCole Faust                                  sum_row_stride_y,
8965*c217d954SCole Faust                                  sum_row_step_y,
8966*c217d954SCole Faust                                  sum_row_offset_first_element_in_bytes
8967*c217d954SCole Faust#endif
8968*c217d954SCole Faust#if defined(ADD_BIAS)
8969*c217d954SCole Faust                                  ,
8970*c217d954SCole Faust                                  biases_ptr,
8971*c217d954SCole Faust                                  biases_stride_x,
8972*c217d954SCole Faust                                  biases_step_x,
8973*c217d954SCole Faust                                  biases_offset_first_element_in_bytes
8974*c217d954SCole Faust#endif
8975*c217d954SCole Faust                              );
8976*c217d954SCole Faust
8977*c217d954SCole Faust    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
8978*c217d954SCole Faust
    // x is added as a raw byte offset here.
    // NOTE(review): this assumes OUTPUT_DATA_TYPE is one byte wide - TODO confirm.
8979*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
8980*c217d954SCole Faust
8981*c217d954SCole Faust    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
8982*c217d954SCole Faust
8983*c217d954SCole Faust
8984*c217d954SCole Faust    in_s32 += offset_term_s32;
8985*c217d954SCole Faust
8986*c217d954SCole Faust
8987*c217d954SCole Faust
8988*c217d954SCole Faust
8989*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION)
    // Per-column multiplier and shift, both read as int vectors at column x.
8990*c217d954SCole Faust    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
8991*c217d954SCole Faust    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
8992*c217d954SCole Faust    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
8993*c217d954SCole Faust    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
8994*c217d954SCole Faust
    // The shift sign can differ per lane, so both variants are evaluated and select()
    // keeps the LESS_THAN_ONE result in lanes whose shift is >= 0 and the
    // GREATER_THAN_ONE result in the other lanes.
8995*c217d954SCole Faust    VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
8996*c217d954SCole Faust    VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
8997*c217d954SCole Faust    in_s32                   = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
8998*c217d954SCole Faust#else
8999*c217d954SCole Faust
    // With a single build-time RESULT_SHIFT the variant is picked by the preprocessor.
9000*c217d954SCole Faust#if RESULT_SHIFT < 0
9001*c217d954SCole Faust    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9002*c217d954SCole Faust#else
9003*c217d954SCole Faust    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9004*c217d954SCole Faust#endif
9005*c217d954SCole Faust
9006*c217d954SCole Faust#endif
9007*c217d954SCole Faust
9008*c217d954SCole Faust
9009*c217d954SCole Faust    in_s32 += (VEC_INT)RESULT_OFFSET;
9010*c217d954SCole Faust
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
9011*c217d954SCole Faust    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
9012*c217d954SCole Faust    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
9013*c217d954SCole Faust
9014*c217d954SCole Faust#if defined(MIN_BOUND)
9015*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
9016*c217d954SCole Faust#endif
9017*c217d954SCole Faust#if defined(MAX_BOUND)
9018*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
9019*c217d954SCole Faust#endif
9020*c217d954SCole Faust
9021*c217d954SCole Faust
9022*c217d954SCole Faust    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
9023*c217d954SCole Faust}
9024*c217d954SCole Faust#endif
9025*c217d954SCole Faust
9026*c217d954SCole Faust#undef VEC_INT
9027*c217d954SCole Faust
9028*c217d954SCole Faust#endif
9029*c217d954SCole Faust
9030*c217d954SCole Faust#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
9031*c217d954SCole Faust
/**
 * Stand-alone integer output stage: int `src` -> OUTPUT_DATA_TYPE `dst`.
 *
 * Per work-item, for VEC_SIZE ints of src at (x, y, z):
 *   1. add the bias vector (only when ADD_BIAS is defined);
 *   2. add RESULT_OFFSET;
 *   3. multiply by RESULT_MULT_INT;
 *   4. shift right by RESULT_SHIFT, or by -RESULT_SHIFT when RESULT_SHIFT is negative;
 *   5. convert with CONVERT_SAT, clamp to MIN_BOUND / MAX_BOUND when defined, store.
 */
9032*c217d954SCole Faust__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
9033*c217d954SCole Faust#if defined(ADD_BIAS)
9034*c217d954SCole Faust                                                  VECTOR_DECLARATION(biases),
9035*c217d954SCole Faust#endif
9036*c217d954SCole Faust                                                  TENSOR3D_DECLARATION(dst))
9037*c217d954SCole Faust{
9038*c217d954SCole Faust
    // x: first column for this work-item, moved back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE
    // and clamped at 0.
9039*c217d954SCole Faust    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
9040*c217d954SCole Faust    int y = get_global_id(1);
9041*c217d954SCole Faust    int z = get_global_id(2);
9042*c217d954SCole Faust
9043*c217d954SCole Faust    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
9044*c217d954SCole Faust
    // x is added as a raw byte offset here.
    // NOTE(review): this assumes OUTPUT_DATA_TYPE is one byte wide - TODO confirm.
9045*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
9046*c217d954SCole Faust
9047*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9048*c217d954SCole Faust    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
9049*c217d954SCole Faust
9050*c217d954SCole Faust#if defined(ADD_BIAS)
9051*c217d954SCole Faust
    // The bias vector is indexed by column only (no y or z term).
9052*c217d954SCole Faust    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
9053*c217d954SCole Faust
9054*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9055*c217d954SCole Faust    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
9056*c217d954SCole Faust    input_values += biases_values;
9057*c217d954SCole Faust#endif
9058*c217d954SCole Faust
9059*c217d954SCole Faust
9060*c217d954SCole Faust    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
9061*c217d954SCole Faust
9062*c217d954SCole Faust
9063*c217d954SCole Faust    input_values *= RESULT_MULT_INT;
9064*c217d954SCole Faust
    // Both branches shift right; the shift amount is the magnitude of RESULT_SHIFT.
9065*c217d954SCole Faust#if RESULT_SHIFT < 0
9066*c217d954SCole Faust    input_values >>= -RESULT_SHIFT;
9067*c217d954SCole Faust#else
9068*c217d954SCole Faust    input_values >>= RESULT_SHIFT;
9069*c217d954SCole Faust#endif
9070*c217d954SCole Faust
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
9071*c217d954SCole Faust    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
9072*c217d954SCole Faust    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
9073*c217d954SCole Faust
9074*c217d954SCole Faust#if defined(MIN_BOUND)
9075*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
9076*c217d954SCole Faust#endif
9077*c217d954SCole Faust#if defined(MAX_BOUND)
9078*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
9079*c217d954SCole Faust#endif
9080*c217d954SCole Faust
9081*c217d954SCole Faust
9082*c217d954SCole Faust    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
9083*c217d954SCole Faust}
9084*c217d954SCole Faust#endif
9085*c217d954SCole Faust
9086*c217d954SCole Faust#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
9087*c217d954SCole Faust
/**
 * Stand-alone fixed-point output stage: int `src` -> OUTPUT_DATA_TYPE `dst`.
 *
 * Per work-item, for VEC_SIZE ints of src at (x, y, z):
 *   1. add the bias vector (only when ADD_BIAS is defined);
 *   2. rescale with RESULT_FIXEDPOINT_MULTIPLIER and RESULT_SHIFT through one of the
 *      ASYMM_MULT_BY_QUANT_MULTIPLIER_* macros, picked at build time by the sign of
 *      RESULT_SHIFT;
 *   3. add RESULT_OFFSET_AFTER_SHIFT;
 *   4. convert with CONVERT_SAT, clamp to MIN_BOUND / MAX_BOUND when defined, store.
 *
 * NOTE(review): the ASYMM_* macros are defined outside this section; their exact
 * rounding behavior is not visible here.
 */
9088*c217d954SCole Faust__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
9089*c217d954SCole Faust#if defined(ADD_BIAS)
9090*c217d954SCole Faust                                                             VECTOR_DECLARATION(biases),
9091*c217d954SCole Faust#endif
9092*c217d954SCole Faust                                                             TENSOR3D_DECLARATION(dst))
9093*c217d954SCole Faust{
9094*c217d954SCole Faust
9095*c217d954SCole Faust    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
9096*c217d954SCole Faust    int y = get_global_id(1);
9097*c217d954SCole Faust    int z = get_global_id(2);
9098*c217d954SCole Faust
9099*c217d954SCole Faust    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
9100*c217d954SCole Faust
    // x is added as a raw byte offset here.
    // NOTE(review): this assumes OUTPUT_DATA_TYPE is one byte wide - TODO confirm.
9101*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
9102*c217d954SCole Faust
9103*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9104*c217d954SCole Faust    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
9105*c217d954SCole Faust
9106*c217d954SCole Faust#if defined(ADD_BIAS)
9107*c217d954SCole Faust
    // The bias vector is indexed by column only (no y or z term).
9108*c217d954SCole Faust    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
9109*c217d954SCole Faust
9110*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9111*c217d954SCole Faust    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
9112*c217d954SCole Faust    input_values += biases_values;
9113*c217d954SCole Faust#endif
9114*c217d954SCole Faust
9115*c217d954SCole Faust
    // Negative RESULT_SHIFT -> GREATER_THAN_ONE variant, otherwise LESS_THAN_ONE.
9116*c217d954SCole Faust#if RESULT_SHIFT < 0
9117*c217d954SCole Faust    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9118*c217d954SCole Faust#else
9119*c217d954SCole Faust    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9120*c217d954SCole Faust#endif
9121*c217d954SCole Faust
9122*c217d954SCole Faust
9123*c217d954SCole Faust    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
9124*c217d954SCole Faust
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
9125*c217d954SCole Faust    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
9126*c217d954SCole Faust    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
9127*c217d954SCole Faust
9128*c217d954SCole Faust#if defined(MIN_BOUND)
9129*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
9130*c217d954SCole Faust#endif
9131*c217d954SCole Faust#if defined(MAX_BOUND)
9132*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
9133*c217d954SCole Faust#endif
9134*c217d954SCole Faust
9135*c217d954SCole Faust
9136*c217d954SCole Faust    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
9137*c217d954SCole Faust}
9138*c217d954SCole Faust#endif
9139*c217d954SCole Faust
9140*c217d954SCole Faust#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
9141*c217d954SCole Faust
/**
 * Stand-alone fixed-point output stage producing `short` values: int `src` -> short `dst`.
 *
 * Per work-item, for VEC_SIZE ints of src at (x, y, z):
 *   1. add the bias vector (only when ADD_BIAS is defined);
 *   2. rescale with RESULT_FIXEDPOINT_MULTIPLIER and RESULT_SHIFT through one of the
 *      ASYMM_MULT_BY_QUANT_MULTIPLIER_* macros, picked at build time by the sign of
 *      RESULT_SHIFT;
 *   3. convert to short with CONVERT_SAT, clamp to MIN_BOUND / MAX_BOUND when defined, store.
 *
 * No result offset is added in this kernel.
 */
9142*c217d954SCole Faust__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
9143*c217d954SCole Faust#if defined(ADD_BIAS)
9144*c217d954SCole Faust                                                                     VECTOR_DECLARATION(biases),
9145*c217d954SCole Faust#endif
9146*c217d954SCole Faust                                                                     TENSOR3D_DECLARATION(dst))
9147*c217d954SCole Faust{
9148*c217d954SCole Faust
9149*c217d954SCole Faust    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
9150*c217d954SCole Faust    int y = get_global_id(1);
9151*c217d954SCole Faust    int z = get_global_id(2);
9152*c217d954SCole Faust
9153*c217d954SCole Faust    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
9154*c217d954SCole Faust
    // The destination column offset is scaled by sizeof(short) because the output elements are short.
9155*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
9156*c217d954SCole Faust
9157*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9158*c217d954SCole Faust    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
9159*c217d954SCole Faust
9160*c217d954SCole Faust#if defined(ADD_BIAS)
9161*c217d954SCole Faust
    // The bias vector is indexed by column only (no y or z term).
9162*c217d954SCole Faust    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
9163*c217d954SCole Faust
9164*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9165*c217d954SCole Faust    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
9166*c217d954SCole Faust    input_values += biases_values;
9167*c217d954SCole Faust#endif
9168*c217d954SCole Faust
9169*c217d954SCole Faust
    // Negative RESULT_SHIFT -> GREATER_THAN_ONE variant, otherwise LESS_THAN_ONE.
9170*c217d954SCole Faust#if RESULT_SHIFT < 0
9171*c217d954SCole Faust    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9172*c217d954SCole Faust#else
9173*c217d954SCole Faust    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
9174*c217d954SCole Faust#endif
9175*c217d954SCole Faust
    // The variable must be called res0: STORE_VECTOR_SELECT is given the base name `res`.
9176*c217d954SCole Faust    VEC_DATA_TYPE(short, VEC_SIZE)
9177*c217d954SCole Faust    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
9178*c217d954SCole Faust
9179*c217d954SCole Faust#if defined(MIN_BOUND)
9180*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
9181*c217d954SCole Faust#endif
9182*c217d954SCole Faust#if defined(MAX_BOUND)
9183*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
9184*c217d954SCole Faust#endif
9185*c217d954SCole Faust
9186*c217d954SCole Faust
9187*c217d954SCole Faust    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
9188*c217d954SCole Faust}
9189*c217d954SCole Faust#endif
9190*c217d954SCole Faust
9191*c217d954SCole Faust#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
9192*c217d954SCole Faust
/** Output stage that rescales int values with a floating-point multiplier and writes them as OUTPUT_DATA_TYPE.
 *
 * Each work-item performs, in this order:
 *  -# Load VEC_SIZE int values from src.
 *  -# If ADD_BIAS is defined, load VEC_SIZE int values from biases (addressed by x only) and add them.
 *  -# Convert to float, multiply by (float)REAL_MULTIPLIER, add (float)OUTPUT_OFFSET and apply round().
 *  -# Convert to OUTPUT_DATA_TYPE through CONVERT_SAT.
 *  -# If MIN_BOUND is defined, take the element-wise max with it; if MAX_BOUND is defined, the element-wise min.
 *  -# Store to dst through STORE_VECTOR_SELECT.
 *
 * @note Compiled only when GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT is defined.
 * @note Build-time symbols read by the body: VEC_SIZE, VEC_SIZE_LEFTOVER, REAL_MULTIPLIER, OUTPUT_OFFSET,
 *       OUTPUT_DATA_TYPE, and optionally ADD_BIAS, MIN_BOUND, MAX_BOUND.
 * @note DST_HEIGHT only switches the dst declaration macro from TENSOR3D_DECLARATION to TENSOR4D_DECLARATION.
 *       The body below uses the same dst names in both cases and never refers to DST_HEIGHT itself.
 *       NOTE(review): confirm whether a fourth-dimension offset is intentionally left out.
 * @note CONVERT, CONVERT_SAT and STORE_VECTOR_SELECT are defined outside this section; what is said about them
 *       here is inferred from their names and call sites - confirm against their definitions.
 *
 * @param[in]  src    Source tensor arguments. The body uses src_ptr, src_offset_first_element_in_bytes,
 *                    src_stride_y and src_stride_z, and reads elements as int.
 * @param[in]  biases (Only with ADD_BIAS) Bias vector arguments. The body uses biases_ptr and
 *                    biases_offset_first_element_in_bytes, and reads elements as int.
 * @param[out] dst    Destination tensor arguments. The body uses dst_ptr, dst_offset_first_element_in_bytes,
 *                    dst_stride_y and dst_stride_z, and writes elements as OUTPUT_DATA_TYPE.
 */
9193*c217d954SCole Faust__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
9194*c217d954SCole Faust#if defined(ADD_BIAS)
9195*c217d954SCole Faust                                                        VECTOR_DECLARATION(biases),
9196*c217d954SCole Faust#endif
9197*c217d954SCole Faust#if defined(DST_HEIGHT)
9198*c217d954SCole Faust                                                        TENSOR4D_DECLARATION(dst))
9199*c217d954SCole Faust#else
9200*c217d954SCole Faust                                                        TENSOR3D_DECLARATION(dst))
9201*c217d954SCole Faust#endif
9202*c217d954SCole Faust{
9203*c217d954SCole Faust
    // First element along x handled by this work-item. The start is pulled back by
    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped at 0, so work-item 0 always starts at element 0.
9204*c217d954SCole Faust    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
9205*c217d954SCole Faust    int y = get_global_id(1);
9206*c217d954SCole Faust    int z = get_global_id(2);
9207*c217d954SCole Faust
    // Byte address of the first source element: x is scaled by sizeof(int), y and z by the src strides.
9208*c217d954SCole Faust    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
9209*c217d954SCole Faust
    // Destination byte address. x is added without an element-size factor.
    // NOTE(review): that is only a correct byte offset if OUTPUT_DATA_TYPE is one byte wide - confirm that
    // this kernel is never built with a wider output type.
9210*c217d954SCole Faust    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
9211*c217d954SCole Faust
9212*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9213*c217d954SCole Faust    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
9214*c217d954SCole Faust
9215*c217d954SCole Faust#if defined(ADD_BIAS)
9216*c217d954SCole Faust
    // The bias address depends on x only; y and z do not enter it.
9217*c217d954SCole Faust    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
9218*c217d954SCole Faust
9219*c217d954SCole Faust    VEC_DATA_TYPE(int, VEC_SIZE)
9220*c217d954SCole Faust    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
9221*c217d954SCole Faust    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
9222*c217d954SCole Faust#endif
9223*c217d954SCole Faust
9224*c217d954SCole Faust
    // Rescale in floating point: round(value * REAL_MULTIPLIER + OUTPUT_OFFSET), both constants cast to float.
9225*c217d954SCole Faust    VEC_DATA_TYPE(float, VEC_SIZE)
9226*c217d954SCole Faust    input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
9227*c217d954SCole Faust    input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
9228*c217d954SCole Faust
    // Narrow float -> OUTPUT_DATA_TYPE. NOTE(review): CONVERT_SAT presumably saturates instead of wrapping - confirm.
9229*c217d954SCole Faust    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
9230*c217d954SCole Faust    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
9231*c217d954SCole Faust
    // Optional clamping; each bound is applied only if its symbol is defined at build time.
9232*c217d954SCole Faust#if defined(MIN_BOUND)
9233*c217d954SCole Faust    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
9234*c217d954SCole Faust#endif
9235*c217d954SCole Faust#if defined(MAX_BOUND)
9236*c217d954SCole Faust    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
9237*c217d954SCole Faust#endif
9238*c217d954SCole Faust
9239*c217d954SCole Faust
    // The last argument is true only for work-item 0 when VEC_SIZE_LEFTOVER is non-zero.
    // NOTE(review): presumably it selects a partial store of VEC_SIZE_LEFTOVER elements, and the macro refers to
    // the vector as res##0 (i.e. res0) - STORE_VECTOR_SELECT is defined outside this section; confirm.
9240*c217d954SCole Faust    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
9241*c217d954SCole Faust}
9242*c217d954SCole Faust#endif  )"