1R"( 2 3 4 5 6#ifndef ARM_COMPUTE_HELPER_H 7#define ARM_COMPUTE_HELPER_H 8 9 10 11 12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13 VSTORE(N0) \ 14 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15 16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18 VSTORE(N0) \ 19 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20 21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23 VSTORE(N0) \ 24 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25 26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28 VSTORE(N0) \ 29 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30 31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33 VSTORE(N0) \ 34 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35 36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38 VSTORE(N0) \ 39 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40 41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43 VSTORE(N0) \ 44 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45 46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48 VSTORE(N0) \ 49 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50 51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53 VSTORE(N0) \ 54 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55 56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58 VSTORE(N0) \ 59 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60 61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63 VSTORE(N0) \ 64 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65 66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68 VSTORE(N0) \ 69 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 70 71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73 VSTORE(N0) \ 74 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75 76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78 VSTORE(N0) \ 79 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80 81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83 VSTORE(N0) \ 84 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85 86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88 VSTORE(N0) \ 89 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90 91 92 93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94 VSTORE(N0) \ 95 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96 97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99 VSTORE(N0) \ 100 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 101 102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104 VSTORE(N0) \ 105 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106 107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109 VSTORE(N0) \ 110 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111 112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 VSTORE(N0) \ 115 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116 117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 VSTORE(N0) \ 120 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121 122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 VSTORE(N0) \ 125 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126 127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 128 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 VSTORE(N0) \ 130 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 131 132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 133 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 VSTORE(N0) \ 135 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 136 137#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 138 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 VSTORE(N0) \ 140 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 141 142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 143 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 144 VSTORE(N0) \ 145 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 146 147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 148 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 149 VSTORE(N0) \ 150 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 151 152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 153 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 154 VSTORE(N0) \ 155 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 156 157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 158 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 159 VSTORE(N0) \ 160 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 161 162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 163 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 164 VSTORE(N0) \ 165 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 166 167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 168 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 169 VSTORE(N0) \ 170 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 171 172 173 174 175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 177 178 179 180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 182 183 184 185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 186 VSTORE_PARTIAL(N0, STORE_N0) \ 187 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 188 189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 190 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 191 VSTORE_PARTIAL(N0, STORE_N0) \ 192 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 193 194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 195 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 196 VSTORE_PARTIAL(N0, STORE_N0) \ 197 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 198 199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 200 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 201 VSTORE_PARTIAL(N0, STORE_N0) \ 202 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 203 204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 206 VSTORE_PARTIAL(N0, STORE_N0) \ 207 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 208 209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 210 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 211 VSTORE_PARTIAL(N0, STORE_N0) \ 212 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 213 214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 216 VSTORE_PARTIAL(N0, STORE_N0) \ 217 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 218 219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 221 VSTORE_PARTIAL(N0, STORE_N0) \ 222 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 223 224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 226 VSTORE_PARTIAL(N0, STORE_N0) \ 227 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 228 229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 231 VSTORE_PARTIAL(N0, STORE_N0) \ 232 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 233 234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 235 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 236 VSTORE_PARTIAL(N0, STORE_N0) \ 237 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 238 239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 240 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 241 VSTORE_PARTIAL(N0, STORE_N0) \ 242 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 243 244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 245 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 246 VSTORE_PARTIAL(N0, STORE_N0) \ 247 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 248 249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 250 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 251 VSTORE_PARTIAL(N0, STORE_N0) \ 252 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 253 254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 255 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 256 VSTORE_PARTIAL(N0, STORE_N0) \ 257 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 258 259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 260 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 261 VSTORE_PARTIAL(N0, STORE_N0) \ 262 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 263 264 265 266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 268 269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 270 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 271 { \ 272 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 273 } \ 274 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 275 { \ 276 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 277 } \ 278 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 279 { \ 280 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 281 } \ 282 else \ 283 { \ 284 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 285 } 286 287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 288 if(!(PARTIAL_COND_X)) \ 289 { \ 290 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 291 } \ 292 else \ 293 { \ 294 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 295 } 296 297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 298 if(!(PARTIAL_COND_Y)) \ 299 { \ 300 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 301 } \ 302 else \ 303 { \ 304 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 305 } 306 307 308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 309 310 311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 312 313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 314 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 315 316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 317 318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 319 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 320 321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 322 323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 324 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 325 326#else 327 328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 329 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 330 331#endif 332 333#endif 334 335 336#if defined(PARTIAL_STORE_M0) 337 338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 339 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 340#else 341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 342 ((uint)(y * M0)) 343#endif 344 345 346 347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 348 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 349 350 351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 352#pragma OPENCL EXTENSION cl_khr_fp16 : enable 353#endif 354 355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 357#endif 358 359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 361#endif 362 363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 364#pragma OPENCL EXTENSION cl_arm_printf : enable 365#endif 366 367#define GPU_ARCH_MIDGARD 0x100 368#define GPU_ARCH_BIFROST 0x200 369#define GPU_ARCH_VALHALL 0x300 370 371 372#define CONCAT(a, b) a##b 373 374 375#define EXPAND(x) x 376 377 378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 379 380 381#define REV1(x) ((x)) 382#define REV2(x) ((x).s10) 383#define REV3(x) ((x).s210) 384#define REV4(x) ((x).s3210) 385#define REV8(x) ((x).s76543210) 386#define REV16(x) ((x).sFEDCBA9876543210) 387 388 389 390#define REVERSE_STR(x, s) REV##s((x)) 391#define REVERSE(x, s) REVERSE_STR(x, s) 392 393 394 395#define ROT1_0(x) ((x)) 396#define ROT1_1(x) ((x)) 397 398#define ROT2_0(x) ((x)) 399#define ROT2_1(x) ((x).s10) 400#define ROT2_2(x) ((x)) 401 402#define ROT3_0(x) ((x)) 403#define ROT3_1(x) ((x).s201) 404#define ROT3_2(x) ((x).s120) 405#define ROT3_3(x) ((x)) 406 407#define ROT4_0(x) ((x)) 408#define ROT4_1(x) ((x).s3012) 409#define ROT4_2(x) ((x).s2301) 410#define ROT4_3(x) ((x).s1230) 411#define ROT4_4(x) ((x)) 412 413#define ROT8_0(x) ((x)) 414#define ROT8_1(x) ((x).s70123456) 415#define ROT8_2(x) ((x).s67012345) 416#define ROT8_3(x) ((x).s56701234) 417#define ROT8_4(x) ((x).s45670123) 418#define ROT8_5(x) ((x).s34567012) 419#define ROT8_6(x) ((x).s23456701) 420#define ROT8_7(x) ((x).s12345670) 421#define ROT8_8(x) ((x)) 422 423#define ROT16_0(x) ((x)) 424#define ROT16_1(x) ((x).sF0123456789ABCDE) 425#define ROT16_2(x) ((x).sEF0123456789ABCD) 426#define ROT16_3(x) ((x).sDEF0123456789ABC) 427#define ROT16_4(x) ((x).sCDEF0123456789AB) 428#define ROT16_5(x) ((x).sBCDEF0123456789A) 429#define ROT16_6(x) ((x).sABCDEF0123456789) 430#define ROT16_7(x) ((x).s9ABCDEF012345678) 431#define ROT16_8(x) ((x).s89ABCDEF01234567) 432#define ROT16_9(x) ((x).s789ABCDEF0123456) 433#define ROT16_10(x) ((x).s6789ABCDEF012345) 434#define ROT16_11(x) ((x).s56789ABCDEF01234) 435#define ROT16_12(x) ((x).s456789ABCDEF0123) 436#define ROT16_13(x) ((x).s3456789ABCDEF012) 437#define ROT16_14(x) ((x).s23456789ABCDEF01) 438#define ROT16_15(x) ((x).s123456789ABCDEF0) 439#define ROT16_16(x) ((x)) 440 441 442 443#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 444#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 445 446 447 448#define V_OFFS1(dt) (dt##1)(0) 449#define V_OFFS2(dt) (dt##2)(0, 1) 450#define V_OFFS3(dt) (dt##3)(0, 1, 2) 451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 454 455 456 457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 459 460 461#define VLOAD_STR(size) vload##size 462#define VLOAD(size) VLOAD_STR(size) 463 464 465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 467 468#define NO_LOAD(data, offs, ptr) \ 469 { \ 470 } 471 472 473#define vload_partial_1_0 NO_LOAD 474#define vload_partial_1_1 vload1 475#define vload_partial_1_2 NO_LOAD 476#define vload_partial_1_3 NO_LOAD 477#define vload_partial_1_4 NO_LOAD 478#define vload_partial_1_5 NO_LOAD 479#define vload_partial_1_6 NO_LOAD 480#define vload_partial_1_7 NO_LOAD 481#define vload_partial_1_8 NO_LOAD 482#define vload_partial_1_9 NO_LOAD 483#define vload_partial_1_10 NO_LOAD 484#define vload_partial_1_11 NO_LOAD 485#define vload_partial_1_12 NO_LOAD 486#define vload_partial_1_13 NO_LOAD 487#define vload_partial_1_14 NO_LOAD 488#define vload_partial_1_15 NO_LOAD 489#define vload_partial_1_16 NO_LOAD 490 491#define vload_partial_2_0 NO_LOAD 492#define vload_partial_2_1 vload_partial_1 493#define vload_partial_2_2 vload_partial_2 494#define vload_partial_2_3 NO_LOAD 495#define vload_partial_2_4 NO_LOAD 496#define vload_partial_2_5 NO_LOAD 497#define vload_partial_2_6 NO_LOAD 498#define vload_partial_2_7 NO_LOAD 499#define vload_partial_2_8 NO_LOAD 500#define vload_partial_2_9 NO_LOAD 501#define vload_partial_2_10 NO_LOAD 502#define vload_partial_2_11 NO_LOAD 503#define vload_partial_2_12 NO_LOAD 504#define vload_partial_2_13 NO_LOAD 505#define vload_partial_2_14 NO_LOAD 506#define vload_partial_2_15 NO_LOAD 507#define vload_partial_2_16 NO_LOAD 508 509#define vload_partial_3_0 NO_LOAD 510#define vload_partial_3_1 vload_partial_1 511#define vload_partial_3_2 vload_partial_2 512#define vload_partial_3_3 vload_partial_3 513#define vload_partial_3_4 NO_LOAD 514#define vload_partial_3_5 NO_LOAD 515#define vload_partial_3_6 NO_LOAD 516#define vload_partial_3_7 NO_LOAD 517#define vload_partial_3_8 NO_LOAD 518#define vload_partial_3_9 NO_LOAD 519#define vload_partial_3_10 NO_LOAD 520#define vload_partial_3_11 NO_LOAD 521#define vload_partial_3_12 NO_LOAD 522#define vload_partial_3_13 NO_LOAD 523#define vload_partial_3_14 NO_LOAD 524#define vload_partial_3_15 NO_LOAD 525#define vload_partial_3_16 NO_LOAD 526 527#define vload_partial_4_0 NO_LOAD 528#define vload_partial_4_1 vload_partial_1 529#define vload_partial_4_2 vload_partial_2 530#define vload_partial_4_3 vload_partial_3 531#define vload_partial_4_4 vload_partial_4 532#define vload_partial_4_5 NO_LOAD 533#define vload_partial_4_6 NO_LOAD 534#define vload_partial_4_7 NO_LOAD 535#define vload_partial_4_8 NO_LOAD 536#define vload_partial_4_9 NO_LOAD 537#define vload_partial_4_10 NO_LOAD 538#define vload_partial_4_11 NO_LOAD 539#define vload_partial_4_12 NO_LOAD 540#define vload_partial_4_13 NO_LOAD 541#define vload_partial_4_14 NO_LOAD 542#define vload_partial_4_15 NO_LOAD 543#define vload_partial_4_16 NO_LOAD 544 545#define vload_partial_8_0 NO_LOAD 546#define vload_partial_8_1 vload_partial_1 547#define vload_partial_8_2 vload_partial_2 548#define vload_partial_8_3 vload_partial_3 549#define vload_partial_8_4 vload_partial_4 550#define vload_partial_8_5 vload_partial_5 551#define vload_partial_8_6 vload_partial_6 552#define vload_partial_8_7 vload_partial_7 553#define vload_partial_8_8 vload_partial_8 554#define vload_partial_8_9 NO_LOAD 555#define vload_partial_8_10 NO_LOAD 556#define vload_partial_8_11 NO_LOAD 557#define vload_partial_8_12 NO_LOAD 558#define vload_partial_8_13 NO_LOAD 559#define vload_partial_8_14 NO_LOAD 560#define vload_partial_8_15 NO_LOAD 561#define vload_partial_8_16 NO_LOAD 562 563#define vload_partial_16_0 NO_LOAD 564#define vload_partial_16_1 vload_partial_1 565#define vload_partial_16_2 vload_partial_2 566#define vload_partial_16_3 vload_partial_3 567#define vload_partial_16_4 vload_partial_4 568#define vload_partial_16_5 vload_partial_5 569#define vload_partial_16_6 vload_partial_6 570#define vload_partial_16_7 vload_partial_7 571#define vload_partial_16_8 vload_partial_8 572#define vload_partial_16_9 vload_partial_9 573#define vload_partial_16_10 vload_partial_10 574#define vload_partial_16_11 vload_partial_11 575#define vload_partial_16_12 vload_partial_12 576#define vload_partial_16_13 vload_partial_13 577#define vload_partial_16_14 vload_partial_14 578#define vload_partial_16_15 vload_partial_15 579#define vload_partial_16_16 vload_partial_16 580 581 582#define vload_partial_1(DATA, OFFSET, PTR) \ 583 DATA.s0 = vload1(OFFSET, PTR); 584 585#define vload_partial_2(DATA, OFFSET, PTR) \ 586 DATA.s01 = vload2(OFFSET, PTR); 587 588#define vload_partial_3(DATA, OFFSET, PTR) \ 589 DATA.s012 = vload3(OFFSET, PTR); 590 591#define vload_partial_4(DATA, OFFSET, PTR) \ 592 DATA.s0123 = vload4(OFFSET, PTR); 593 594#define vload_partial_5(DATA, OFFSET, PTR) \ 595 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 596 DATA.s4 = vload1(OFFSET, PTR + 4); 597 598#define vload_partial_6(DATA, OFFSET, PTR) \ 599 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 600 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 601 602#define vload_partial_7(DATA, OFFSET, PTR) \ 603 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 604 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 605 606#define vload_partial_8(DATA, OFFSET, PTR) \ 607 DATA.s01234567 = vload8(OFFSET, PTR); 608 609#define vload_partial_9(DATA, OFFSET, PTR) \ 610 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 611 DATA.s8 = vload1(OFFSET, PTR + 8); 612 613#define vload_partial_10(DATA, OFFSET, PTR) \ 614 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 615 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 616 617#define vload_partial_11(DATA, OFFSET, PTR) \ 618 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 619 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 620 621#define vload_partial_12(DATA, OFFSET, PTR) \ 622 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 623 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 624 625#define vload_partial_13(DATA, OFFSET, PTR) \ 626 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 627 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 628 629#define vload_partial_14(DATA, OFFSET, PTR) \ 630 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 631 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 632 633#define vload_partial_15(DATA, OFFSET, PTR) \ 634 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 635 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 636 637#define vload_partial_16(DATA, OFFSET, PTR) \ 638 DATA = vload16(OFFSET, PTR); 639 640 641 642#define PIXEL_UNIT4 1 643#define PIXEL_UNIT8 2 644#define PIXEL_UNIT16 4 645 646 647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 649 650 651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 654 655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 659#endif 660 661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 662#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 664 665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 669#endif 670 671 672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 674 675 676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 678 679#define VSTORE_STR(size) vstore##size 680#define VSTORE(size) VSTORE_STR(size) 681 682#define float1 float 683#define half1 half 684#define char1 char 685#define uchar1 uchar 686#define short1 short 687#define ushort1 ushort 688#define int1 int 689#define uint1 uint 690#define long1 long 691#define ulong1 ulong 692#define double1 double 693 694#define vload1(OFFSET, PTR) *(OFFSET + PTR) 695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 696 697 698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 700 701#define NO_STORE(data, offs, ptr) \ 702 { \ 703 } 704 705 706#define vstore_partial_1_0 NO_STORE 707#define vstore_partial_1_1 vstore1 708#define vstore_partial_1_2 NO_STORE 709#define vstore_partial_1_3 NO_STORE 710#define vstore_partial_1_4 NO_STORE 711#define vstore_partial_1_5 NO_STORE 712#define vstore_partial_1_6 NO_STORE 713#define vstore_partial_1_7 NO_STORE 714#define vstore_partial_1_8 NO_STORE 715#define vstore_partial_1_9 NO_STORE 716#define vstore_partial_1_10 NO_STORE 717#define vstore_partial_1_11 NO_STORE 718#define vstore_partial_1_12 NO_STORE 719#define vstore_partial_1_13 NO_STORE 720#define vstore_partial_1_14 NO_STORE 721#define vstore_partial_1_15 NO_STORE 722#define vstore_partial_1_16 NO_STORE 723 724#define vstore_partial_2_0 NO_STORE 725#define vstore_partial_2_1 vstore_partial_1 726#define vstore_partial_2_2 vstore_partial_2 727#define vstore_partial_2_3 NO_STORE 728#define vstore_partial_2_4 NO_STORE 729#define vstore_partial_2_5 NO_STORE 730#define vstore_partial_2_6 NO_STORE 731#define vstore_partial_2_7 NO_STORE 732#define vstore_partial_2_8 NO_STORE 733#define vstore_partial_2_9 NO_STORE 734#define vstore_partial_2_10 NO_STORE 735#define vstore_partial_2_11 NO_STORE 736#define vstore_partial_2_12 NO_STORE 737#define vstore_partial_2_13 NO_STORE 738#define vstore_partial_2_14 NO_STORE 739#define vstore_partial_2_15 NO_STORE 740#define vstore_partial_2_16 NO_STORE 741 742#define vstore_partial_3_0 NO_STORE 743#define vstore_partial_3_1 vstore_partial_1 744#define vstore_partial_3_2 vstore_partial_2 745#define vstore_partial_3_3 vstore_partial_3 746#define vstore_partial_3_4 NO_STORE 747#define vstore_partial_3_5 NO_STORE 748#define vstore_partial_3_6 NO_STORE 749#define vstore_partial_3_7 NO_STORE 750#define vstore_partial_3_8 NO_STORE 751#define vstore_partial_3_9 NO_STORE 752#define vstore_partial_3_10 NO_STORE 753#define vstore_partial_3_11 NO_STORE 754#define vstore_partial_3_12 NO_STORE 755#define vstore_partial_3_13 NO_STORE 756#define vstore_partial_3_14 NO_STORE 757#define vstore_partial_3_15 NO_STORE 758#define vstore_partial_3_16 NO_STORE 759 760#define vstore_partial_4_0 NO_STORE 761#define vstore_partial_4_1 vstore_partial_1 762#define vstore_partial_4_2 vstore_partial_2 763#define vstore_partial_4_3 vstore_partial_3 764#define vstore_partial_4_4 vstore_partial_4 765#define vstore_partial_4_5 NO_STORE 766#define vstore_partial_4_6 NO_STORE 767#define vstore_partial_4_7 NO_STORE 768#define vstore_partial_4_8 NO_STORE 769#define vstore_partial_4_9 NO_STORE 770#define vstore_partial_4_10 NO_STORE 771#define vstore_partial_4_11 NO_STORE 772#define vstore_partial_4_12 NO_STORE 773#define vstore_partial_4_13 NO_STORE 774#define vstore_partial_4_14 NO_STORE 775#define vstore_partial_4_15 NO_STORE 776#define vstore_partial_4_16 NO_STORE 777 778#define vstore_partial_8_0 NO_STORE 779#define vstore_partial_8_1 vstore_partial_1 780#define vstore_partial_8_2 vstore_partial_2 781#define vstore_partial_8_3 vstore_partial_3 782#define vstore_partial_8_4 vstore_partial_4 783#define vstore_partial_8_5 vstore_partial_5 784#define vstore_partial_8_6 vstore_partial_6 785#define vstore_partial_8_7 vstore_partial_7 786#define vstore_partial_8_8 vstore_partial_8 787#define vstore_partial_8_9 NO_STORE 788#define vstore_partial_8_10 NO_STORE 789#define vstore_partial_8_11 NO_STORE 790#define vstore_partial_8_12 NO_STORE 791#define vstore_partial_8_13 NO_STORE 792#define vstore_partial_8_14 NO_STORE 793#define vstore_partial_8_15 NO_STORE 794#define vstore_partial_8_16 NO_STORE 795 796#define vstore_partial_16_0 NO_STORE 797#define vstore_partial_16_1 vstore_partial_1 798#define vstore_partial_16_2 vstore_partial_2 799#define vstore_partial_16_3 vstore_partial_3 800#define vstore_partial_16_4 vstore_partial_4 801#define vstore_partial_16_5 vstore_partial_5 802#define vstore_partial_16_6 vstore_partial_6 803#define vstore_partial_16_7 vstore_partial_7 804#define vstore_partial_16_8 vstore_partial_8 805#define vstore_partial_16_9 vstore_partial_9 806#define vstore_partial_16_10 vstore_partial_10 807#define vstore_partial_16_11 vstore_partial_11 808#define vstore_partial_16_12 vstore_partial_12 809#define vstore_partial_16_13 vstore_partial_13 810#define vstore_partial_16_14 vstore_partial_14 811#define vstore_partial_16_15 vstore_partial_15 812#define vstore_partial_16_16 vstore_partial_16 813 814 815#define vstore_partial_1(DATA, OFFSET, PTR) \ 816 vstore1(DATA.s0, OFFSET, PTR); 817 818#define vstore_partial_2(DATA, OFFSET, PTR) \ 819 vstore2(DATA.s01, OFFSET, PTR); 820 821#define vstore_partial_3(DATA, OFFSET, PTR) \ 822 vstore3(DATA.s012, OFFSET, PTR); 823 824#define vstore_partial_4(DATA, OFFSET, PTR) \ 825 vstore4(DATA.s0123, OFFSET, PTR); 826 827#define vstore_partial_5(DATA, OFFSET, PTR) \ 828 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 829 vstore1(DATA.s4, OFFSET, PTR + 4); 830 831#define vstore_partial_6(DATA, OFFSET, PTR) \ 832 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 833 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 834 835#define vstore_partial_7(DATA, OFFSET, PTR) \ 836 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 837 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 838 839#define vstore_partial_8(DATA, OFFSET, PTR) \ 840 vstore8(DATA.s01234567, OFFSET, PTR); 841 842#define vstore_partial_9(DATA, OFFSET, PTR) \ 843 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 844 vstore1(DATA.s8, OFFSET, PTR + 8); 845 846#define vstore_partial_10(DATA, OFFSET, PTR) \ 847 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 848 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 849 850#define vstore_partial_11(DATA, OFFSET, PTR) \ 851 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 852 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 853 854#define vstore_partial_12(DATA, OFFSET, PTR) \ 855 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 856 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 857 858#define vstore_partial_13(DATA, OFFSET, PTR) \ 859 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 860 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 861 862#define vstore_partial_14(DATA, OFFSET, PTR) \ 863 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 864 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 865 866#define vstore_partial_15(DATA, OFFSET, PTR) \ 867 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 868 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 869 870#define vstore_partial_16(DATA, OFFSET, PTR) \ 871 vstore16(DATA, OFFSET, PTR); 872 873 874 875 876 877#define convert_float_sat convert_float 878#define convert_float1_sat convert_float 879#define convert_float2_sat convert_float2 880#define convert_float3_sat convert_float3 881#define convert_float4_sat convert_float4 882#define convert_float8_sat convert_float8 883#define convert_float16_sat convert_float16 884#define convert_half_sat convert_float 885#define convert_half1_sat convert_half 886#define convert_half2_sat convert_half2 887#define convert_half3_sat convert_half3 888#define convert_half4_sat convert_half4 889#define convert_half8_sat convert_half8 890#define convert_half16_sat convert_half16 891 892#define convert_float1 convert_float 893#define convert_half1 convert_half 894#define convert_char1 convert_char 895#define convert_uchar1 convert_uchar 896#define convert_short1 convert_short 897#define convert_ushort1 convert_ushort 898#define convert_int1 convert_int 899#define convert_uint1 convert_uint 900#define convert_long1 convert_long 901#define convert_ulong1 convert_ulong 902#define convert_double1 convert_double 903 904#define convert_char1_sat convert_char_sat 905#define convert_uchar1_sat convert_uchar_sat 906#define convert_uchar2_sat convert_uchar2_sat 907#define convert_uchar3_sat convert_uchar3_sat 908#define convert_uchar4_sat convert_uchar4_sat 909#define convert_uchar8_sat convert_uchar8_sat 910#define convert_uchar16_sat convert_uchar16_sat 911#define convert_short1_sat convert_short_sat 912#define convert_ushort1_sat convert_ushort_sat 913#define convert_int1_sat convert_int_sat 914#define convert_uint1_sat convert_uint_sat 915#define convert_long1_sat convert_long_sat 916#define convert_ulong1_sat convert_ulong_sat 917#define convert_double1_sat convert_double_sat 918 919#define VEC_DATA_TYPE_STR(type, size) type##size 920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 921 922#define CONVERT_STR(x, type) (convert_##type((x))) 923#define CONVERT(x, type) CONVERT_STR(x, type) 924 925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 927 928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 930 931#define select_vec_dt_uchar(size) uchar##size 932#define select_vec_dt_char(size) char##size 933#define select_vec_dt_ushort(size) ushort##size 934#define select_vec_dt_short(size) short##size 935#define select_vec_dt_half(size) short##size 936#define select_vec_dt_uint(size) uint##size 937#define select_vec_dt_int(size) int##size 938#define select_vec_dt_float(size) int##size 939#define select_vec_dt_ulong(size) ulong##size 940#define select_vec_dt_long(size) long##size 941 942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 945 946#define signed_int_vec_dt_uchar(size) char##size 947#define signed_int_vec_dt_char(size) char##size 948#define signed_int_vec_dt_ushort(size) short##size 949#define signed_int_vec_dt_short(size) short##size 950#define signed_int_vec_dt_half(size) short##size 951#define signed_int_vec_dt_uint(size) int##size 952#define signed_int_vec_dt_int(size) int##size 953#define signed_int_vec_dt_float(size) int##size 954#define signed_int_vec_dt_ulong(size) long##size 955#define signed_int_vec_dt_long(size) long##size 956 957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 960 961#define sum_reduce_1(x) (x) 962#define sum_reduce_2(x) ((x).s0) + ((x).s1) 963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 967 968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 970 971#define prod_reduce_1(x) (x) 972#define prod_reduce_2(x) ((x).s0) * ((x).s1) 973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 977 978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 980 981#define max_reduce_1(x) (x) 982#define max_reduce_2(x) max(((x).s0), ((x).s1)) 983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 986#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 987 988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990 991#define VECTOR_DECLARATION(name) \ 992 __global uchar *name##_ptr, \ 993 uint name##_stride_x, \ 994 uint name##_step_x, \ 995 uint name##_offset_first_element_in_bytes 996 997#define IMAGE_DECLARATION(name) \ 998 __global uchar *name##_ptr, \ 999 uint name##_stride_x, \ 1000 uint name##_step_x, \ 1001 uint name##_stride_y, \ 1002 uint name##_step_y, \ 1003 uint name##_offset_first_element_in_bytes 1004 1005#define TENSOR3D_DECLARATION(name) \ 1006 __global uchar *name##_ptr, \ 1007 uint name##_stride_x, \ 1008 uint name##_step_x, \ 1009 uint name##_stride_y, \ 1010 uint name##_step_y, \ 1011 uint name##_stride_z, \ 1012 uint name##_step_z, \ 1013 uint name##_offset_first_element_in_bytes 1014 1015#define TENSOR4D_DECLARATION(name) \ 1016 __global uchar *name##_ptr, \ 1017 uint name##_stride_x, \ 1018 uint name##_step_x, \ 1019 uint name##_stride_y, \ 1020 uint name##_step_y, \ 1021 uint name##_stride_z, \ 1022 uint name##_step_z, \ 1023 uint name##_stride_w, \ 1024 uint name##_step_w, \ 1025 uint name##_offset_first_element_in_bytes 1026 1027#define TENSOR5D_DECLARATION(name) \ 1028 __global uchar *name##_ptr, \ 1029 uint name##_stride_x, \ 1030 uint name##_step_x, \ 1031 uint name##_stride_y, \ 1032 uint name##_step_y, \ 1033 uint name##_stride_z, \ 1034 uint name##_step_z, \ 1035 uint name##_stride_w, \ 1036 uint name##_step_w, \ 1037 uint name##_stride_v, \ 1038 uint name##_step_v, \ 1039 uint name##_offset_first_element_in_bytes 1040 1041#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043 1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046 1047#define CONVERT_TO_IMAGE_STRUCT(name) \ 1048 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049 1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052 1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055 1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058 1059#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061 1062#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064 name##_stride_z, name##_step_z) 1065 1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068 1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072 1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075 1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078 name##_stride_z, name##_step_z) 1079 1080 1081typedef struct Vector 1082{ 1083 __global uchar *ptr; 1084 int offset_first_element_in_bytes; 1085 int stride_x; 1086} Vector; 1087 1088 1089typedef struct Image 1090{ 1091 __global uchar *ptr; 1092 int offset_first_element_in_bytes; 1093 int stride_x; 1094 int stride_y; 1095} Image; 1096 1097 1098typedef struct Tensor3D 1099{ 1100 __global uchar *ptr; 1101 int offset_first_element_in_bytes; 1102 int stride_x; 1103 int stride_y; 1104 int stride_z; 1105} Tensor3D; 1106 1107 1108typedef struct Tensor4D 1109{ 1110 __global uchar *ptr; 1111 int offset_first_element_in_bytes; 1112 int stride_x; 1113 int stride_y; 1114 int stride_z; 1115 int stride_w; 1116} Tensor4D; 1117 1118 1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120{ 1121 Vector vector = 1122 { 1123 .ptr = ptr, 1124 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125 .stride_x = stride_x, 1126 }; 1127 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128 return vector; 1129} 1130 1131 1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133{ 1134 Image img = 1135 { 1136 .ptr = ptr, 1137 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138 .stride_x = stride_x, 1139 .stride_y = stride_y 1140 }; 1141 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142 return img; 1143} 1144 1145 1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147{ 1148 Image img = 1149 { 1150 .ptr = ptr, 1151 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152 .stride_x = stride_x, 1153 .stride_y = stride_y 1154 }; 1155 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156 return img; 1157} 1158 1159 1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161{ 1162 Tensor3D tensor = 1163 { 1164 .ptr = ptr, 1165 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166 .stride_x = stride_x, 1167 .stride_y = stride_y, 1168 .stride_z = stride_z 1169 }; 1170 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171 return tensor; 1172} 1173 1174 1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176{ 1177 Tensor3D tensor = 1178 { 1179 .ptr = ptr, 1180 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181 .stride_x = stride_x, 1182 .stride_y = stride_y, 1183 .stride_z = stride_z 1184 }; 1185 return tensor; 1186} 1187 1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189 uint step_w, 1190 uint mod_size) 1191{ 1192 Tensor4D tensor = 1193 { 1194 .ptr = ptr, 1195 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196 .stride_x = stride_x, 1197 .stride_y = stride_y, 1198 .stride_z = stride_z, 1199 .stride_w = stride_w 1200 }; 1201 1202 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203 return tensor; 1204} 1205 1206 1207inline __global const uchar *vector_offset(const Vector *vec, int x) 1208{ 1209 return vec->ptr + x * vec->stride_x; 1210} 1211 1212 1213inline __global uchar *offset(const Image *img, int x, int y) 1214{ 1215 return img->ptr + x * img->stride_x + y * img->stride_y; 1216} 1217 1218 1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220{ 1221 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222} 1223 1224 1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226{ 1227 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228} 1229 1230 1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232{ 1233 uint num_elements = width * height; 1234 1235 const uint z = index / num_elements; 1236 1237 index %= num_elements; 1238 1239 const uint y = index / width; 1240 1241 index %= width; 1242 1243 const uint x = index; 1244 1245 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246} 1247 1248#endif 1249 1250#if GPU_ARCH == GPU_ARCH_BIFROST 1251#define MLA(a, b, c) (fma(c, b, a)) 1252#else 1253#define MLA(a, b, c) ((b) * (c) + (a)) 1254#endif 1255 1256 1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 1258 1259 1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 1261 1262 1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) 1264 1265 1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 1267 1268 1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 1270 1271 1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 1273 1274 1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 1276 1277 1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 1279 1280 1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 1282 1283 1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 1285 1286 1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 1288 1289 1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 1291 1292 1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 1294 1295 1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) 1297 1298 1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 1300 1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1302 1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1304 1305#ifndef ARM_COMPUTE_HELPER_H 1306#define ARM_COMPUTE_HELPER_H 1307 1308 1309 1310 1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1312 VSTORE(N0) \ 1313 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1314 1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1316 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1317 VSTORE(N0) \ 1318 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1319 1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1321 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1322 VSTORE(N0) \ 1323 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1324 1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1326 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1327 VSTORE(N0) \ 1328 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1329 1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1331 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1332 VSTORE(N0) \ 1333 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1334 1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1336 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1337 VSTORE(N0) \ 1338 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1339 1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1341 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1342 VSTORE(N0) \ 1343 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1344 1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1346 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1347 VSTORE(N0) \ 1348 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1349 1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1351 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1352 VSTORE(N0) \ 1353 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1354 1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1356 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1357 VSTORE(N0) \ 1358 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1359 1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1361 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1362 VSTORE(N0) \ 1363 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1364 1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1366 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1367 VSTORE(N0) \ 1368 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1369 1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1371 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1372 VSTORE(N0) \ 1373 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1374 1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1376 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1377 VSTORE(N0) \ 1378 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1379 1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1381 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1382 VSTORE(N0) \ 1383 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1384 1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1386 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387 VSTORE(N0) \ 1388 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1389 1390 1391 1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1393 VSTORE(N0) \ 1394 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1395 1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1398 VSTORE(N0) \ 1399 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1400 1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1402 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1403 VSTORE(N0) \ 1404 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1405 1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1408 VSTORE(N0) \ 1409 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1410 1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1413 VSTORE(N0) \ 1414 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1415 1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1418 VSTORE(N0) \ 1419 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1420 1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1422 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1423 VSTORE(N0) \ 1424 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1425 1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1427 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1428 VSTORE(N0) \ 1429 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1430 1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1432 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1433 VSTORE(N0) \ 1434 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1435 1436#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1437 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1438 VSTORE(N0) \ 1439 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1440 1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1442 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443 VSTORE(N0) \ 1444 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1445 1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1447 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448 VSTORE(N0) \ 1449 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1450 1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1452 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1453 VSTORE(N0) \ 1454 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1455 1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1457 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1458 VSTORE(N0) \ 1459 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1460 1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1462 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1463 VSTORE(N0) \ 1464 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1465 1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1467 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1468 VSTORE(N0) \ 1469 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1470 1471 1472 1473 1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1476 1477 1478 1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1481 1482 1483 1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1485 VSTORE_PARTIAL(N0, STORE_N0) \ 1486 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1487 1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1489 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1490 VSTORE_PARTIAL(N0, STORE_N0) \ 1491 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1492 1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1494 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1495 VSTORE_PARTIAL(N0, STORE_N0) \ 1496 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1497 1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1499 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1500 VSTORE_PARTIAL(N0, STORE_N0) \ 1501 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1502 1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1504 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1505 VSTORE_PARTIAL(N0, STORE_N0) \ 1506 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1507 1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1509 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1510 VSTORE_PARTIAL(N0, STORE_N0) \ 1511 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1512 1513#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1514 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1515 VSTORE_PARTIAL(N0, STORE_N0) \ 1516 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1517 1518#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1519 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1520 VSTORE_PARTIAL(N0, STORE_N0) \ 1521 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1522 1523#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1524 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1525 VSTORE_PARTIAL(N0, STORE_N0) \ 1526 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1527 1528#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1529 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1530 VSTORE_PARTIAL(N0, STORE_N0) \ 1531 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1532 1533#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1534 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1535 VSTORE_PARTIAL(N0, STORE_N0) \ 1536 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1537 1538#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1539 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1540 VSTORE_PARTIAL(N0, STORE_N0) \ 1541 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1542 1543#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1544 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1545 VSTORE_PARTIAL(N0, STORE_N0) \ 1546 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1547 1548#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1549 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1550 VSTORE_PARTIAL(N0, STORE_N0) \ 1551 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1552 1553#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1554 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1555 VSTORE_PARTIAL(N0, STORE_N0) \ 1556 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1557 1558#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1559 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1560 VSTORE_PARTIAL(N0, STORE_N0) \ 1561 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1562 1563 1564 1565#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1566#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1567 1568#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1569 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1570 { \ 1571 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1572 } \ 1573 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1574 { \ 1575 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1576 } \ 1577 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1578 { \ 1579 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1580 } \ 1581 else \ 1582 { \ 1583 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1584 } 1585 1586#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1587 if(!(PARTIAL_COND_X)) \ 1588 { \ 1589 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1590 } \ 1591 else \ 1592 { \ 1593 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1594 } 1595 1596#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1597 if(!(PARTIAL_COND_Y)) \ 1598 { \ 1599 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1600 } \ 1601 else \ 1602 { \ 1603 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1604 } 1605 1606 1607#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1608 1609 1610#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1611 1612#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1613 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1614 1615#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1616 1617#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1618 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 1619 1620#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1621 1622#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1623 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1624 1625#else 1626 1627#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1628 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1629 1630#endif 1631 1632#endif 1633 1634 1635#if defined(PARTIAL_STORE_M0) 1636 1637#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1638 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1639#else 1640#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1641 ((uint)(y * M0)) 1642#endif 1643 1644 1645 1646#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1647 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1648 1649 1650#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1651#pragma OPENCL EXTENSION cl_khr_fp16 : enable 1652#endif 1653 1654#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1655#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1656#endif 1657 1658#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1659#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1660#endif 1661 1662#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1663#pragma OPENCL EXTENSION cl_arm_printf : enable 1664#endif 1665 1666#define GPU_ARCH_MIDGARD 0x100 1667#define GPU_ARCH_BIFROST 0x200 1668#define GPU_ARCH_VALHALL 0x300 1669 1670 1671#define CONCAT(a, b) a##b 1672 1673 1674#define EXPAND(x) x 1675 1676 1677#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1678 1679 1680#define REV1(x) ((x)) 1681#define REV2(x) ((x).s10) 1682#define REV3(x) ((x).s210) 1683#define REV4(x) ((x).s3210) 1684#define REV8(x) ((x).s76543210) 1685#define REV16(x) ((x).sFEDCBA9876543210) 1686 1687 1688 1689#define REVERSE_STR(x, s) REV##s((x)) 1690#define REVERSE(x, s) REVERSE_STR(x, s) 1691 1692 1693 1694#define ROT1_0(x) ((x)) 1695#define ROT1_1(x) ((x)) 1696 1697#define ROT2_0(x) ((x)) 1698#define ROT2_1(x) ((x).s10) 1699#define ROT2_2(x) ((x)) 1700 1701#define ROT3_0(x) ((x)) 1702#define ROT3_1(x) ((x).s201) 1703#define ROT3_2(x) ((x).s120) 1704#define ROT3_3(x) ((x)) 1705 1706#define ROT4_0(x) ((x)) 1707#define ROT4_1(x) ((x).s3012) 1708#define ROT4_2(x) ((x).s2301) 1709#define ROT4_3(x) ((x).s1230) 1710#define ROT4_4(x) ((x)) 1711 1712#define ROT8_0(x) ((x)) 1713#define ROT8_1(x) ((x).s70123456) 1714#define ROT8_2(x) ((x).s67012345) 1715#define ROT8_3(x) ((x).s56701234) 1716#define ROT8_4(x) ((x).s45670123) 1717#define ROT8_5(x) ((x).s34567012) 1718#define ROT8_6(x) ((x).s23456701) 1719#define ROT8_7(x) ((x).s12345670) 1720#define ROT8_8(x) ((x)) 1721 1722#define ROT16_0(x) ((x)) 1723#define ROT16_1(x) ((x).sF0123456789ABCDE) 1724#define ROT16_2(x) ((x).sEF0123456789ABCD) 1725#define ROT16_3(x) ((x).sDEF0123456789ABC) 1726#define ROT16_4(x) ((x).sCDEF0123456789AB) 1727#define ROT16_5(x) ((x).sBCDEF0123456789A) 1728#define ROT16_6(x) ((x).sABCDEF0123456789) 1729#define ROT16_7(x) ((x).s9ABCDEF012345678) 1730#define ROT16_8(x) ((x).s89ABCDEF01234567) 1731#define ROT16_9(x) ((x).s789ABCDEF0123456) 1732#define ROT16_10(x) ((x).s6789ABCDEF012345) 1733#define ROT16_11(x) ((x).s56789ABCDEF01234) 1734#define ROT16_12(x) ((x).s456789ABCDEF0123) 1735#define ROT16_13(x) ((x).s3456789ABCDEF012) 1736#define ROT16_14(x) ((x).s23456789ABCDEF01) 1737#define ROT16_15(x) ((x).s123456789ABCDEF0) 1738#define ROT16_16(x) ((x)) 1739 1740 1741 1742#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1743#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1744 1745 1746 1747#define V_OFFS1(dt) (dt##1)(0) 1748#define V_OFFS2(dt) (dt##2)(0, 1) 1749#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1750#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1751#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1752#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1753 1754 1755 1756#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1757#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1758 1759 1760#define VLOAD_STR(size) vload##size 1761#define VLOAD(size) VLOAD_STR(size) 1762 1763 1764#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1765#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1766 1767#define NO_LOAD(data, offs, ptr) \ 1768 { \ 1769 } 1770 1771 1772#define vload_partial_1_0 NO_LOAD 1773#define vload_partial_1_1 vload1 1774#define vload_partial_1_2 NO_LOAD 1775#define vload_partial_1_3 NO_LOAD 1776#define vload_partial_1_4 NO_LOAD 1777#define vload_partial_1_5 NO_LOAD 1778#define vload_partial_1_6 NO_LOAD 1779#define vload_partial_1_7 NO_LOAD 1780#define vload_partial_1_8 NO_LOAD 1781#define vload_partial_1_9 NO_LOAD 1782#define vload_partial_1_10 NO_LOAD 1783#define vload_partial_1_11 NO_LOAD 1784#define vload_partial_1_12 NO_LOAD 1785#define vload_partial_1_13 NO_LOAD 1786#define vload_partial_1_14 NO_LOAD 1787#define vload_partial_1_15 NO_LOAD 1788#define vload_partial_1_16 NO_LOAD 1789 1790#define vload_partial_2_0 NO_LOAD 1791#define vload_partial_2_1 vload_partial_1 1792#define vload_partial_2_2 vload_partial_2 1793#define vload_partial_2_3 NO_LOAD 1794#define vload_partial_2_4 NO_LOAD 1795#define vload_partial_2_5 NO_LOAD 1796#define vload_partial_2_6 NO_LOAD 1797#define vload_partial_2_7 NO_LOAD 1798#define vload_partial_2_8 NO_LOAD 1799#define vload_partial_2_9 NO_LOAD 1800#define vload_partial_2_10 NO_LOAD 1801#define vload_partial_2_11 NO_LOAD 1802#define vload_partial_2_12 NO_LOAD 1803#define vload_partial_2_13 NO_LOAD 1804#define vload_partial_2_14 NO_LOAD 1805#define vload_partial_2_15 NO_LOAD 1806#define vload_partial_2_16 NO_LOAD 1807 1808#define vload_partial_3_0 NO_LOAD 1809#define vload_partial_3_1 vload_partial_1 1810#define vload_partial_3_2 vload_partial_2 1811#define vload_partial_3_3 vload_partial_3 1812#define vload_partial_3_4 NO_LOAD 1813#define vload_partial_3_5 NO_LOAD 1814#define vload_partial_3_6 NO_LOAD 1815#define vload_partial_3_7 NO_LOAD 1816#define vload_partial_3_8 NO_LOAD 1817#define vload_partial_3_9 NO_LOAD 1818#define vload_partial_3_10 NO_LOAD 1819#define vload_partial_3_11 NO_LOAD 1820#define vload_partial_3_12 NO_LOAD 1821#define vload_partial_3_13 NO_LOAD 1822#define vload_partial_3_14 NO_LOAD 1823#define vload_partial_3_15 NO_LOAD 1824#define vload_partial_3_16 NO_LOAD 1825 1826#define vload_partial_4_0 NO_LOAD 1827#define vload_partial_4_1 vload_partial_1 1828#define vload_partial_4_2 vload_partial_2 1829#define vload_partial_4_3 vload_partial_3 1830#define vload_partial_4_4 vload_partial_4 1831#define vload_partial_4_5 NO_LOAD 1832#define vload_partial_4_6 NO_LOAD 1833#define vload_partial_4_7 NO_LOAD 1834#define vload_partial_4_8 NO_LOAD 1835#define vload_partial_4_9 NO_LOAD 1836#define vload_partial_4_10 NO_LOAD 1837#define vload_partial_4_11 NO_LOAD 1838#define vload_partial_4_12 NO_LOAD 1839#define vload_partial_4_13 NO_LOAD 1840#define vload_partial_4_14 NO_LOAD 1841#define vload_partial_4_15 NO_LOAD 1842#define vload_partial_4_16 NO_LOAD 1843 1844#define vload_partial_8_0 NO_LOAD 1845#define vload_partial_8_1 vload_partial_1 1846#define vload_partial_8_2 vload_partial_2 1847#define vload_partial_8_3 vload_partial_3 1848#define vload_partial_8_4 vload_partial_4 1849#define vload_partial_8_5 vload_partial_5 1850#define vload_partial_8_6 vload_partial_6 1851#define vload_partial_8_7 vload_partial_7 1852#define vload_partial_8_8 vload_partial_8 1853#define vload_partial_8_9 NO_LOAD 1854#define vload_partial_8_10 NO_LOAD 1855#define vload_partial_8_11 NO_LOAD 1856#define vload_partial_8_12 NO_LOAD 1857#define vload_partial_8_13 NO_LOAD 1858#define vload_partial_8_14 NO_LOAD 1859#define vload_partial_8_15 NO_LOAD 1860#define vload_partial_8_16 NO_LOAD 1861 1862#define vload_partial_16_0 NO_LOAD 1863#define vload_partial_16_1 vload_partial_1 1864#define vload_partial_16_2 vload_partial_2 1865#define vload_partial_16_3 vload_partial_3 1866#define vload_partial_16_4 vload_partial_4 1867#define vload_partial_16_5 vload_partial_5 1868#define vload_partial_16_6 vload_partial_6 1869#define vload_partial_16_7 vload_partial_7 1870#define vload_partial_16_8 vload_partial_8 1871#define vload_partial_16_9 vload_partial_9 1872#define vload_partial_16_10 vload_partial_10 1873#define vload_partial_16_11 vload_partial_11 1874#define vload_partial_16_12 vload_partial_12 1875#define vload_partial_16_13 vload_partial_13 1876#define vload_partial_16_14 vload_partial_14 1877#define vload_partial_16_15 vload_partial_15 1878#define vload_partial_16_16 vload_partial_16 1879 1880 1881#define vload_partial_1(DATA, OFFSET, PTR) \ 1882 DATA.s0 = vload1(OFFSET, PTR); 1883 1884#define vload_partial_2(DATA, OFFSET, PTR) \ 1885 DATA.s01 = vload2(OFFSET, PTR); 1886 1887#define vload_partial_3(DATA, OFFSET, PTR) \ 1888 DATA.s012 = vload3(OFFSET, PTR); 1889 1890#define vload_partial_4(DATA, OFFSET, PTR) \ 1891 DATA.s0123 = vload4(OFFSET, PTR); 1892 1893#define vload_partial_5(DATA, OFFSET, PTR) \ 1894 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1895 DATA.s4 = vload1(OFFSET, PTR + 4); 1896 1897#define vload_partial_6(DATA, OFFSET, PTR) \ 1898 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1899 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 1900 1901#define vload_partial_7(DATA, OFFSET, PTR) \ 1902 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1903 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 1904 1905#define vload_partial_8(DATA, OFFSET, PTR) \ 1906 DATA.s01234567 = vload8(OFFSET, PTR); 1907 1908#define vload_partial_9(DATA, OFFSET, PTR) \ 1909 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1910 DATA.s8 = vload1(OFFSET, PTR + 8); 1911 1912#define vload_partial_10(DATA, OFFSET, PTR) \ 1913 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1914 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 1915 1916#define vload_partial_11(DATA, OFFSET, PTR) \ 1917 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1918 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 1919 1920#define vload_partial_12(DATA, OFFSET, PTR) \ 1921 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1922 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 1923 1924#define vload_partial_13(DATA, OFFSET, PTR) \ 1925 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1926 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 1927 1928#define vload_partial_14(DATA, OFFSET, PTR) \ 1929 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1930 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 1931 1932#define vload_partial_15(DATA, OFFSET, PTR) \ 1933 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1934 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 1935 1936#define vload_partial_16(DATA, OFFSET, PTR) \ 1937 DATA = vload16(OFFSET, PTR); 1938 1939 1940 1941#define PIXEL_UNIT4 1 1942#define PIXEL_UNIT8 2 1943#define PIXEL_UNIT16 4 1944 1945 1946#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 1947#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 1948 1949 1950#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 1951#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 1952#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 1953 1954#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1955#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 1956#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 1957#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 1958#endif 1959 1960#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 1961#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1962#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1963 1964#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1965#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 1966#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1967#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1968#endif 1969 1970 1971#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 1972#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 1973 1974 1975#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 1976#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 1977 1978#define VSTORE_STR(size) vstore##size 1979#define VSTORE(size) VSTORE_STR(size) 1980 1981#define float1 float 1982#define half1 half 1983#define char1 char 1984#define uchar1 uchar 1985#define short1 short 1986#define ushort1 ushort 1987#define int1 int 1988#define uint1 uint 1989#define long1 long 1990#define ulong1 ulong 1991#define double1 double 1992 1993#define vload1(OFFSET, PTR) *(OFFSET + PTR) 1994#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 1995 1996 1997#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 1998#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 1999 2000#define NO_STORE(data, offs, ptr) \ 2001 { \ 2002 } 2003 2004 2005#define vstore_partial_1_0 NO_STORE 2006#define vstore_partial_1_1 vstore1 2007#define vstore_partial_1_2 NO_STORE 2008#define vstore_partial_1_3 NO_STORE 2009#define vstore_partial_1_4 NO_STORE 2010#define vstore_partial_1_5 NO_STORE 2011#define vstore_partial_1_6 NO_STORE 2012#define vstore_partial_1_7 NO_STORE 2013#define vstore_partial_1_8 NO_STORE 2014#define vstore_partial_1_9 NO_STORE 2015#define vstore_partial_1_10 NO_STORE 2016#define vstore_partial_1_11 NO_STORE 2017#define vstore_partial_1_12 NO_STORE 2018#define vstore_partial_1_13 NO_STORE 2019#define vstore_partial_1_14 NO_STORE 2020#define vstore_partial_1_15 NO_STORE 2021#define vstore_partial_1_16 NO_STORE 2022 2023#define vstore_partial_2_0 NO_STORE 2024#define vstore_partial_2_1 vstore_partial_1 2025#define vstore_partial_2_2 vstore_partial_2 2026#define vstore_partial_2_3 NO_STORE 2027#define vstore_partial_2_4 NO_STORE 2028#define vstore_partial_2_5 NO_STORE 2029#define vstore_partial_2_6 NO_STORE 2030#define vstore_partial_2_7 NO_STORE 2031#define vstore_partial_2_8 NO_STORE 2032#define vstore_partial_2_9 NO_STORE 2033#define vstore_partial_2_10 NO_STORE 2034#define vstore_partial_2_11 NO_STORE 2035#define vstore_partial_2_12 NO_STORE 2036#define vstore_partial_2_13 NO_STORE 2037#define vstore_partial_2_14 NO_STORE 2038#define vstore_partial_2_15 NO_STORE 2039#define vstore_partial_2_16 NO_STORE 2040 2041#define vstore_partial_3_0 NO_STORE 2042#define vstore_partial_3_1 vstore_partial_1 2043#define vstore_partial_3_2 vstore_partial_2 2044#define vstore_partial_3_3 vstore_partial_3 2045#define vstore_partial_3_4 NO_STORE 2046#define vstore_partial_3_5 NO_STORE 2047#define vstore_partial_3_6 NO_STORE 2048#define vstore_partial_3_7 NO_STORE 2049#define vstore_partial_3_8 NO_STORE 2050#define vstore_partial_3_9 NO_STORE 2051#define vstore_partial_3_10 NO_STORE 2052#define vstore_partial_3_11 NO_STORE 2053#define vstore_partial_3_12 NO_STORE 2054#define vstore_partial_3_13 NO_STORE 2055#define vstore_partial_3_14 NO_STORE 2056#define vstore_partial_3_15 NO_STORE 2057#define vstore_partial_3_16 NO_STORE 2058 2059#define vstore_partial_4_0 NO_STORE 2060#define vstore_partial_4_1 vstore_partial_1 2061#define vstore_partial_4_2 vstore_partial_2 2062#define vstore_partial_4_3 vstore_partial_3 2063#define vstore_partial_4_4 vstore_partial_4 2064#define vstore_partial_4_5 NO_STORE 2065#define vstore_partial_4_6 NO_STORE 2066#define vstore_partial_4_7 NO_STORE 2067#define vstore_partial_4_8 NO_STORE 2068#define vstore_partial_4_9 NO_STORE 2069#define vstore_partial_4_10 NO_STORE 2070#define vstore_partial_4_11 NO_STORE 2071#define vstore_partial_4_12 NO_STORE 2072#define vstore_partial_4_13 NO_STORE 2073#define vstore_partial_4_14 NO_STORE 2074#define vstore_partial_4_15 NO_STORE 2075#define vstore_partial_4_16 NO_STORE 2076 2077#define vstore_partial_8_0 NO_STORE 2078#define vstore_partial_8_1 vstore_partial_1 2079#define vstore_partial_8_2 vstore_partial_2 2080#define vstore_partial_8_3 vstore_partial_3 2081#define vstore_partial_8_4 vstore_partial_4 2082#define vstore_partial_8_5 vstore_partial_5 2083#define vstore_partial_8_6 vstore_partial_6 2084#define vstore_partial_8_7 vstore_partial_7 2085#define vstore_partial_8_8 vstore_partial_8 2086#define vstore_partial_8_9 NO_STORE 2087#define vstore_partial_8_10 NO_STORE 2088#define vstore_partial_8_11 NO_STORE 2089#define vstore_partial_8_12 NO_STORE 2090#define vstore_partial_8_13 NO_STORE 2091#define vstore_partial_8_14 NO_STORE 2092#define vstore_partial_8_15 NO_STORE 2093#define vstore_partial_8_16 NO_STORE 2094 2095#define vstore_partial_16_0 NO_STORE 2096#define vstore_partial_16_1 vstore_partial_1 2097#define vstore_partial_16_2 vstore_partial_2 2098#define vstore_partial_16_3 vstore_partial_3 2099#define vstore_partial_16_4 vstore_partial_4 2100#define vstore_partial_16_5 vstore_partial_5 2101#define vstore_partial_16_6 vstore_partial_6 2102#define vstore_partial_16_7 vstore_partial_7 2103#define vstore_partial_16_8 vstore_partial_8 2104#define vstore_partial_16_9 vstore_partial_9 2105#define vstore_partial_16_10 vstore_partial_10 2106#define vstore_partial_16_11 vstore_partial_11 2107#define vstore_partial_16_12 vstore_partial_12 2108#define vstore_partial_16_13 vstore_partial_13 2109#define vstore_partial_16_14 vstore_partial_14 2110#define vstore_partial_16_15 vstore_partial_15 2111#define vstore_partial_16_16 vstore_partial_16 2112 2113 2114#define vstore_partial_1(DATA, OFFSET, PTR) \ 2115 vstore1(DATA.s0, OFFSET, PTR); 2116 2117#define vstore_partial_2(DATA, OFFSET, PTR) \ 2118 vstore2(DATA.s01, OFFSET, PTR); 2119 2120#define vstore_partial_3(DATA, OFFSET, PTR) \ 2121 vstore3(DATA.s012, OFFSET, PTR); 2122 2123#define vstore_partial_4(DATA, OFFSET, PTR) \ 2124 vstore4(DATA.s0123, OFFSET, PTR); 2125 2126#define vstore_partial_5(DATA, OFFSET, PTR) \ 2127 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2128 vstore1(DATA.s4, OFFSET, PTR + 4); 2129 2130#define vstore_partial_6(DATA, OFFSET, PTR) \ 2131 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2132 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 2133 2134#define vstore_partial_7(DATA, OFFSET, PTR) \ 2135 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2136 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 2137 2138#define vstore_partial_8(DATA, OFFSET, PTR) \ 2139 vstore8(DATA.s01234567, OFFSET, PTR); 2140 2141#define vstore_partial_9(DATA, OFFSET, PTR) \ 2142 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2143 vstore1(DATA.s8, OFFSET, PTR + 8); 2144 2145#define vstore_partial_10(DATA, OFFSET, PTR) \ 2146 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2147 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 2148 2149#define vstore_partial_11(DATA, OFFSET, PTR) \ 2150 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2151 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 2152 2153#define vstore_partial_12(DATA, OFFSET, PTR) \ 2154 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2155 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 2156 2157#define vstore_partial_13(DATA, OFFSET, PTR) \ 2158 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2159 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 2160 2161#define vstore_partial_14(DATA, OFFSET, PTR) \ 2162 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2163 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 2164 2165#define vstore_partial_15(DATA, OFFSET, PTR) \ 2166 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2167 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 2168 2169#define vstore_partial_16(DATA, OFFSET, PTR) \ 2170 vstore16(DATA, OFFSET, PTR); 2171 2172 2173 2174 2175 2176#define convert_float_sat convert_float 2177#define convert_float1_sat convert_float 2178#define convert_float2_sat convert_float2 2179#define convert_float3_sat convert_float3 2180#define convert_float4_sat convert_float4 2181#define convert_float8_sat convert_float8 2182#define convert_float16_sat convert_float16 2183#define convert_half_sat convert_float 2184#define convert_half1_sat convert_half 2185#define convert_half2_sat convert_half2 2186#define convert_half3_sat convert_half3 2187#define convert_half4_sat convert_half4 2188#define convert_half8_sat convert_half8 2189#define convert_half16_sat convert_half16 2190 2191#define convert_float1 convert_float 2192#define convert_half1 convert_half 2193#define convert_char1 convert_char 2194#define convert_uchar1 convert_uchar 2195#define convert_short1 convert_short 2196#define convert_ushort1 convert_ushort 2197#define convert_int1 convert_int 2198#define convert_uint1 convert_uint 2199#define convert_long1 convert_long 2200#define convert_ulong1 convert_ulong 2201#define convert_double1 convert_double 2202 2203#define convert_char1_sat convert_char_sat 2204#define convert_uchar1_sat convert_uchar_sat 2205#define convert_uchar2_sat convert_uchar2_sat 2206#define convert_uchar3_sat convert_uchar3_sat 2207#define convert_uchar4_sat convert_uchar4_sat 2208#define convert_uchar8_sat convert_uchar8_sat 2209#define convert_uchar16_sat convert_uchar16_sat 2210#define convert_short1_sat convert_short_sat 2211#define convert_ushort1_sat convert_ushort_sat 2212#define convert_int1_sat convert_int_sat 2213#define convert_uint1_sat convert_uint_sat 2214#define convert_long1_sat convert_long_sat 2215#define convert_ulong1_sat convert_ulong_sat 2216#define convert_double1_sat convert_double_sat 2217 2218#define VEC_DATA_TYPE_STR(type, size) type##size 2219#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 2220 2221#define CONVERT_STR(x, type) (convert_##type((x))) 2222#define CONVERT(x, type) CONVERT_STR(x, type) 2223 2224#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 2225#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 2226 2227#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 2228#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 2229 2230#define select_vec_dt_uchar(size) uchar##size 2231#define select_vec_dt_char(size) char##size 2232#define select_vec_dt_ushort(size) ushort##size 2233#define select_vec_dt_short(size) short##size 2234#define select_vec_dt_half(size) short##size 2235#define select_vec_dt_uint(size) uint##size 2236#define select_vec_dt_int(size) int##size 2237#define select_vec_dt_float(size) int##size 2238#define select_vec_dt_ulong(size) ulong##size 2239#define select_vec_dt_long(size) long##size 2240 2241#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 2242#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 2243#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 2244 2245#define signed_int_vec_dt_uchar(size) char##size 2246#define signed_int_vec_dt_char(size) char##size 2247#define signed_int_vec_dt_ushort(size) short##size 2248#define signed_int_vec_dt_short(size) short##size 2249#define signed_int_vec_dt_half(size) short##size 2250#define signed_int_vec_dt_uint(size) int##size 2251#define signed_int_vec_dt_int(size) int##size 2252#define signed_int_vec_dt_float(size) int##size 2253#define signed_int_vec_dt_ulong(size) long##size 2254#define signed_int_vec_dt_long(size) long##size 2255 2256#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 2257#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 2258#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 2259 2260#define sum_reduce_1(x) (x) 2261#define sum_reduce_2(x) ((x).s0) + ((x).s1) 2262#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 2263#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 2264#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 2265#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 2266 2267#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 2268#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 2269 2270#define prod_reduce_1(x) (x) 2271#define prod_reduce_2(x) ((x).s0) * ((x).s1) 2272#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 2273#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 2274#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 2275#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 2276 2277#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 2278#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 2279 2280#define max_reduce_1(x) (x) 2281#define max_reduce_2(x) max(((x).s0), ((x).s1)) 2282#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 2283#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 2284#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 2285#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 2286 2287#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 2288#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 2289 2290#define VECTOR_DECLARATION(name) \ 2291 __global uchar *name##_ptr, \ 2292 uint name##_stride_x, \ 2293 uint name##_step_x, \ 2294 uint name##_offset_first_element_in_bytes 2295 2296#define IMAGE_DECLARATION(name) \ 2297 __global uchar *name##_ptr, \ 2298 uint name##_stride_x, \ 2299 uint name##_step_x, \ 2300 uint name##_stride_y, \ 2301 uint name##_step_y, \ 2302 uint name##_offset_first_element_in_bytes 2303 2304#define TENSOR3D_DECLARATION(name) \ 2305 __global uchar *name##_ptr, \ 2306 uint name##_stride_x, \ 2307 uint name##_step_x, \ 2308 uint name##_stride_y, \ 2309 uint name##_step_y, \ 2310 uint name##_stride_z, \ 2311 uint name##_step_z, \ 2312 uint name##_offset_first_element_in_bytes 2313 2314#define TENSOR4D_DECLARATION(name) \ 2315 __global uchar *name##_ptr, \ 2316 uint name##_stride_x, \ 2317 uint name##_step_x, \ 2318 uint name##_stride_y, \ 2319 uint name##_step_y, \ 2320 uint name##_stride_z, \ 2321 uint name##_step_z, \ 2322 uint name##_stride_w, \ 2323 uint name##_step_w, \ 2324 uint name##_offset_first_element_in_bytes 2325 2326#define TENSOR5D_DECLARATION(name) \ 2327 __global uchar *name##_ptr, \ 2328 uint name##_stride_x, \ 2329 uint name##_step_x, \ 2330 uint name##_stride_y, \ 2331 uint name##_step_y, \ 2332 uint name##_stride_z, \ 2333 uint name##_step_z, \ 2334 uint name##_stride_w, \ 2335 uint name##_step_w, \ 2336 uint name##_stride_v, \ 2337 uint name##_step_v, \ 2338 uint name##_offset_first_element_in_bytes 2339 2340#define CONVERT_TO_VECTOR_STRUCT(name) \ 2341 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 2342 2343#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 2344 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 2345 2346#define CONVERT_TO_IMAGE_STRUCT(name) \ 2347 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 2348 2349#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 2350 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 2351 2352#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2353 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2354 2355#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 2356 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 2357 2358#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2359 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2360 2361#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 2362 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2363 name##_stride_z, name##_step_z) 2364 2365#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 2366 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 2367 2368#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 2369 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2370 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 2371 2372#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 2373 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 2374 2375#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 2376 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2377 name##_stride_z, name##_step_z) 2378 2379 2380typedef struct Vector 2381{ 2382 __global uchar *ptr; 2383 int offset_first_element_in_bytes; 2384 int stride_x; 2385} Vector; 2386 2387 2388typedef struct Image 2389{ 2390 __global uchar *ptr; 2391 int offset_first_element_in_bytes; 2392 int stride_x; 2393 int stride_y; 2394} Image; 2395 2396 2397typedef struct Tensor3D 2398{ 2399 __global uchar *ptr; 2400 int offset_first_element_in_bytes; 2401 int stride_x; 2402 int stride_y; 2403 int stride_z; 2404} Tensor3D; 2405 2406 2407typedef struct Tensor4D 2408{ 2409 __global uchar *ptr; 2410 int offset_first_element_in_bytes; 2411 int stride_x; 2412 int stride_y; 2413 int stride_z; 2414 int stride_w; 2415} Tensor4D; 2416 2417 2418inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 2419{ 2420 Vector vector = 2421 { 2422 .ptr = ptr, 2423 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2424 .stride_x = stride_x, 2425 }; 2426 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 2427 return vector; 2428} 2429 2430 2431inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 2432{ 2433 Image img = 2434 { 2435 .ptr = ptr, 2436 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2437 .stride_x = stride_x, 2438 .stride_y = stride_y 2439 }; 2440 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 2441 return img; 2442} 2443 2444 2445inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2446{ 2447 Image img = 2448 { 2449 .ptr = ptr, 2450 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2451 .stride_x = stride_x, 2452 .stride_y = stride_y 2453 }; 2454 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2455 return img; 2456} 2457 2458 2459inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2460{ 2461 Tensor3D tensor = 2462 { 2463 .ptr = ptr, 2464 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2465 .stride_x = stride_x, 2466 .stride_y = stride_y, 2467 .stride_z = stride_z 2468 }; 2469 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2470 return tensor; 2471} 2472 2473 2474inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2475{ 2476 Tensor3D tensor = 2477 { 2478 .ptr = ptr, 2479 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2480 .stride_x = stride_x, 2481 .stride_y = stride_y, 2482 .stride_z = stride_z 2483 }; 2484 return tensor; 2485} 2486 2487inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 2488 uint step_w, 2489 uint mod_size) 2490{ 2491 Tensor4D tensor = 2492 { 2493 .ptr = ptr, 2494 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2495 .stride_x = stride_x, 2496 .stride_y = stride_y, 2497 .stride_z = stride_z, 2498 .stride_w = stride_w 2499 }; 2500 2501 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 2502 return tensor; 2503} 2504 2505 2506inline __global const uchar *vector_offset(const Vector *vec, int x) 2507{ 2508 return vec->ptr + x * vec->stride_x; 2509} 2510 2511 2512inline __global uchar *offset(const Image *img, int x, int y) 2513{ 2514 return img->ptr + x * img->stride_x + y * img->stride_y; 2515} 2516 2517 2518inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 2519{ 2520 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 2521} 2522 2523 2524inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 2525{ 2526 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 2527} 2528 2529 2530inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 2531{ 2532 uint num_elements = width * height; 2533 2534 const uint z = index / num_elements; 2535 2536 index %= num_elements; 2537 2538 const uint y = index / width; 2539 2540 index %= width; 2541 2542 const uint x = index; 2543 2544 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 2545} 2546 2547#endif 2548 2549 2550#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) 2551#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) 2552 2553 2554#define scalar_access_0_1(x) ((x).s0) 2555#define scalar_access_0_2(x) ((x).s01) 2556#define scalar_access_0_3(x) ((x).s012) 2557#define scalar_access_0_4(x) ((x).s0123) 2558#define scalar_access_0_8(x) ((x).s01234567) 2559#define scalar_access_0_16(x) ((x).s0123456789ABCDEF) 2560 2561 2562#define scalar_access_1_1(x) ((x).s1) 2563#define scalar_access_1_2(x) ((x).s12) 2564#define scalar_access_1_3(x) ((x).s123) 2565#define scalar_access_1_4(x) ((x).s1234) 2566#define scalar_access_1_8(x) ((x).s12345678) 2567 2568 2569#define scalar_access_2_1(x) ((x).s2) 2570#define scalar_access_2_2(x) ((x).s23) 2571#define scalar_access_2_3(x) ((x).s234) 2572#define scalar_access_2_4(x) ((x).s2345) 2573#define scalar_access_2_8(x) ((x).s23456789) 2574 2575 2576#define scalar_access_3_1(x) ((x).s3) 2577#define scalar_access_3_2(x) ((x).s34) 2578#define scalar_access_3_3(x) ((x).s345) 2579#define scalar_access_3_4(x) ((x).s3456) 2580#define scalar_access_3_8(x) ((x).s3456789A) 2581 2582 2583#define scalar_access_4_1(x) ((x).s4) 2584#define scalar_access_4_2(x) ((x).s45) 2585#define scalar_access_4_3(x) ((x).s456) 2586#define scalar_access_4_4(x) ((x).s4567) 2587#define scalar_access_4_8(x) ((x).s456789AB) 2588 2589 2590#define scalar_access_8_1(x) ((x).s8) 2591#define scalar_access_8_2(x) ((x).s89) 2592#define scalar_access_8_3(x) ((x).s89A) 2593#define scalar_access_8_4(x) ((x).s89AB) 2594#define scalar_access_8_8(x) ((x).s89ABCDEF) 2595 2596 2597#define scalar_access_12_1(x) ((x).sC) 2598#define scalar_access_12_2(x) ((x).sCD) 2599#define scalar_access_12_3(x) ((x).sCDE) 2600#define scalar_access_12_4(x) ((x).sCDEF) 2601 2602 2603#define scalar_access_16_1(x) ((x).sF) 2604 2605 2606#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2607 ({}) 2608 2609#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2610 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 2611 2612#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2613 LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2614 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 2615 2616#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2617 LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2618 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 2619 2620#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2621 LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2622 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 2623 2624#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2625 LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2626 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 2627 2628#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2629 LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2630 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 2631 2632#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2633 LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2634 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 2635 2636#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2637 LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2638 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 2639 2640#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2641 LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2642 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 2643 2644#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2645 LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2646 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 2647 2648#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2649 LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2650 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 2651 2652#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2653 LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2654 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 2655 2656#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2657 LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2658 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 2659 2660#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2661 LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2662 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 2663 2664#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2665 LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2666 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 2667 2668#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2669 LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \ 2670 SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 2671 2672 2673 2674#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 2675#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) 2676 2677 2678 2679#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2680 ({}) 2681 2682#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2683 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2684 2685#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2686 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2687 2688#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2689 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2690 2691#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2692 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2693 2694#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2695 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2696 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 2697 2698#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2699 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2700 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 2701 2702#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2703 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2704 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin); 2705 2706#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2707 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2708 2709#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2710 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin); \ 2711 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 2712 2713#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2714 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2715 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 2716 2717#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2718 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2719 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 2720 2721#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2722 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2723 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); 2724 2725#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2726 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2727 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 2728 LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 2729 2730#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2731 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr 0, src_stride_y, zin); \ 2732 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 2733 LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 2734 2735#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2736 LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \ 2737 LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \ 2738 LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin); 2739 2740#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ 2741 LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); 2742 2743 2744 2745#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2746#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2747 2748 2749#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2750 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2751 BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 2752 2753#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2754 LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2755 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2756 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2757 2758#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2759 LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2760 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2761 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2762 2763#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2764 LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2765 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2766 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2767 2768#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2769 LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2770 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2771 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2772 2773#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2774 LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2775 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2776 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2777 2778#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2779 LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2780 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2781 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 2782 2783#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2784 LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2785 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2786 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2787 2788#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2789 LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2790 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2791 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2792 2793#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2794 LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2795 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2796 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2797 2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2799 LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2800 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2801 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2802 2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2804 LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2805 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2806 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2807 2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2809 LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2810 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2811 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2812 2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2814 LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2815 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2816 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2817 2818#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2819 LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2820 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2821 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2822 2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2824 LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2825 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2826 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2827 2828 2829 2830 2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2833 2834 2835 2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2837 VLOAD_PARTIAL(N0, LOAD_N0) \ 2838 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 2839 2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2841 LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2842 VLOAD_PARTIAL(N0, LOAD_N0) \ 2843 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2844 2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2846 LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2847 VLOAD_PARTIAL(N0, LOAD_N0) \ 2848 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2849 2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2851 LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2852 VLOAD_PARTIAL(N0, LOAD_N0) \ 2853 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2854 2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2856 LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2857 VLOAD_PARTIAL(N0, LOAD_N0) \ 2858 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2859 2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2861 LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2862 VLOAD_PARTIAL(N0, LOAD_N0) \ 2863 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2864 2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2866 LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2867 VLOAD_PARTIAL(N0, LOAD_N0) \ 2868 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 2869 2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2871 LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2872 VLOAD_PARTIAL(N0, LOAD_N0) \ 2873 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2874 2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2876 LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2877 VLOAD_PARTIAL(N0, LOAD_N0) \ 2878 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2879 2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2881 LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2882 VLOAD_PARTIAL(N0, LOAD_N0) \ 2883 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2884 2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2886 LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2887 VLOAD_PARTIAL(N0, LOAD_N0) \ 2888 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2889 2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2891 LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2892 VLOAD_PARTIAL(N0, LOAD_N0) \ 2893 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2894 2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2896 LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2897 VLOAD_PARTIAL(N0, LOAD_N0) \ 2898 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2899 2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2901 LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2902 VLOAD_PARTIAL(N0, LOAD_N0) \ 2903 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2904 2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2906 LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2907 VLOAD_PARTIAL(N0, LOAD_N0) \ 2908 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2909 2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2911 LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2912 VLOAD_PARTIAL(N0, LOAD_N0) \ 2913 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2914 2915 2916 2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2919 2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2921 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 2922 { \ 2923 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2924 } \ 2925 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 2926 { \ 2927 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2928 } \ 2929 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 2930 { \ 2931 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2932 } \ 2933 else \ 2934 { \ 2935 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2936 } 2937 2938#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 2939 if(!(PARTIAL_COND_X)) \ 2940 { \ 2941 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2942 } \ 2943 else \ 2944 { \ 2945 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2946 } 2947 2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 2949 if(!(PARTIAL_COND_Y)) \ 2950 { \ 2951 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2952 } \ 2953 else \ 2954 { \ 2955 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2956 } 2957 2958 2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2960 2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2962 LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2963 2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 2965 2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2967 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2968 LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 2969 2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 2971 2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2973 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2974 LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 2975 2976#else 2977 2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2979 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2980 LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 2981 2982#endif 2983 2984 2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2986 BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) 2987 2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2989 LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2990 BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) 2991 2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2993 LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2994 BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) 2995 2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2997 LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2998 BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) 2999 3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3001 LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3002 BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) 3003 3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3005 LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3006 BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW)) 3007 3008#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3009 LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3010 BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW)) 3011 3012#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3013 LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3014 BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW)) 3015 3016#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3017 LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3018 BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW)) 3019 3020#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3021 LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3022 BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW)) 3023 3024#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3025 LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3026 BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW)) 3027 3028#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3029 LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3030 BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW)) 3031 3032#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3033 LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3034 BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW)) 3035 3036#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3037 LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3038 BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW)) 3039 3040#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3041 LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3042 BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW)) 3043 3044#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3045 LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3046 BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW)) 3047 3048 3049 3050#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 3051#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) 3052 3053 3054 3055#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3056 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3057 BASENAME##0; \ 3058 if(Y_MASK##0 != 0) \ 3059 BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \ 3060 else \ 3061 BASENAME##0 = 0; 3062 3063#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3064 LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3065 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3066 BASENAME##1; \ 3067 if(Y_MASK##1 != 0) \ 3068 BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \ 3069 else \ 3070 BASENAME##1 = 0; 3071 3072#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3073 LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3074 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3075 BASENAME##2; \ 3076 if(Y_MASK##2 != 0) \ 3077 BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \ 3078 else \ 3079 BASENAME##2 = 0; 3080 3081#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3082 LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3083 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3084 BASENAME##3; \ 3085 if(Y_MASK##3 != 0) \ 3086 BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \ 3087 else \ 3088 BASENAME##3 = 0; 3089 3090#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3091 LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3092 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3093 BASENAME##4; \ 3094 if(Y_MASK##4 != 0) \ 3095 BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \ 3096 else \ 3097 BASENAME##4 = 0; 3098 3099#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3100 LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3101 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3102 BASENAME##5; \ 3103 if(Y_MASK##5 != 0) \ 3104 BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \ 3105 else \ 3106 BASENAME##5 = 0; 3107 3108#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3109 LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3110 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3111 BASENAME##6; \ 3112 if(Y_MASK##6 != 0) \ 3113 BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \ 3114 else \ 3115 BASENAME##6 = 0; 3116 3117#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3118 LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3119 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3120 BASENAME##7; \ 3121 if(Y_MASK##7 != 0) \ 3122 BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \ 3123 else \ 3124 BASENAME##7 = 0; 3125 3126#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3127 LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3128 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3129 BASENAME##8; \ 3130 if(Y_MASK##8 != 0) \ 3131 BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \ 3132 else \ 3133 BASENAME##8 = 0; 3134 3135#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3136 LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3137 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3138 BASENAME##9; \ 3139 if(Y_MASK##9 != 0) \ 3140 BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \ 3141 else \ 3142 BASENAME##9 = 0; 3143 3144#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3145 LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3146 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3147 BASENAME##A; \ 3148 if(Y_MASK##A != 0) \ 3149 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \ 3150 else \ 3151 BASENAME##A = 0; 3152 3153#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3154 LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3155 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3156 BASENAME##B; \ 3157 if(Y_MASK##B != 0) \ 3158 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \ 3159 else \ 3160 BASENAME##B = 0; 3161 3162#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3163 LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3164 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3165 BASENAME##C; \ 3166 if(Y_MASK##C != 0) \ 3167 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \ 3168 else \ 3169 BASENAME##C = 0; 3170 3171#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3172 LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3173 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3174 BASENAME##D; \ 3175 if(Y_MASK##D != 0) \ 3176 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \ 3177 else \ 3178 BASENAME##D = 0; 3179 3180#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3181 LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3182 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3183 BASENAME##E; \ 3184 if(Y_MASK##E != 0) \ 3185 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \ 3186 else \ 3187 BASENAME##E = 0; 3188 3189#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3190 LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \ 3191 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3192 BASENAME##F; \ 3193 if(Y_MASK##F != 0) \ 3194 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \ 3195 else \ 3196 BASENAME##F = 0; 3197 3198 3199#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 3200#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) 3201 3202 3203#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3204 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3205 BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y)); 3206 3207#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3208 LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3209 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3210 BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y)); 3211 3212#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3213 LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3214 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3215 BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y)); 3216 3217#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3218 LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3219 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3220 BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 3221 3222#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3223 LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3224 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3225 BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 3226 3227#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3228 LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3229 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3230 BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 3231 3232#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3233 LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3234 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3235 BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 3236 3237#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3238 LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3239 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3240 BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 3241 3242#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3243 LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3244 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3245 BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 3246 3247#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3248 LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3249 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3250 BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 3251 3252#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3253 LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3254 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3255 BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 3256 3257#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3258 LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3259 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3260 BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 3261 3262#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3263 LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3264 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3265 BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 3266 3267#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3268 LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3269 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3270 BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); 3271 3272#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3273 LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3274 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3275 BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 3276 3277#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3278 LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3279 VEC_DATA_TYPE(DATA_TYPE, N0) \ 3280 BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 3281 3282 3283 3284 3285#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3286#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3287 3288 3289 3290#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3291 Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3292 Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 3293 Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 3294 3295#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3296 CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3297 Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3298 Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 3299 Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 3300 3301#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3302 CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3303 Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3304 Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 3305 Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 3306 3307#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3308 CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3309 Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3310 Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 3311 Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 3312 3313#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3314 CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3315 Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3316 Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 3317 Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 3318 3319#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3320 CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3321 Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3322 Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 3323 Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 3324 3325#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3326 CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3327 Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3328 Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 3329 Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 3330 3331#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3332 CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3333 Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3334 Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 3335 Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 3336 3337 3338 3339 3340#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3341#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3342 3343 3344 3345#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3346 BASENAME##0 *= (DATA_TYPE)SCALE; 3347 3348#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3349 SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3350 BASENAME##1 *= (DATA_TYPE)SCALE; 3351 3352#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3353 SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3354 BASENAME##2 *= (DATA_TYPE)SCALE; 3355 3356#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3357 SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3358 BASENAME##3 *= (DATA_TYPE)SCALE; 3359 3360#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3361 SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3362 BASENAME##4 *= (DATA_TYPE)SCALE; 3363 3364#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3365 SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3366 BASENAME##5 *= (DATA_TYPE)SCALE; 3367 3368#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3369 SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3370 BASENAME##6 *= (DATA_TYPE)SCALE; 3371 3372#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3373 SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3374 BASENAME##7 *= (DATA_TYPE)SCALE; 3375 3376#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3377 SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3378 BASENAME##8 *= (DATA_TYPE)SCALE; 3379 3380#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3381 SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3382 BASENAME##9 *= (DATA_TYPE)SCALE; 3383 3384#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3385 SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3386 BASENAME##A *= (DATA_TYPE)SCALE; 3387 3388#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3389 SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3390 BASENAME##B *= (DATA_TYPE)SCALE; 3391 3392#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3393 SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3394 BASENAME##C *= (DATA_TYPE)SCALE; 3395 3396#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3397 SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3398 BASENAME##D *= (DATA_TYPE)SCALE; 3399 3400#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3401 SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3402 BASENAME##E *= (DATA_TYPE)SCALE; 3403 3404#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 3405 SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3406 BASENAME##F *= (DATA_TYPE)SCALE; 3407 3408 3409 3410#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 3411#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 3412 3413 3414 3415#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 3416 TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 3417#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 3418 VEC_DATA_TYPE(TYPE, 2) \ 3419 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 3420#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 3421 VEC_DATA_TYPE(TYPE, 3) \ 3422 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 3423#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 3424 VEC_DATA_TYPE(TYPE, 4) \ 3425 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 3426#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 3427 VEC_DATA_TYPE(TYPE, 8) \ 3428 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 3429#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 3430 VEC_DATA_TYPE(TYPE, 16) \ 3431 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 3432 3433 3434 3435#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 3436 TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 3437#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 3438 VEC_DATA_TYPE(TYPE, 2) \ 3439 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 3440#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 3441 VEC_DATA_TYPE(TYPE, 3) \ 3442 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 3443#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 3444 VEC_DATA_TYPE(TYPE, 4) \ 3445 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 3446#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 3447 VEC_DATA_TYPE(TYPE, 8) \ 3448 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 3449#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 3450 VEC_DATA_TYPE(TYPE, 16) \ 3451 BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 3452 3453 3454 3455#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 3456 COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 3457#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 3458 COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 3459 COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 3460#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 3461 TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 3462 COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 3463#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 3464 TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 3465 COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 3466#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 3467 TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 3468 COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 3469 COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 3470 COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \ 3471 COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 3472#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \ 3473 TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \ 3474 COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \ 3475 COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \ 3476 COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \ 3477 COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \ 3478 COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \ 3479 COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \ 3480 COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \ 3481 COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE); 3482 3483 3484 3485 3486#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \ 3487 CONCAT(COLUMN_VECTOR, K0) \ 3488 (IDX_COL, BASENAME, BS, TYPE); 3489 3490 3491#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \ 3492 CONCAT(COLUMN_VECTOR_SCALAR, K0) \ 3493 (IDX_COL, BASENAME, BS, TYPE); 3494 3495 3496#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \ 3497 CONCAT(TRANSPOSE_K0X, N0) \ 3498 (K0, BASENAME, BS, TYPE); 3499 3500 3501#define ADD_ROW_1(BASENAME, BIAS) \ 3502 BASENAME##0 += BIAS##0; 3503 3504#define ADD_ROW_2(BASENAME, BIAS) \ 3505 ADD_ROW_1(BASENAME, BIAS) \ 3506 BASENAME##1 += BIAS##1; 3507 3508#define ADD_ROW_3(BASENAME, BIAS) \ 3509 ADD_ROW_2(BASENAME, BIAS) \ 3510 BASENAME##2 += BIAS##2; 3511 3512#define ADD_ROW_4(BASENAME, BIAS) \ 3513 ADD_ROW_3(BASENAME, BIAS) \ 3514 BASENAME##3 += BIAS##3; 3515 3516#define ADD_ROW_5(BASENAME, BIAS) \ 3517 ADD_ROW_4(BASENAME, BIAS) \ 3518 BASENAME##4 += BIAS##4; 3519 3520#define ADD_ROW_6(BASENAME, BIAS) \ 3521 ADD_ROW_5(BASENAME, BIAS) \ 3522 BASENAME##5 += BIAS##5; 3523 3524#define ADD_ROW_7(BASENAME, BIAS) \ 3525 ADD_ROW_6(BASENAME, BIAS) \ 3526 BASENAME##6 += BIAS##6; 3527 3528#define ADD_ROW_8(BASENAME, BIAS) \ 3529 ADD_ROW_7(BASENAME, BIAS) \ 3530 BASENAME##7 += BIAS##7; 3531 3532#define ADD_ROW_9(BASENAME, BIAS) \ 3533 ADD_ROW_8(BASENAME, BIAS) \ 3534 BASENAME##8 += BIAS##8; 3535 3536#define ADD_ROW_10(BASENAME, BIAS) \ 3537 ADD_ROW_9(BASENAME, BIAS) \ 3538 BASENAME##9 += BIAS##9; 3539 3540#define ADD_ROW_11(BASENAME, BIAS) \ 3541 ADD_ROW_10(BASENAME, BIAS) \ 3542 BASENAME##A += BIAS##A; 3543 3544#define ADD_ROW_12(BASENAME, BIAS) \ 3545 ADD_ROW_11(BASENAME, BIAS) \ 3546 BASENAME##B += BIAS##B; 3547 3548#define ADD_ROW_13(BASENAME, BIAS) \ 3549 ADD_ROW_12(BASENAME, BIAS) \ 3550 BASENAME##C += BIAS##C; 3551 3552#define ADD_ROW_14(BASENAME, BIAS) \ 3553 ADD_ROW_13(BASENAME, BIAS) \ 3554 BASENAME##D += BIAS##D; 3555 3556#define ADD_ROW_15(BASENAME, BIAS) \ 3557 ADD_ROW_14(BASENAME, BIAS) \ 3558 BASENAME##E += BIAS##E; 3559 3560#define ADD_ROW_16(BASENAME, BIAS) \ 3561 ADD_ROW_15(BASENAME, BIAS) \ 3562 BASENAME##F += BIAS##F; 3563 3564 3565 3566 3567#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS) 3568#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS) 3569 3570 3571 3572#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 3573 BASENAME##0 += BIAS; 3574 3575#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 3576 ADD_ROW_BROADCAST_1(BASENAME, BIAS) \ 3577 BASENAME##1 += BIAS; 3578 3579#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 3580 ADD_ROW_BROADCAST_2(BASENAME, BIAS) \ 3581 BASENAME##2 += BIAS; 3582 3583#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 3584 ADD_ROW_BROADCAST_3(BASENAME, BIAS) \ 3585 BASENAME##3 += BIAS; 3586 3587#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 3588 ADD_ROW_BROADCAST_4(BASENAME, BIAS) \ 3589 BASENAME##4 += BIAS; 3590 3591#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3592 ADD_ROW_BROADCAST_5(BASENAME, BIAS) \ 3593 BASENAME##5 += BIAS; 3594 3595#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3596 ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3597 BASENAME##6 += BIAS; 3598 3599#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3600 ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3601 BASENAME##7 += BIAS; 3602 3603#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3604 ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3605 BASENAME##8 += BIAS; 3606 3607#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3608 ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3609 BASENAME##9 += BIAS; 3610 3611#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3612 ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3613 BASENAME##A += BIAS; 3614 3615#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3616 ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3617 BASENAME##B += BIAS; 3618 3619#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3620 ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3621 BASENAME##C += BIAS; 3622 3623#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3624 ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3625 BASENAME##D += BIAS; 3626 3627#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3628 ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3629 BASENAME##E += BIAS; 3630 3631#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 3632 ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3633 BASENAME##F += BIAS; 3634 3635 3636#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS) 3637#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) 3638 3639 3640 3641#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3642 BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL); 3643 3644#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3645 ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3646 BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL); 3647 3648#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3649 ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3650 BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL); 3651 3652#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3653 ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3654 BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL); 3655 3656#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3657 ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3658 BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL); 3659 3660#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3661 ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3662 BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL); 3663 3664#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3665 ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3666 BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL); 3667 3668#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3669 ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3670 BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL); 3671 3672#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3673 ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3674 BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL); 3675 3676#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3677 ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3678 BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL); 3679 3680#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3681 ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3682 BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL); 3683 3684#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3685 ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3686 BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL); 3687 3688#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3689 ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3690 BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL); 3691 3692#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3693 ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3694 BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL); 3695 3696#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3697 ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3698 BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL); 3699 3700#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3701 ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \ 3702 BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL); 3703 3704 3705 3706#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 3707#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) 3708 3709 3710 3711#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3712 VEC_DATA_TYPE(DATA_TYPE, N) \ 3713 BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N)); 3714 3715#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3716 CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3717 VEC_DATA_TYPE(DATA_TYPE, N) \ 3718 BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N)); 3719 3720#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3721 CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3722 VEC_DATA_TYPE(DATA_TYPE, N) \ 3723 BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N)); 3724 3725#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3726 CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3727 VEC_DATA_TYPE(DATA_TYPE, N) \ 3728 BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N)); 3729 3730#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3731 CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3732 VEC_DATA_TYPE(DATA_TYPE, N) \ 3733 BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N)); 3734 3735#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3736 CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3737 VEC_DATA_TYPE(DATA_TYPE, N) \ 3738 BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N)); 3739 3740#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3741 CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3742 VEC_DATA_TYPE(DATA_TYPE, N) \ 3743 BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N)); 3744 3745#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3746 CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3747 VEC_DATA_TYPE(DATA_TYPE, N) \ 3748 BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N)); 3749 3750#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3751 CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3752 VEC_DATA_TYPE(DATA_TYPE, N) \ 3753 BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N)); 3754 3755#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3756 CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3757 VEC_DATA_TYPE(DATA_TYPE, N) \ 3758 BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N)); 3759 3760#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3761 CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3762 VEC_DATA_TYPE(DATA_TYPE, N) \ 3763 BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N)); 3764 3765#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3766 CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3767 VEC_DATA_TYPE(DATA_TYPE, N) \ 3768 BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N)); 3769 3770#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3771 CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3772 VEC_DATA_TYPE(DATA_TYPE, N) \ 3773 BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N)); 3774 3775#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3776 CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3777 VEC_DATA_TYPE(DATA_TYPE, N) \ 3778 BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N)); 3779 3780#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3781 CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3782 VEC_DATA_TYPE(DATA_TYPE, N) \ 3783 BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N)); 3784 3785#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3786 CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \ 3787 VEC_DATA_TYPE(DATA_TYPE, N) \ 3788 BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N)); 3789 3790 3791 3792#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3793#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) 3794 3795 3796#ifndef ARM_COMPUTE_HELPERS_ASYMM_H 3797#define ARM_COMPUTE_HELPERS_ASYMM_H 3798 3799 3800#ifndef ARM_COMPUTE_HELPER_H 3801#define ARM_COMPUTE_HELPER_H 3802 3803 3804 3805 3806#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3807 VSTORE(N0) \ 3808 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3809 3810#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3811 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3812 VSTORE(N0) \ 3813 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3814 3815#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3816 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3817 VSTORE(N0) \ 3818 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3819 3820#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3821 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3822 VSTORE(N0) \ 3823 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3824 3825#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3826 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3827 VSTORE(N0) \ 3828 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3829 3830#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3831 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3832 VSTORE(N0) \ 3833 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3834 3835#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3836 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3837 VSTORE(N0) \ 3838 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3839 3840#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3841 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3842 VSTORE(N0) \ 3843 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3844 3845#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3846 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3847 VSTORE(N0) \ 3848 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3849 3850#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3851 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3852 VSTORE(N0) \ 3853 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3854 3855#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3856 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3857 VSTORE(N0) \ 3858 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3859 3860#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3861 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3862 VSTORE(N0) \ 3863 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3864 3865#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3866 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3867 VSTORE(N0) \ 3868 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3869 3870#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3871 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3872 VSTORE(N0) \ 3873 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3874 3875#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3876 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3877 VSTORE(N0) \ 3878 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3879 3880#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3881 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3882 VSTORE(N0) \ 3883 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3884 3885 3886 3887#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3888 VSTORE(N0) \ 3889 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3890 3891#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3892 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3893 VSTORE(N0) \ 3894 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3895 3896#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3897 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3898 VSTORE(N0) \ 3899 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3900 3901#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3902 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3903 VSTORE(N0) \ 3904 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3905 3906#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3907 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3908 VSTORE(N0) \ 3909 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3910 3911#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3912 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3913 VSTORE(N0) \ 3914 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3915 3916#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3917 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3918 VSTORE(N0) \ 3919 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3920 3921#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3922 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3923 VSTORE(N0) \ 3924 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3925 3926#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3927 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3928 VSTORE(N0) \ 3929 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3930 3931#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 3932 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3933 VSTORE(N0) \ 3934 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3935 3936#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3937 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3938 VSTORE(N0) \ 3939 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3940 3941#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3942 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3943 VSTORE(N0) \ 3944 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3945 3946#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3947 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3948 VSTORE(N0) \ 3949 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3950 3951#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3952 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3953 VSTORE(N0) \ 3954 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3955 3956#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3957 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3958 VSTORE(N0) \ 3959 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3960 3961#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3962 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3963 VSTORE(N0) \ 3964 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3965 3966 3967 3968 3969#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3970#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3971 3972 3973 3974#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3975#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3976 3977 3978 3979#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3980 VSTORE_PARTIAL(N0, STORE_N0) \ 3981 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3982 3983#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3984 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3985 VSTORE_PARTIAL(N0, STORE_N0) \ 3986 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3987 3988#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3989 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3990 VSTORE_PARTIAL(N0, STORE_N0) \ 3991 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3992 3993#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3994 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3995 VSTORE_PARTIAL(N0, STORE_N0) \ 3996 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3997 3998#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3999 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4000 VSTORE_PARTIAL(N0, STORE_N0) \ 4001 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 4002 4003#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4004 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4005 VSTORE_PARTIAL(N0, STORE_N0) \ 4006 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 4007 4008#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4009 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4010 VSTORE_PARTIAL(N0, STORE_N0) \ 4011 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 4012 4013#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4014 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4015 VSTORE_PARTIAL(N0, STORE_N0) \ 4016 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 4017 4018#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4019 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4020 VSTORE_PARTIAL(N0, STORE_N0) \ 4021 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 4022 4023#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4024 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4025 VSTORE_PARTIAL(N0, STORE_N0) \ 4026 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 4027 4028#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4029 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4030 VSTORE_PARTIAL(N0, STORE_N0) \ 4031 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 4032 4033#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4034 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4035 VSTORE_PARTIAL(N0, STORE_N0) \ 4036 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 4037 4038#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4039 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4040 VSTORE_PARTIAL(N0, STORE_N0) \ 4041 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 4042 4043#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4044 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4045 VSTORE_PARTIAL(N0, STORE_N0) \ 4046 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 4047 4048#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4049 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4050 VSTORE_PARTIAL(N0, STORE_N0) \ 4051 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 4052 4053#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4054 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4055 VSTORE_PARTIAL(N0, STORE_N0) \ 4056 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 4057 4058 4059 4060#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4061#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4062 4063#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4064 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 4065 { \ 4066 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4067 } \ 4068 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 4069 { \ 4070 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4071 } \ 4072 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 4073 { \ 4074 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4075 } \ 4076 else \ 4077 { \ 4078 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4079 } 4080 4081#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 4082 if(!(PARTIAL_COND_X)) \ 4083 { \ 4084 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4085 } \ 4086 else \ 4087 { \ 4088 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4089 } 4090 4091#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 4092 if(!(PARTIAL_COND_Y)) \ 4093 { \ 4094 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4095 } \ 4096 else \ 4097 { \ 4098 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4099 } 4100 4101 4102#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 4103 4104 4105#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 4106 4107#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4108 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4109 4110#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 4111 4112#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4113 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 4114 4115#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 4116 4117#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4118 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 4119 4120#else 4121 4122#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4123 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 4124 4125#endif 4126 4127#endif 4128 4129 4130#if defined(PARTIAL_STORE_M0) 4131 4132#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4133 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 4134#else 4135#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4136 ((uint)(y * M0)) 4137#endif 4138 4139 4140 4141#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 4142 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 4143 4144 4145#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4146#pragma OPENCL EXTENSION cl_khr_fp16 : enable 4147#endif 4148 4149#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 4150#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 4151#endif 4152 4153#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 4154#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 4155#endif 4156 4157#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 4158#pragma OPENCL EXTENSION cl_arm_printf : enable 4159#endif 4160 4161#define GPU_ARCH_MIDGARD 0x100 4162#define GPU_ARCH_BIFROST 0x200 4163#define GPU_ARCH_VALHALL 0x300 4164 4165 4166#define CONCAT(a, b) a##b 4167 4168 4169#define EXPAND(x) x 4170 4171 4172#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 4173 4174 4175#define REV1(x) ((x)) 4176#define REV2(x) ((x).s10) 4177#define REV3(x) ((x).s210) 4178#define REV4(x) ((x).s3210) 4179#define REV8(x) ((x).s76543210) 4180#define REV16(x) ((x).sFEDCBA9876543210) 4181 4182 4183 4184#define REVERSE_STR(x, s) REV##s((x)) 4185#define REVERSE(x, s) REVERSE_STR(x, s) 4186 4187 4188 4189#define ROT1_0(x) ((x)) 4190#define ROT1_1(x) ((x)) 4191 4192#define ROT2_0(x) ((x)) 4193#define ROT2_1(x) ((x).s10) 4194#define ROT2_2(x) ((x)) 4195 4196#define ROT3_0(x) ((x)) 4197#define ROT3_1(x) ((x).s201) 4198#define ROT3_2(x) ((x).s120) 4199#define ROT3_3(x) ((x)) 4200 4201#define ROT4_0(x) ((x)) 4202#define ROT4_1(x) ((x).s3012) 4203#define ROT4_2(x) ((x).s2301) 4204#define ROT4_3(x) ((x).s1230) 4205#define ROT4_4(x) ((x)) 4206 4207#define ROT8_0(x) ((x)) 4208#define ROT8_1(x) ((x).s70123456) 4209#define ROT8_2(x) ((x).s67012345) 4210#define ROT8_3(x) ((x).s56701234) 4211#define ROT8_4(x) ((x).s45670123) 4212#define ROT8_5(x) ((x).s34567012) 4213#define ROT8_6(x) ((x).s23456701) 4214#define ROT8_7(x) ((x).s12345670) 4215#define ROT8_8(x) ((x)) 4216 4217#define ROT16_0(x) ((x)) 4218#define ROT16_1(x) ((x).sF0123456789ABCDE) 4219#define ROT16_2(x) ((x).sEF0123456789ABCD) 4220#define ROT16_3(x) ((x).sDEF0123456789ABC) 4221#define ROT16_4(x) ((x).sCDEF0123456789AB) 4222#define ROT16_5(x) ((x).sBCDEF0123456789A) 4223#define ROT16_6(x) ((x).sABCDEF0123456789) 4224#define ROT16_7(x) ((x).s9ABCDEF012345678) 4225#define ROT16_8(x) ((x).s89ABCDEF01234567) 4226#define ROT16_9(x) ((x).s789ABCDEF0123456) 4227#define ROT16_10(x) ((x).s6789ABCDEF012345) 4228#define ROT16_11(x) ((x).s56789ABCDEF01234) 4229#define ROT16_12(x) ((x).s456789ABCDEF0123) 4230#define ROT16_13(x) ((x).s3456789ABCDEF012) 4231#define ROT16_14(x) ((x).s23456789ABCDEF01) 4232#define ROT16_15(x) ((x).s123456789ABCDEF0) 4233#define ROT16_16(x) ((x)) 4234 4235 4236 4237#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 4238#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 4239 4240 4241 4242#define V_OFFS1(dt) (dt##1)(0) 4243#define V_OFFS2(dt) (dt##2)(0, 1) 4244#define V_OFFS3(dt) (dt##3)(0, 1, 2) 4245#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 4246#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 4247#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 4248 4249 4250 4251#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 4252#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 4253 4254 4255#define VLOAD_STR(size) vload##size 4256#define VLOAD(size) VLOAD_STR(size) 4257 4258 4259#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 4260#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 4261 4262#define NO_LOAD(data, offs, ptr) \ 4263 { \ 4264 } 4265 4266 4267#define vload_partial_1_0 NO_LOAD 4268#define vload_partial_1_1 vload1 4269#define vload_partial_1_2 NO_LOAD 4270#define vload_partial_1_3 NO_LOAD 4271#define vload_partial_1_4 NO_LOAD 4272#define vload_partial_1_5 NO_LOAD 4273#define vload_partial_1_6 NO_LOAD 4274#define vload_partial_1_7 NO_LOAD 4275#define vload_partial_1_8 NO_LOAD 4276#define vload_partial_1_9 NO_LOAD 4277#define vload_partial_1_10 NO_LOAD 4278#define vload_partial_1_11 NO_LOAD 4279#define vload_partial_1_12 NO_LOAD 4280#define vload_partial_1_13 NO_LOAD 4281#define vload_partial_1_14 NO_LOAD 4282#define vload_partial_1_15 NO_LOAD 4283#define vload_partial_1_16 NO_LOAD 4284 4285#define vload_partial_2_0 NO_LOAD 4286#define vload_partial_2_1 vload_partial_1 4287#define vload_partial_2_2 vload_partial_2 4288#define vload_partial_2_3 NO_LOAD 4289#define vload_partial_2_4 NO_LOAD 4290#define vload_partial_2_5 NO_LOAD 4291#define vload_partial_2_6 NO_LOAD 4292#define vload_partial_2_7 NO_LOAD 4293#define vload_partial_2_8 NO_LOAD 4294#define vload_partial_2_9 NO_LOAD 4295#define vload_partial_2_10 NO_LOAD 4296#define vload_partial_2_11 NO_LOAD 4297#define vload_partial_2_12 NO_LOAD 4298#define vload_partial_2_13 NO_LOAD 4299#define vload_partial_2_14 NO_LOAD 4300#define vload_partial_2_15 NO_LOAD 4301#define vload_partial_2_16 NO_LOAD 4302 4303#define vload_partial_3_0 NO_LOAD 4304#define vload_partial_3_1 vload_partial_1 4305#define vload_partial_3_2 vload_partial_2 4306#define vload_partial_3_3 vload_partial_3 4307#define vload_partial_3_4 NO_LOAD 4308#define vload_partial_3_5 NO_LOAD 4309#define vload_partial_3_6 NO_LOAD 4310#define vload_partial_3_7 NO_LOAD 4311#define vload_partial_3_8 NO_LOAD 4312#define vload_partial_3_9 NO_LOAD 4313#define vload_partial_3_10 NO_LOAD 4314#define vload_partial_3_11 NO_LOAD 4315#define vload_partial_3_12 NO_LOAD 4316#define vload_partial_3_13 NO_LOAD 4317#define vload_partial_3_14 NO_LOAD 4318#define vload_partial_3_15 NO_LOAD 4319#define vload_partial_3_16 NO_LOAD 4320 4321#define vload_partial_4_0 NO_LOAD 4322#define vload_partial_4_1 vload_partial_1 4323#define vload_partial_4_2 vload_partial_2 4324#define vload_partial_4_3 vload_partial_3 4325#define vload_partial_4_4 vload_partial_4 4326#define vload_partial_4_5 NO_LOAD 4327#define vload_partial_4_6 NO_LOAD 4328#define vload_partial_4_7 NO_LOAD 4329#define vload_partial_4_8 NO_LOAD 4330#define vload_partial_4_9 NO_LOAD 4331#define vload_partial_4_10 NO_LOAD 4332#define vload_partial_4_11 NO_LOAD 4333#define vload_partial_4_12 NO_LOAD 4334#define vload_partial_4_13 NO_LOAD 4335#define vload_partial_4_14 NO_LOAD 4336#define vload_partial_4_15 NO_LOAD 4337#define vload_partial_4_16 NO_LOAD 4338 4339#define vload_partial_8_0 NO_LOAD 4340#define vload_partial_8_1 vload_partial_1 4341#define vload_partial_8_2 vload_partial_2 4342#define vload_partial_8_3 vload_partial_3 4343#define vload_partial_8_4 vload_partial_4 4344#define vload_partial_8_5 vload_partial_5 4345#define vload_partial_8_6 vload_partial_6 4346#define vload_partial_8_7 vload_partial_7 4347#define vload_partial_8_8 vload_partial_8 4348#define vload_partial_8_9 NO_LOAD 4349#define vload_partial_8_10 NO_LOAD 4350#define vload_partial_8_11 NO_LOAD 4351#define vload_partial_8_12 NO_LOAD 4352#define vload_partial_8_13 NO_LOAD 4353#define vload_partial_8_14 NO_LOAD 4354#define vload_partial_8_15 NO_LOAD 4355#define vload_partial_8_16 NO_LOAD 4356 4357#define vload_partial_16_0 NO_LOAD 4358#define vload_partial_16_1 vload_partial_1 4359#define vload_partial_16_2 vload_partial_2 4360#define vload_partial_16_3 vload_partial_3 4361#define vload_partial_16_4 vload_partial_4 4362#define vload_partial_16_5 vload_partial_5 4363#define vload_partial_16_6 vload_partial_6 4364#define vload_partial_16_7 vload_partial_7 4365#define vload_partial_16_8 vload_partial_8 4366#define vload_partial_16_9 vload_partial_9 4367#define vload_partial_16_10 vload_partial_10 4368#define vload_partial_16_11 vload_partial_11 4369#define vload_partial_16_12 vload_partial_12 4370#define vload_partial_16_13 vload_partial_13 4371#define vload_partial_16_14 vload_partial_14 4372#define vload_partial_16_15 vload_partial_15 4373#define vload_partial_16_16 vload_partial_16 4374 4375 4376#define vload_partial_1(DATA, OFFSET, PTR) \ 4377 DATA.s0 = vload1(OFFSET, PTR); 4378 4379#define vload_partial_2(DATA, OFFSET, PTR) \ 4380 DATA.s01 = vload2(OFFSET, PTR); 4381 4382#define vload_partial_3(DATA, OFFSET, PTR) \ 4383 DATA.s012 = vload3(OFFSET, PTR); 4384 4385#define vload_partial_4(DATA, OFFSET, PTR) \ 4386 DATA.s0123 = vload4(OFFSET, PTR); 4387 4388#define vload_partial_5(DATA, OFFSET, PTR) \ 4389 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4390 DATA.s4 = vload1(OFFSET, PTR + 4); 4391 4392#define vload_partial_6(DATA, OFFSET, PTR) \ 4393 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4394 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 4395 4396#define vload_partial_7(DATA, OFFSET, PTR) \ 4397 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4398 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 4399 4400#define vload_partial_8(DATA, OFFSET, PTR) \ 4401 DATA.s01234567 = vload8(OFFSET, PTR); 4402 4403#define vload_partial_9(DATA, OFFSET, PTR) \ 4404 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4405 DATA.s8 = vload1(OFFSET, PTR + 8); 4406 4407#define vload_partial_10(DATA, OFFSET, PTR) \ 4408 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4409 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 4410 4411#define vload_partial_11(DATA, OFFSET, PTR) \ 4412 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4413 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 4414 4415#define vload_partial_12(DATA, OFFSET, PTR) \ 4416 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4417 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 4418 4419#define vload_partial_13(DATA, OFFSET, PTR) \ 4420 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4421 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 4422 4423#define vload_partial_14(DATA, OFFSET, PTR) \ 4424 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4425 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 4426 4427#define vload_partial_15(DATA, OFFSET, PTR) \ 4428 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4429 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 4430 4431#define vload_partial_16(DATA, OFFSET, PTR) \ 4432 DATA = vload16(OFFSET, PTR); 4433 4434 4435 4436#define PIXEL_UNIT4 1 4437#define PIXEL_UNIT8 2 4438#define PIXEL_UNIT16 4 4439 4440 4441#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 4442#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 4443 4444 4445#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 4446#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 4447#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 4448 4449#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4450#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 4451#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 4452#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 4453#endif 4454 4455#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 4456#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4457#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4458 4459#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4460#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 4461#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4462#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4463#endif 4464 4465 4466#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 4467#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 4468 4469 4470#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 4471#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 4472 4473#define VSTORE_STR(size) vstore##size 4474#define VSTORE(size) VSTORE_STR(size) 4475 4476#define float1 float 4477#define half1 half 4478#define char1 char 4479#define uchar1 uchar 4480#define short1 short 4481#define ushort1 ushort 4482#define int1 int 4483#define uint1 uint 4484#define long1 long 4485#define ulong1 ulong 4486#define double1 double 4487 4488#define vload1(OFFSET, PTR) *(OFFSET + PTR) 4489#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 4490 4491 4492#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 4493#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 4494 4495#define NO_STORE(data, offs, ptr) \ 4496 { \ 4497 } 4498 4499 4500#define vstore_partial_1_0 NO_STORE 4501#define vstore_partial_1_1 vstore1 4502#define vstore_partial_1_2 NO_STORE 4503#define vstore_partial_1_3 NO_STORE 4504#define vstore_partial_1_4 NO_STORE 4505#define vstore_partial_1_5 NO_STORE 4506#define vstore_partial_1_6 NO_STORE 4507#define vstore_partial_1_7 NO_STORE 4508#define vstore_partial_1_8 NO_STORE 4509#define vstore_partial_1_9 NO_STORE 4510#define vstore_partial_1_10 NO_STORE 4511#define vstore_partial_1_11 NO_STORE 4512#define vstore_partial_1_12 NO_STORE 4513#define vstore_partial_1_13 NO_STORE 4514#define vstore_partial_1_14 NO_STORE 4515#define vstore_partial_1_15 NO_STORE 4516#define vstore_partial_1_16 NO_STORE 4517 4518#define vstore_partial_2_0 NO_STORE 4519#define vstore_partial_2_1 vstore_partial_1 4520#define vstore_partial_2_2 vstore_partial_2 4521#define vstore_partial_2_3 NO_STORE 4522#define vstore_partial_2_4 NO_STORE 4523#define vstore_partial_2_5 NO_STORE 4524#define vstore_partial_2_6 NO_STORE 4525#define vstore_partial_2_7 NO_STORE 4526#define vstore_partial_2_8 NO_STORE 4527#define vstore_partial_2_9 NO_STORE 4528#define vstore_partial_2_10 NO_STORE 4529#define vstore_partial_2_11 NO_STORE 4530#define vstore_partial_2_12 NO_STORE 4531#define vstore_partial_2_13 NO_STORE 4532#define vstore_partial_2_14 NO_STORE 4533#define vstore_partial_2_15 NO_STORE 4534#define vstore_partial_2_16 NO_STORE 4535 4536#define vstore_partial_3_0 NO_STORE 4537#define vstore_partial_3_1 vstore_partial_1 4538#define vstore_partial_3_2 vstore_partial_2 4539#define vstore_partial_3_3 vstore_partial_3 4540#define vstore_partial_3_4 NO_STORE 4541#define vstore_partial_3_5 NO_STORE 4542#define vstore_partial_3_6 NO_STORE 4543#define vstore_partial_3_7 NO_STORE 4544#define vstore_partial_3_8 NO_STORE 4545#define vstore_partial_3_9 NO_STORE 4546#define vstore_partial_3_10 NO_STORE 4547#define vstore_partial_3_11 NO_STORE 4548#define vstore_partial_3_12 NO_STORE 4549#define vstore_partial_3_13 NO_STORE 4550#define vstore_partial_3_14 NO_STORE 4551#define vstore_partial_3_15 NO_STORE 4552#define vstore_partial_3_16 NO_STORE 4553 4554#define vstore_partial_4_0 NO_STORE 4555#define vstore_partial_4_1 vstore_partial_1 4556#define vstore_partial_4_2 vstore_partial_2 4557#define vstore_partial_4_3 vstore_partial_3 4558#define vstore_partial_4_4 vstore_partial_4 4559#define vstore_partial_4_5 NO_STORE 4560#define vstore_partial_4_6 NO_STORE 4561#define vstore_partial_4_7 NO_STORE 4562#define vstore_partial_4_8 NO_STORE 4563#define vstore_partial_4_9 NO_STORE 4564#define vstore_partial_4_10 NO_STORE 4565#define vstore_partial_4_11 NO_STORE 4566#define vstore_partial_4_12 NO_STORE 4567#define vstore_partial_4_13 NO_STORE 4568#define vstore_partial_4_14 NO_STORE 4569#define vstore_partial_4_15 NO_STORE 4570#define vstore_partial_4_16 NO_STORE 4571 4572#define vstore_partial_8_0 NO_STORE 4573#define vstore_partial_8_1 vstore_partial_1 4574#define vstore_partial_8_2 vstore_partial_2 4575#define vstore_partial_8_3 vstore_partial_3 4576#define vstore_partial_8_4 vstore_partial_4 4577#define vstore_partial_8_5 vstore_partial_5 4578#define vstore_partial_8_6 vstore_partial_6 4579#define vstore_partial_8_7 vstore_partial_7 4580#define vstore_partial_8_8 vstore_partial_8 4581#define vstore_partial_8_9 NO_STORE 4582#define vstore_partial_8_10 NO_STORE 4583#define vstore_partial_8_11 NO_STORE 4584#define vstore_partial_8_12 NO_STORE 4585#define vstore_partial_8_13 NO_STORE 4586#define vstore_partial_8_14 NO_STORE 4587#define vstore_partial_8_15 NO_STORE 4588#define vstore_partial_8_16 NO_STORE 4589 4590#define vstore_partial_16_0 NO_STORE 4591#define vstore_partial_16_1 vstore_partial_1 4592#define vstore_partial_16_2 vstore_partial_2 4593#define vstore_partial_16_3 vstore_partial_3 4594#define vstore_partial_16_4 vstore_partial_4 4595#define vstore_partial_16_5 vstore_partial_5 4596#define vstore_partial_16_6 vstore_partial_6 4597#define vstore_partial_16_7 vstore_partial_7 4598#define vstore_partial_16_8 vstore_partial_8 4599#define vstore_partial_16_9 vstore_partial_9 4600#define vstore_partial_16_10 vstore_partial_10 4601#define vstore_partial_16_11 vstore_partial_11 4602#define vstore_partial_16_12 vstore_partial_12 4603#define vstore_partial_16_13 vstore_partial_13 4604#define vstore_partial_16_14 vstore_partial_14 4605#define vstore_partial_16_15 vstore_partial_15 4606#define vstore_partial_16_16 vstore_partial_16 4607 4608 4609#define vstore_partial_1(DATA, OFFSET, PTR) \ 4610 vstore1(DATA.s0, OFFSET, PTR); 4611 4612#define vstore_partial_2(DATA, OFFSET, PTR) \ 4613 vstore2(DATA.s01, OFFSET, PTR); 4614 4615#define vstore_partial_3(DATA, OFFSET, PTR) \ 4616 vstore3(DATA.s012, OFFSET, PTR); 4617 4618#define vstore_partial_4(DATA, OFFSET, PTR) \ 4619 vstore4(DATA.s0123, OFFSET, PTR); 4620 4621#define vstore_partial_5(DATA, OFFSET, PTR) \ 4622 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4623 vstore1(DATA.s4, OFFSET, PTR + 4); 4624 4625#define vstore_partial_6(DATA, OFFSET, PTR) \ 4626 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4627 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 4628 4629#define vstore_partial_7(DATA, OFFSET, PTR) \ 4630 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 4631 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 4632 4633#define vstore_partial_8(DATA, OFFSET, PTR) \ 4634 vstore8(DATA.s01234567, OFFSET, PTR); 4635 4636#define vstore_partial_9(DATA, OFFSET, PTR) \ 4637 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4638 vstore1(DATA.s8, OFFSET, PTR + 8); 4639 4640#define vstore_partial_10(DATA, OFFSET, PTR) \ 4641 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4642 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 4643 4644#define vstore_partial_11(DATA, OFFSET, PTR) \ 4645 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4646 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 4647 4648#define vstore_partial_12(DATA, OFFSET, PTR) \ 4649 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4650 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 4651 4652#define vstore_partial_13(DATA, OFFSET, PTR) \ 4653 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4654 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 4655 4656#define vstore_partial_14(DATA, OFFSET, PTR) \ 4657 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4658 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 4659 4660#define vstore_partial_15(DATA, OFFSET, PTR) \ 4661 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 4662 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 4663 4664#define vstore_partial_16(DATA, OFFSET, PTR) \ 4665 vstore16(DATA, OFFSET, PTR); 4666 4667 4668 4669 4670 4671#define convert_float_sat convert_float 4672#define convert_float1_sat convert_float 4673#define convert_float2_sat convert_float2 4674#define convert_float3_sat convert_float3 4675#define convert_float4_sat convert_float4 4676#define convert_float8_sat convert_float8 4677#define convert_float16_sat convert_float16 4678#define convert_half_sat convert_float 4679#define convert_half1_sat convert_half 4680#define convert_half2_sat convert_half2 4681#define convert_half3_sat convert_half3 4682#define convert_half4_sat convert_half4 4683#define convert_half8_sat convert_half8 4684#define convert_half16_sat convert_half16 4685 4686#define convert_float1 convert_float 4687#define convert_half1 convert_half 4688#define convert_char1 convert_char 4689#define convert_uchar1 convert_uchar 4690#define convert_short1 convert_short 4691#define convert_ushort1 convert_ushort 4692#define convert_int1 convert_int 4693#define convert_uint1 convert_uint 4694#define convert_long1 convert_long 4695#define convert_ulong1 convert_ulong 4696#define convert_double1 convert_double 4697 4698#define convert_char1_sat convert_char_sat 4699#define convert_uchar1_sat convert_uchar_sat 4700#define convert_uchar2_sat convert_uchar2_sat 4701#define convert_uchar3_sat convert_uchar3_sat 4702#define convert_uchar4_sat convert_uchar4_sat 4703#define convert_uchar8_sat convert_uchar8_sat 4704#define convert_uchar16_sat convert_uchar16_sat 4705#define convert_short1_sat convert_short_sat 4706#define convert_ushort1_sat convert_ushort_sat 4707#define convert_int1_sat convert_int_sat 4708#define convert_uint1_sat convert_uint_sat 4709#define convert_long1_sat convert_long_sat 4710#define convert_ulong1_sat convert_ulong_sat 4711#define convert_double1_sat convert_double_sat 4712 4713#define VEC_DATA_TYPE_STR(type, size) type##size 4714#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 4715 4716#define CONVERT_STR(x, type) (convert_##type((x))) 4717#define CONVERT(x, type) CONVERT_STR(x, type) 4718 4719#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 4720#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 4721 4722#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 4723#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 4724 4725#define select_vec_dt_uchar(size) uchar##size 4726#define select_vec_dt_char(size) char##size 4727#define select_vec_dt_ushort(size) ushort##size 4728#define select_vec_dt_short(size) short##size 4729#define select_vec_dt_half(size) short##size 4730#define select_vec_dt_uint(size) uint##size 4731#define select_vec_dt_int(size) int##size 4732#define select_vec_dt_float(size) int##size 4733#define select_vec_dt_ulong(size) ulong##size 4734#define select_vec_dt_long(size) long##size 4735 4736#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 4737#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 4738#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 4739 4740#define signed_int_vec_dt_uchar(size) char##size 4741#define signed_int_vec_dt_char(size) char##size 4742#define signed_int_vec_dt_ushort(size) short##size 4743#define signed_int_vec_dt_short(size) short##size 4744#define signed_int_vec_dt_half(size) short##size 4745#define signed_int_vec_dt_uint(size) int##size 4746#define signed_int_vec_dt_int(size) int##size 4747#define signed_int_vec_dt_float(size) int##size 4748#define signed_int_vec_dt_ulong(size) long##size 4749#define signed_int_vec_dt_long(size) long##size 4750 4751#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 4752#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 4753#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 4754 4755#define sum_reduce_1(x) (x) 4756#define sum_reduce_2(x) ((x).s0) + ((x).s1) 4757#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 4758#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 4759#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 4760#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 4761 4762#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 4763#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 4764 4765#define prod_reduce_1(x) (x) 4766#define prod_reduce_2(x) ((x).s0) * ((x).s1) 4767#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 4768#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 4769#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 4770#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 4771 4772#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 4773#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 4774 4775#define max_reduce_1(x) (x) 4776#define max_reduce_2(x) max(((x).s0), ((x).s1)) 4777#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 4778#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 4779#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 4780#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 4781 4782#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 4783#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 4784 4785#define VECTOR_DECLARATION(name) \ 4786 __global uchar *name##_ptr, \ 4787 uint name##_stride_x, \ 4788 uint name##_step_x, \ 4789 uint name##_offset_first_element_in_bytes 4790 4791#define IMAGE_DECLARATION(name) \ 4792 __global uchar *name##_ptr, \ 4793 uint name##_stride_x, \ 4794 uint name##_step_x, \ 4795 uint name##_stride_y, \ 4796 uint name##_step_y, \ 4797 uint name##_offset_first_element_in_bytes 4798 4799#define TENSOR3D_DECLARATION(name) \ 4800 __global uchar *name##_ptr, \ 4801 uint name##_stride_x, \ 4802 uint name##_step_x, \ 4803 uint name##_stride_y, \ 4804 uint name##_step_y, \ 4805 uint name##_stride_z, \ 4806 uint name##_step_z, \ 4807 uint name##_offset_first_element_in_bytes 4808 4809#define TENSOR4D_DECLARATION(name) \ 4810 __global uchar *name##_ptr, \ 4811 uint name##_stride_x, \ 4812 uint name##_step_x, \ 4813 uint name##_stride_y, \ 4814 uint name##_step_y, \ 4815 uint name##_stride_z, \ 4816 uint name##_step_z, \ 4817 uint name##_stride_w, \ 4818 uint name##_step_w, \ 4819 uint name##_offset_first_element_in_bytes 4820 4821#define TENSOR5D_DECLARATION(name) \ 4822 __global uchar *name##_ptr, \ 4823 uint name##_stride_x, \ 4824 uint name##_step_x, \ 4825 uint name##_stride_y, \ 4826 uint name##_step_y, \ 4827 uint name##_stride_z, \ 4828 uint name##_step_z, \ 4829 uint name##_stride_w, \ 4830 uint name##_step_w, \ 4831 uint name##_stride_v, \ 4832 uint name##_step_v, \ 4833 uint name##_offset_first_element_in_bytes 4834 4835#define CONVERT_TO_VECTOR_STRUCT(name) \ 4836 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 4837 4838#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 4839 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 4840 4841#define CONVERT_TO_IMAGE_STRUCT(name) \ 4842 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 4843 4844#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 4845 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 4846 4847#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4848 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4849 4850#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 4851 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 4852 4853#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4854 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4855 4856#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 4857 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4858 name##_stride_z, name##_step_z) 4859 4860#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 4861 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 4862 4863#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 4864 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4865 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 4866 4867#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 4868 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 4869 4870#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 4871 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4872 name##_stride_z, name##_step_z) 4873 4874 4875typedef struct Vector 4876{ 4877 __global uchar *ptr; 4878 int offset_first_element_in_bytes; 4879 int stride_x; 4880} Vector; 4881 4882 4883typedef struct Image 4884{ 4885 __global uchar *ptr; 4886 int offset_first_element_in_bytes; 4887 int stride_x; 4888 int stride_y; 4889} Image; 4890 4891 4892typedef struct Tensor3D 4893{ 4894 __global uchar *ptr; 4895 int offset_first_element_in_bytes; 4896 int stride_x; 4897 int stride_y; 4898 int stride_z; 4899} Tensor3D; 4900 4901 4902typedef struct Tensor4D 4903{ 4904 __global uchar *ptr; 4905 int offset_first_element_in_bytes; 4906 int stride_x; 4907 int stride_y; 4908 int stride_z; 4909 int stride_w; 4910} Tensor4D; 4911 4912 4913inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 4914{ 4915 Vector vector = 4916 { 4917 .ptr = ptr, 4918 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4919 .stride_x = stride_x, 4920 }; 4921 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 4922 return vector; 4923} 4924 4925 4926inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 4927{ 4928 Image img = 4929 { 4930 .ptr = ptr, 4931 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4932 .stride_x = stride_x, 4933 .stride_y = stride_y 4934 }; 4935 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 4936 return img; 4937} 4938 4939 4940inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4941{ 4942 Image img = 4943 { 4944 .ptr = ptr, 4945 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4946 .stride_x = stride_x, 4947 .stride_y = stride_y 4948 }; 4949 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4950 return img; 4951} 4952 4953 4954inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4955{ 4956 Tensor3D tensor = 4957 { 4958 .ptr = ptr, 4959 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4960 .stride_x = stride_x, 4961 .stride_y = stride_y, 4962 .stride_z = stride_z 4963 }; 4964 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4965 return tensor; 4966} 4967 4968 4969inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4970{ 4971 Tensor3D tensor = 4972 { 4973 .ptr = ptr, 4974 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4975 .stride_x = stride_x, 4976 .stride_y = stride_y, 4977 .stride_z = stride_z 4978 }; 4979 return tensor; 4980} 4981 4982inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 4983 uint step_w, 4984 uint mod_size) 4985{ 4986 Tensor4D tensor = 4987 { 4988 .ptr = ptr, 4989 .offset_first_element_in_bytes = offset_first_element_in_bytes, 4990 .stride_x = stride_x, 4991 .stride_y = stride_y, 4992 .stride_z = stride_z, 4993 .stride_w = stride_w 4994 }; 4995 4996 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 4997 return tensor; 4998} 4999 5000 5001inline __global const uchar *vector_offset(const Vector *vec, int x) 5002{ 5003 return vec->ptr + x * vec->stride_x; 5004} 5005 5006 5007inline __global uchar *offset(const Image *img, int x, int y) 5008{ 5009 return img->ptr + x * img->stride_x + y * img->stride_y; 5010} 5011 5012 5013inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 5014{ 5015 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 5016} 5017 5018 5019inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 5020{ 5021 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 5022} 5023 5024 5025inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 5026{ 5027 uint num_elements = width * height; 5028 5029 const uint z = index / num_elements; 5030 5031 index %= num_elements; 5032 5033 const uint y = index / width; 5034 5035 index %= width; 5036 5037 const uint x = index; 5038 5039 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 5040} 5041 5042#endif 5043 5044 5045#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) 5046#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) 5047 5048 5049inline uchar quantize_qasymm8(float input, float offset, float scale) 5050{ 5051 float out_f32 = input / scale + offset; 5052 uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); 5053 return res_u8; 5054} 5055 5056 5057inline float dequantize_qasymm8(uchar input, float offset, float scale) 5058{ 5059 return ((float)input - offset) * scale; 5060} 5061 5062 5063inline float dequantize_qasymm8_signed(char input, float offset, float scale) 5064{ 5065 return ((float)input - offset) * scale; 5066} 5067 5068 5069#define QUANTIZE_IMPL(type, size) \ 5070 inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ 5071 { \ 5072 VEC_DATA_TYPE(float, size) \ 5073 out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ 5074 VEC_DATA_TYPE(type, size) \ 5075 res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ 5076 return res; \ 5077 } 5078 5079 5080#define DEQUANTIZE_IMPL(type, size) \ 5081 inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \ 5082 { \ 5083 return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \ 5084 } 5085 5086 5087#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ 5088 inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \ 5089 { \ 5090 const VEC_DATA_TYPE(int, size) \ 5091 zero = (VEC_DATA_TYPE(int, size))0; \ 5092 const VEC_DATA_TYPE(int, size) \ 5093 one = (VEC_DATA_TYPE(int, size))1; \ 5094 VEC_DATA_TYPE(int, size) \ 5095 mask = (one << exponent) - one; \ 5096 VEC_DATA_TYPE(int, size) \ 5097 threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \ 5098 return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \ 5099 } 5100 5101 5102#define ASYMM_MULT_IMPL(size) \ 5103 inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 5104 { \ 5105 VEC_DATA_TYPE(int, size) \ 5106 overflow = a == b && a == INT_MIN; \ 5107 VEC_DATA_TYPE(long, size) \ 5108 a_64 = convert_long##size(a); \ 5109 VEC_DATA_TYPE(long, size) \ 5110 b_64 = convert_long##size(b); \ 5111 VEC_DATA_TYPE(long, size) \ 5112 ab_64 = a_64 * b_64; \ 5113 \ 5114 VEC_DATA_TYPE(long, size) \ 5115 mask1 = 1 << 30; \ 5116 VEC_DATA_TYPE(long, size) \ 5117 mask2 = 1 - (1 << 30); \ 5118 VEC_DATA_TYPE(long, size) \ 5119 is_positive_or_zero = ab_64 >= 0; \ 5120 VEC_DATA_TYPE(long, size) \ 5121 nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \ 5122 VEC_DATA_TYPE(long, size) \ 5123 mask = 1ll << 31; \ 5124 VEC_DATA_TYPE(int, size) \ 5125 ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \ 5126 return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \ 5127 } 5128 5129 5130#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ 5131 inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \ 5132 { \ 5133 const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ 5134 const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ 5135 const int k_fractional_bits = 31; \ 5136 VEC_DATA_TYPE(int, size) \ 5137 x = a + (1 << (k_fractional_bits - 3)); \ 5138 VEC_DATA_TYPE(int, size) \ 5139 x2 = ASYMM_MULT(x, x, size); \ 5140 VEC_DATA_TYPE(int, size) \ 5141 x3 = ASYMM_MULT(x2, x, size); \ 5142 VEC_DATA_TYPE(int, size) \ 5143 x4 = ASYMM_MULT(x2, x2, size); \ 5144 VEC_DATA_TYPE(int, size) \ 5145 x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ 5146 VEC_DATA_TYPE(int, size) \ 5147 x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ 5148 VEC_DATA_TYPE(int, size) \ 5149 x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ 5150 return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ 5151 } 5152 5153 5154#define ASYMM_SELECT_USING_MASK_IMPL(size) \ 5155 inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \ 5156 { \ 5157 return (if_mask & then_val) ^ (~if_mask & else_val); \ 5158 } 5159 5160 5161#define ASYMM_MASK_IF_ZERO_IMPL(size) \ 5162 inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ 5163 { \ 5164 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 5165 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 5166 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \ 5167 } 5168 5169 5170#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ 5171 inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ 5172 { \ 5173 const VEC_DATA_TYPE(int, size) all_zeros = 0; \ 5174 const VEC_DATA_TYPE(int, size) all_ones = ~0; \ 5175 return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \ 5176 } 5177 5178#define EXP_BARREL_SHIFTER_IMPL(size) \ 5179 inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ 5180 { \ 5181 if(k_integer_bits > exponent) \ 5182 { \ 5183 const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ 5184 return ASYMM_SELECT_USING_MASK( \ 5185 ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ 5186 ASYMM_MULT(result, fp_multiplier, size), result, size); \ 5187 } \ 5188 \ 5189 return result; \ 5190 } 5191 5192 5193#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ 5194 inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ 5195 { \ 5196 const int k_fractional_bits = 31 - k_integer_bits; \ 5197 VEC_DATA_TYPE(int, size) \ 5198 k_one_quarter = 1 << (k_fractional_bits - 2); \ 5199 VEC_DATA_TYPE(int, size) \ 5200 mask = k_one_quarter - 1; \ 5201 VEC_DATA_TYPE(int, size) \ 5202 a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ 5203 VEC_DATA_TYPE(int, size) \ 5204 a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ 5205 VEC_DATA_TYPE(int, size) \ 5206 result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \ 5207 VEC_DATA_TYPE(int, size) \ 5208 remainder = a_mod_quarter_minus_one_quarter - a; \ 5209 \ 5210 result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \ 5211 result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \ 5212 result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \ 5213 result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \ 5214 result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \ 5215 result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \ 5216 result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ 5217 \ 5218 if(k_integer_bits > 5) \ 5219 { \ 5220 const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ 5221 result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ 5222 } \ 5223 \ 5224 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 5225 return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ 5226 } 5227 5228 5229#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ 5230 inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ 5231 { \ 5232 if(exponent < 0) \ 5233 { \ 5234 return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ 5235 } \ 5236 \ 5237 const VEC_DATA_TYPE(int, size) min = INT_MIN; \ 5238 const VEC_DATA_TYPE(int, size) max = INT_MAX; \ 5239 int threshold = ((1 << (31 - exponent)) - 1); \ 5240 VEC_DATA_TYPE(int, size) \ 5241 positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ 5242 VEC_DATA_TYPE(int, size) \ 5243 negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ 5244 VEC_DATA_TYPE(int, size) \ 5245 result = x << exponent; \ 5246 result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ 5247 result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ 5248 return result; \ 5249 } 5250 5251 5252#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ 5253 inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ 5254 { \ 5255 VEC_DATA_TYPE(long, size) \ 5256 a64 = convert_long##size(a); \ 5257 VEC_DATA_TYPE(long, size) \ 5258 b64 = convert_long##size(b); \ 5259 VEC_DATA_TYPE(long, size) \ 5260 sum = a64 + b64; \ 5261 const VEC_DATA_TYPE(long, size) one = 1; \ 5262 const VEC_DATA_TYPE(long, size) minus_one = -1; \ 5263 VEC_DATA_TYPE(long, size) \ 5264 sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \ 5265 return convert_int##size((sum + sign) / 2); \ 5266 } 5267 5268 5269#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ 5270 inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ 5271 { \ 5272 const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ 5273 const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ 5274 VEC_DATA_TYPE(int, size) \ 5275 half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ 5276 const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ 5277 const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ 5278 VEC_DATA_TYPE(int, size) \ 5279 x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ 5280 for(int i = 0; i < 3; i++) \ 5281 { \ 5282 VEC_DATA_TYPE(int, size) \ 5283 half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ 5284 VEC_DATA_TYPE(int, size) \ 5285 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ 5286 VEC_DATA_TYPE(int, size) \ 5287 tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ 5288 x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ 5289 } \ 5290 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ 5291 } 5292 5293 5294#define ASYMM_RESCALE_IMPL(size) \ 5295 inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \ 5296 { \ 5297 int exponent = src_integer_bits - dst_integer_bits; \ 5298 return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ 5299 } 5300 5301#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale) 5302#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size) 5303#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale) 5304#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size) 5305 5306#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent) 5307#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) 5308#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b) 5309#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size) 5310#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \ 5311 ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size) 5312#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ 5313 ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) 5314#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) 5315#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val) 5316#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) 5317#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) 5318#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder) 5319#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits) 5320#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) 5321#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a) 5322#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) 5323#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent) 5324#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) 5325#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits) 5326#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) 5327 5328#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \ 5329 inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \ 5330 { \ 5331 const int left_shift = shift > 0 ? shift : 0; \ 5332 const int right_shift = shift > 0 ? 0 : -shift; \ 5333 return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \ 5334 } 5335#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift) 5336 5337QUANTIZE_IMPL(uchar, 1) 5338QUANTIZE_IMPL(char, 1) 5339QUANTIZE_IMPL(uint, 1) 5340QUANTIZE_IMPL(int, 1) 5341QUANTIZE_IMPL(uchar, 2) 5342QUANTIZE_IMPL(char, 2) 5343QUANTIZE_IMPL(uint, 2) 5344QUANTIZE_IMPL(int, 2) 5345QUANTIZE_IMPL(uchar, 3) 5346QUANTIZE_IMPL(char, 3) 5347QUANTIZE_IMPL(uint, 3) 5348QUANTIZE_IMPL(int, 3) 5349QUANTIZE_IMPL(uchar, 4) 5350QUANTIZE_IMPL(ushort, 4) 5351QUANTIZE_IMPL(short, 4) 5352QUANTIZE_IMPL(int, 4) 5353QUANTIZE_IMPL(uchar, 8) 5354QUANTIZE_IMPL(char, 8) 5355QUANTIZE_IMPL(uint, 8) 5356QUANTIZE_IMPL(int, 8) 5357QUANTIZE_IMPL(uchar, 16) 5358QUANTIZE_IMPL(char, 16) 5359QUANTIZE_IMPL(ushort, 16) 5360QUANTIZE_IMPL(short, 16) 5361QUANTIZE_IMPL(uint, 16) 5362QUANTIZE_IMPL(int, 16) 5363 5364DEQUANTIZE_IMPL(uchar, 1) 5365DEQUANTIZE_IMPL(char, 1) 5366DEQUANTIZE_IMPL(uint, 1) 5367DEQUANTIZE_IMPL(int, 1) 5368DEQUANTIZE_IMPL(uchar, 2) 5369DEQUANTIZE_IMPL(char, 2) 5370DEQUANTIZE_IMPL(uint, 2) 5371DEQUANTIZE_IMPL(int, 2) 5372DEQUANTIZE_IMPL(uchar, 3) 5373DEQUANTIZE_IMPL(char, 3) 5374DEQUANTIZE_IMPL(uint, 3) 5375DEQUANTIZE_IMPL(int, 3) 5376DEQUANTIZE_IMPL(uchar, 4) 5377DEQUANTIZE_IMPL(ushort, 4) 5378DEQUANTIZE_IMPL(short, 4) 5379DEQUANTIZE_IMPL(int, 4) 5380DEQUANTIZE_IMPL(uchar, 8) 5381DEQUANTIZE_IMPL(char, 8) 5382DEQUANTIZE_IMPL(uint, 8) 5383DEQUANTIZE_IMPL(int, 8) 5384DEQUANTIZE_IMPL(uchar, 16) 5385DEQUANTIZE_IMPL(char, 16) 5386DEQUANTIZE_IMPL(ushort, 16) 5387DEQUANTIZE_IMPL(short, 16) 5388DEQUANTIZE_IMPL(uint, 16) 5389DEQUANTIZE_IMPL(int, 16) 5390 5391ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1) 5392ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) 5393ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3) 5394ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) 5395ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) 5396ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) 5397 5398ASYMM_MULT_IMPL(1) 5399ASYMM_MULT_IMPL(2) 5400ASYMM_MULT_IMPL(3) 5401ASYMM_MULT_IMPL(4) 5402ASYMM_MULT_IMPL(8) 5403ASYMM_MULT_IMPL(16) 5404 5405ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1) 5406ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) 5407ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3) 5408ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) 5409ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) 5410ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) 5411 5412ASYMM_SELECT_USING_MASK_IMPL(1) 5413ASYMM_SELECT_USING_MASK_IMPL(2) 5414ASYMM_SELECT_USING_MASK_IMPL(3) 5415ASYMM_SELECT_USING_MASK_IMPL(4) 5416ASYMM_SELECT_USING_MASK_IMPL(8) 5417ASYMM_SELECT_USING_MASK_IMPL(16) 5418 5419ASYMM_MASK_IF_ZERO_IMPL(1) 5420ASYMM_MASK_IF_ZERO_IMPL(2) 5421ASYMM_MASK_IF_ZERO_IMPL(3) 5422ASYMM_MASK_IF_ZERO_IMPL(4) 5423ASYMM_MASK_IF_ZERO_IMPL(8) 5424ASYMM_MASK_IF_ZERO_IMPL(16) 5425 5426ASYMM_MASK_IF_NON_ZERO_IMPL(1) 5427ASYMM_MASK_IF_NON_ZERO_IMPL(2) 5428ASYMM_MASK_IF_NON_ZERO_IMPL(3) 5429ASYMM_MASK_IF_NON_ZERO_IMPL(4) 5430ASYMM_MASK_IF_NON_ZERO_IMPL(8) 5431ASYMM_MASK_IF_NON_ZERO_IMPL(16) 5432 5433EXP_BARREL_SHIFTER_IMPL(1) 5434EXP_BARREL_SHIFTER_IMPL(2) 5435EXP_BARREL_SHIFTER_IMPL(3) 5436EXP_BARREL_SHIFTER_IMPL(4) 5437EXP_BARREL_SHIFTER_IMPL(8) 5438EXP_BARREL_SHIFTER_IMPL(16) 5439 5440ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1) 5441ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) 5442ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3) 5443ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) 5444ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) 5445ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) 5446 5447ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1) 5448ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) 5449ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3) 5450ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) 5451ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) 5452ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) 5453 5454ASYMM_ROUNDING_HALF_SUM_IMPL(1) 5455ASYMM_ROUNDING_HALF_SUM_IMPL(2) 5456ASYMM_ROUNDING_HALF_SUM_IMPL(3) 5457ASYMM_ROUNDING_HALF_SUM_IMPL(4) 5458ASYMM_ROUNDING_HALF_SUM_IMPL(8) 5459ASYMM_ROUNDING_HALF_SUM_IMPL(16) 5460 5461ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1) 5462ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) 5463ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3) 5464ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) 5465ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) 5466ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) 5467 5468ASYMM_RESCALE_IMPL(1) 5469ASYMM_RESCALE_IMPL(2) 5470ASYMM_RESCALE_IMPL(3) 5471ASYMM_RESCALE_IMPL(4) 5472ASYMM_RESCALE_IMPL(8) 5473ASYMM_RESCALE_IMPL(16) 5474 5475MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1) 5476MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2) 5477MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3) 5478MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4) 5479MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8) 5480MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16) 5481 5482#endif 5483 5484#ifndef ARM_COMPUTE_REPEAT_H 5485#define ARM_COMPUTE_REPEAT_H 5486 5487 5488#ifndef ARM_COMPUTE_HELPER_H 5489#define ARM_COMPUTE_HELPER_H 5490 5491 5492 5493 5494#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5495 VSTORE(N0) \ 5496 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5497 5498#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5499 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5500 VSTORE(N0) \ 5501 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5502 5503#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5504 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5505 VSTORE(N0) \ 5506 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5507 5508#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5509 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5510 VSTORE(N0) \ 5511 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5512 5513#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5514 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5515 VSTORE(N0) \ 5516 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5517 5518#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5519 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5520 VSTORE(N0) \ 5521 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5522 5523#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5524 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5525 VSTORE(N0) \ 5526 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5527 5528#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5529 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5530 VSTORE(N0) \ 5531 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5532 5533#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5534 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5535 VSTORE(N0) \ 5536 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5537 5538#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5539 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5540 VSTORE(N0) \ 5541 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5542 5543#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5544 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5545 VSTORE(N0) \ 5546 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5547 5548#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5549 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5550 VSTORE(N0) \ 5551 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5552 5553#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5554 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5555 VSTORE(N0) \ 5556 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5557 5558#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5559 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5560 VSTORE(N0) \ 5561 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5562 5563#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5564 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5565 VSTORE(N0) \ 5566 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5567 5568#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5569 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5570 VSTORE(N0) \ 5571 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5572 5573 5574 5575#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5576 VSTORE(N0) \ 5577 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5578 5579#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5580 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5581 VSTORE(N0) \ 5582 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5583 5584#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5585 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5586 VSTORE(N0) \ 5587 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5588 5589#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5590 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5591 VSTORE(N0) \ 5592 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5593 5594#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5595 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5596 VSTORE(N0) \ 5597 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5598 5599#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5600 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5601 VSTORE(N0) \ 5602 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5603 5604#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5605 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5606 VSTORE(N0) \ 5607 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5608 5609#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5610 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5611 VSTORE(N0) \ 5612 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5613 5614#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5615 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5616 VSTORE(N0) \ 5617 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5618 5619#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 5620 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5621 VSTORE(N0) \ 5622 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5623 5624#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5625 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5626 VSTORE(N0) \ 5627 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5628 5629#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5630 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5631 VSTORE(N0) \ 5632 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5633 5634#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5635 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5636 VSTORE(N0) \ 5637 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5638 5639#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5640 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5641 VSTORE(N0) \ 5642 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5643 5644#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5645 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5646 VSTORE(N0) \ 5647 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5648 5649#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5650 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5651 VSTORE(N0) \ 5652 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5653 5654 5655 5656 5657#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5658#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5659 5660 5661 5662#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5663#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5664 5665 5666 5667#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5668 VSTORE_PARTIAL(N0, STORE_N0) \ 5669 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 5670 5671#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5672 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5673 VSTORE_PARTIAL(N0, STORE_N0) \ 5674 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 5675 5676#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5677 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5678 VSTORE_PARTIAL(N0, STORE_N0) \ 5679 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 5680 5681#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5682 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5683 VSTORE_PARTIAL(N0, STORE_N0) \ 5684 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 5685 5686#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5687 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5688 VSTORE_PARTIAL(N0, STORE_N0) \ 5689 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 5690 5691#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5692 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5693 VSTORE_PARTIAL(N0, STORE_N0) \ 5694 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 5695 5696#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5697 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5698 VSTORE_PARTIAL(N0, STORE_N0) \ 5699 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 5700 5701#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5702 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5703 VSTORE_PARTIAL(N0, STORE_N0) \ 5704 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 5705 5706#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5707 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5708 VSTORE_PARTIAL(N0, STORE_N0) \ 5709 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 5710 5711#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5712 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5713 VSTORE_PARTIAL(N0, STORE_N0) \ 5714 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 5715 5716#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5717 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5718 VSTORE_PARTIAL(N0, STORE_N0) \ 5719 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 5720 5721#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5722 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5723 VSTORE_PARTIAL(N0, STORE_N0) \ 5724 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 5725 5726#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5727 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5728 VSTORE_PARTIAL(N0, STORE_N0) \ 5729 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 5730 5731#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5732 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5733 VSTORE_PARTIAL(N0, STORE_N0) \ 5734 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 5735 5736#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5737 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5738 VSTORE_PARTIAL(N0, STORE_N0) \ 5739 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 5740 5741#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5742 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 5743 VSTORE_PARTIAL(N0, STORE_N0) \ 5744 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 5745 5746 5747 5748#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5749#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5750 5751#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5752 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 5753 { \ 5754 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5755 } \ 5756 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 5757 { \ 5758 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5759 } \ 5760 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 5761 { \ 5762 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5763 } \ 5764 else \ 5765 { \ 5766 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5767 } 5768 5769#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 5770 if(!(PARTIAL_COND_X)) \ 5771 { \ 5772 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5773 } \ 5774 else \ 5775 { \ 5776 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5777 } 5778 5779#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 5780 if(!(PARTIAL_COND_Y)) \ 5781 { \ 5782 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5783 } \ 5784 else \ 5785 { \ 5786 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 5787 } 5788 5789 5790#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 5791 5792 5793#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 5794 5795#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5796 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 5797 5798#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 5799 5800#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5801 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 5802 5803#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 5804 5805#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5806 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 5807 5808#else 5809 5810#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 5811 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 5812 5813#endif 5814 5815#endif 5816 5817 5818#if defined(PARTIAL_STORE_M0) 5819 5820#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 5821 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 5822#else 5823#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 5824 ((uint)(y * M0)) 5825#endif 5826 5827 5828 5829#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 5830 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 5831 5832 5833#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 5834#pragma OPENCL EXTENSION cl_khr_fp16 : enable 5835#endif 5836 5837#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 5838#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 5839#endif 5840 5841#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 5842#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 5843#endif 5844 5845#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 5846#pragma OPENCL EXTENSION cl_arm_printf : enable 5847#endif 5848 5849#define GPU_ARCH_MIDGARD 0x100 5850#define GPU_ARCH_BIFROST 0x200 5851#define GPU_ARCH_VALHALL 0x300 5852 5853 5854#define CONCAT(a, b) a##b 5855 5856 5857#define EXPAND(x) x 5858 5859 5860#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 5861 5862 5863#define REV1(x) ((x)) 5864#define REV2(x) ((x).s10) 5865#define REV3(x) ((x).s210) 5866#define REV4(x) ((x).s3210) 5867#define REV8(x) ((x).s76543210) 5868#define REV16(x) ((x).sFEDCBA9876543210) 5869 5870 5871 5872#define REVERSE_STR(x, s) REV##s((x)) 5873#define REVERSE(x, s) REVERSE_STR(x, s) 5874 5875 5876 5877#define ROT1_0(x) ((x)) 5878#define ROT1_1(x) ((x)) 5879 5880#define ROT2_0(x) ((x)) 5881#define ROT2_1(x) ((x).s10) 5882#define ROT2_2(x) ((x)) 5883 5884#define ROT3_0(x) ((x)) 5885#define ROT3_1(x) ((x).s201) 5886#define ROT3_2(x) ((x).s120) 5887#define ROT3_3(x) ((x)) 5888 5889#define ROT4_0(x) ((x)) 5890#define ROT4_1(x) ((x).s3012) 5891#define ROT4_2(x) ((x).s2301) 5892#define ROT4_3(x) ((x).s1230) 5893#define ROT4_4(x) ((x)) 5894 5895#define ROT8_0(x) ((x)) 5896#define ROT8_1(x) ((x).s70123456) 5897#define ROT8_2(x) ((x).s67012345) 5898#define ROT8_3(x) ((x).s56701234) 5899#define ROT8_4(x) ((x).s45670123) 5900#define ROT8_5(x) ((x).s34567012) 5901#define ROT8_6(x) ((x).s23456701) 5902#define ROT8_7(x) ((x).s12345670) 5903#define ROT8_8(x) ((x)) 5904 5905#define ROT16_0(x) ((x)) 5906#define ROT16_1(x) ((x).sF0123456789ABCDE) 5907#define ROT16_2(x) ((x).sEF0123456789ABCD) 5908#define ROT16_3(x) ((x).sDEF0123456789ABC) 5909#define ROT16_4(x) ((x).sCDEF0123456789AB) 5910#define ROT16_5(x) ((x).sBCDEF0123456789A) 5911#define ROT16_6(x) ((x).sABCDEF0123456789) 5912#define ROT16_7(x) ((x).s9ABCDEF012345678) 5913#define ROT16_8(x) ((x).s89ABCDEF01234567) 5914#define ROT16_9(x) ((x).s789ABCDEF0123456) 5915#define ROT16_10(x) ((x).s6789ABCDEF012345) 5916#define ROT16_11(x) ((x).s56789ABCDEF01234) 5917#define ROT16_12(x) ((x).s456789ABCDEF0123) 5918#define ROT16_13(x) ((x).s3456789ABCDEF012) 5919#define ROT16_14(x) ((x).s23456789ABCDEF01) 5920#define ROT16_15(x) ((x).s123456789ABCDEF0) 5921#define ROT16_16(x) ((x)) 5922 5923 5924 5925#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 5926#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 5927 5928 5929 5930#define V_OFFS1(dt) (dt##1)(0) 5931#define V_OFFS2(dt) (dt##2)(0, 1) 5932#define V_OFFS3(dt) (dt##3)(0, 1, 2) 5933#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 5934#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 5935#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 5936 5937 5938 5939#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 5940#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 5941 5942 5943#define VLOAD_STR(size) vload##size 5944#define VLOAD(size) VLOAD_STR(size) 5945 5946 5947#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 5948#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 5949 5950#define NO_LOAD(data, offs, ptr) \ 5951 { \ 5952 } 5953 5954 5955#define vload_partial_1_0 NO_LOAD 5956#define vload_partial_1_1 vload1 5957#define vload_partial_1_2 NO_LOAD 5958#define vload_partial_1_3 NO_LOAD 5959#define vload_partial_1_4 NO_LOAD 5960#define vload_partial_1_5 NO_LOAD 5961#define vload_partial_1_6 NO_LOAD 5962#define vload_partial_1_7 NO_LOAD 5963#define vload_partial_1_8 NO_LOAD 5964#define vload_partial_1_9 NO_LOAD 5965#define vload_partial_1_10 NO_LOAD 5966#define vload_partial_1_11 NO_LOAD 5967#define vload_partial_1_12 NO_LOAD 5968#define vload_partial_1_13 NO_LOAD 5969#define vload_partial_1_14 NO_LOAD 5970#define vload_partial_1_15 NO_LOAD 5971#define vload_partial_1_16 NO_LOAD 5972 5973#define vload_partial_2_0 NO_LOAD 5974#define vload_partial_2_1 vload_partial_1 5975#define vload_partial_2_2 vload_partial_2 5976#define vload_partial_2_3 NO_LOAD 5977#define vload_partial_2_4 NO_LOAD 5978#define vload_partial_2_5 NO_LOAD 5979#define vload_partial_2_6 NO_LOAD 5980#define vload_partial_2_7 NO_LOAD 5981#define vload_partial_2_8 NO_LOAD 5982#define vload_partial_2_9 NO_LOAD 5983#define vload_partial_2_10 NO_LOAD 5984#define vload_partial_2_11 NO_LOAD 5985#define vload_partial_2_12 NO_LOAD 5986#define vload_partial_2_13 NO_LOAD 5987#define vload_partial_2_14 NO_LOAD 5988#define vload_partial_2_15 NO_LOAD 5989#define vload_partial_2_16 NO_LOAD 5990 5991#define vload_partial_3_0 NO_LOAD 5992#define vload_partial_3_1 vload_partial_1 5993#define vload_partial_3_2 vload_partial_2 5994#define vload_partial_3_3 vload_partial_3 5995#define vload_partial_3_4 NO_LOAD 5996#define vload_partial_3_5 NO_LOAD 5997#define vload_partial_3_6 NO_LOAD 5998#define vload_partial_3_7 NO_LOAD 5999#define vload_partial_3_8 NO_LOAD 6000#define vload_partial_3_9 NO_LOAD 6001#define vload_partial_3_10 NO_LOAD 6002#define vload_partial_3_11 NO_LOAD 6003#define vload_partial_3_12 NO_LOAD 6004#define vload_partial_3_13 NO_LOAD 6005#define vload_partial_3_14 NO_LOAD 6006#define vload_partial_3_15 NO_LOAD 6007#define vload_partial_3_16 NO_LOAD 6008 6009#define vload_partial_4_0 NO_LOAD 6010#define vload_partial_4_1 vload_partial_1 6011#define vload_partial_4_2 vload_partial_2 6012#define vload_partial_4_3 vload_partial_3 6013#define vload_partial_4_4 vload_partial_4 6014#define vload_partial_4_5 NO_LOAD 6015#define vload_partial_4_6 NO_LOAD 6016#define vload_partial_4_7 NO_LOAD 6017#define vload_partial_4_8 NO_LOAD 6018#define vload_partial_4_9 NO_LOAD 6019#define vload_partial_4_10 NO_LOAD 6020#define vload_partial_4_11 NO_LOAD 6021#define vload_partial_4_12 NO_LOAD 6022#define vload_partial_4_13 NO_LOAD 6023#define vload_partial_4_14 NO_LOAD 6024#define vload_partial_4_15 NO_LOAD 6025#define vload_partial_4_16 NO_LOAD 6026 6027#define vload_partial_8_0 NO_LOAD 6028#define vload_partial_8_1 vload_partial_1 6029#define vload_partial_8_2 vload_partial_2 6030#define vload_partial_8_3 vload_partial_3 6031#define vload_partial_8_4 vload_partial_4 6032#define vload_partial_8_5 vload_partial_5 6033#define vload_partial_8_6 vload_partial_6 6034#define vload_partial_8_7 vload_partial_7 6035#define vload_partial_8_8 vload_partial_8 6036#define vload_partial_8_9 NO_LOAD 6037#define vload_partial_8_10 NO_LOAD 6038#define vload_partial_8_11 NO_LOAD 6039#define vload_partial_8_12 NO_LOAD 6040#define vload_partial_8_13 NO_LOAD 6041#define vload_partial_8_14 NO_LOAD 6042#define vload_partial_8_15 NO_LOAD 6043#define vload_partial_8_16 NO_LOAD 6044 6045#define vload_partial_16_0 NO_LOAD 6046#define vload_partial_16_1 vload_partial_1 6047#define vload_partial_16_2 vload_partial_2 6048#define vload_partial_16_3 vload_partial_3 6049#define vload_partial_16_4 vload_partial_4 6050#define vload_partial_16_5 vload_partial_5 6051#define vload_partial_16_6 vload_partial_6 6052#define vload_partial_16_7 vload_partial_7 6053#define vload_partial_16_8 vload_partial_8 6054#define vload_partial_16_9 vload_partial_9 6055#define vload_partial_16_10 vload_partial_10 6056#define vload_partial_16_11 vload_partial_11 6057#define vload_partial_16_12 vload_partial_12 6058#define vload_partial_16_13 vload_partial_13 6059#define vload_partial_16_14 vload_partial_14 6060#define vload_partial_16_15 vload_partial_15 6061#define vload_partial_16_16 vload_partial_16 6062 6063 6064#define vload_partial_1(DATA, OFFSET, PTR) \ 6065 DATA.s0 = vload1(OFFSET, PTR); 6066 6067#define vload_partial_2(DATA, OFFSET, PTR) \ 6068 DATA.s01 = vload2(OFFSET, PTR); 6069 6070#define vload_partial_3(DATA, OFFSET, PTR) \ 6071 DATA.s012 = vload3(OFFSET, PTR); 6072 6073#define vload_partial_4(DATA, OFFSET, PTR) \ 6074 DATA.s0123 = vload4(OFFSET, PTR); 6075 6076#define vload_partial_5(DATA, OFFSET, PTR) \ 6077 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 6078 DATA.s4 = vload1(OFFSET, PTR + 4); 6079 6080#define vload_partial_6(DATA, OFFSET, PTR) \ 6081 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 6082 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 6083 6084#define vload_partial_7(DATA, OFFSET, PTR) \ 6085 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 6086 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 6087 6088#define vload_partial_8(DATA, OFFSET, PTR) \ 6089 DATA.s01234567 = vload8(OFFSET, PTR); 6090 6091#define vload_partial_9(DATA, OFFSET, PTR) \ 6092 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6093 DATA.s8 = vload1(OFFSET, PTR + 8); 6094 6095#define vload_partial_10(DATA, OFFSET, PTR) \ 6096 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6097 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 6098 6099#define vload_partial_11(DATA, OFFSET, PTR) \ 6100 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6101 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 6102 6103#define vload_partial_12(DATA, OFFSET, PTR) \ 6104 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6105 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 6106 6107#define vload_partial_13(DATA, OFFSET, PTR) \ 6108 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6109 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 6110 6111#define vload_partial_14(DATA, OFFSET, PTR) \ 6112 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6113 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 6114 6115#define vload_partial_15(DATA, OFFSET, PTR) \ 6116 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 6117 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 6118 6119#define vload_partial_16(DATA, OFFSET, PTR) \ 6120 DATA = vload16(OFFSET, PTR); 6121 6122 6123 6124#define PIXEL_UNIT4 1 6125#define PIXEL_UNIT8 2 6126#define PIXEL_UNIT16 4 6127 6128 6129#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 6130#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 6131 6132 6133#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 6134#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 6135#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 6136 6137#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 6138#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 6139#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 6140#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 6141#endif 6142 6143#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 6144#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 6145#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 6146 6147#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 6148#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 6149#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 6150#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 6151#endif 6152 6153 6154#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 6155#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 6156 6157 6158#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 6159#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 6160 6161#define VSTORE_STR(size) vstore##size 6162#define VSTORE(size) VSTORE_STR(size) 6163 6164#define float1 float 6165#define half1 half 6166#define char1 char 6167#define uchar1 uchar 6168#define short1 short 6169#define ushort1 ushort 6170#define int1 int 6171#define uint1 uint 6172#define long1 long 6173#define ulong1 ulong 6174#define double1 double 6175 6176#define vload1(OFFSET, PTR) *(OFFSET + PTR) 6177#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 6178 6179 6180#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 6181#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 6182 6183#define NO_STORE(data, offs, ptr) \ 6184 { \ 6185 } 6186 6187 6188#define vstore_partial_1_0 NO_STORE 6189#define vstore_partial_1_1 vstore1 6190#define vstore_partial_1_2 NO_STORE 6191#define vstore_partial_1_3 NO_STORE 6192#define vstore_partial_1_4 NO_STORE 6193#define vstore_partial_1_5 NO_STORE 6194#define vstore_partial_1_6 NO_STORE 6195#define vstore_partial_1_7 NO_STORE 6196#define vstore_partial_1_8 NO_STORE 6197#define vstore_partial_1_9 NO_STORE 6198#define vstore_partial_1_10 NO_STORE 6199#define vstore_partial_1_11 NO_STORE 6200#define vstore_partial_1_12 NO_STORE 6201#define vstore_partial_1_13 NO_STORE 6202#define vstore_partial_1_14 NO_STORE 6203#define vstore_partial_1_15 NO_STORE 6204#define vstore_partial_1_16 NO_STORE 6205 6206#define vstore_partial_2_0 NO_STORE 6207#define vstore_partial_2_1 vstore_partial_1 6208#define vstore_partial_2_2 vstore_partial_2 6209#define vstore_partial_2_3 NO_STORE 6210#define vstore_partial_2_4 NO_STORE 6211#define vstore_partial_2_5 NO_STORE 6212#define vstore_partial_2_6 NO_STORE 6213#define vstore_partial_2_7 NO_STORE 6214#define vstore_partial_2_8 NO_STORE 6215#define vstore_partial_2_9 NO_STORE 6216#define vstore_partial_2_10 NO_STORE 6217#define vstore_partial_2_11 NO_STORE 6218#define vstore_partial_2_12 NO_STORE 6219#define vstore_partial_2_13 NO_STORE 6220#define vstore_partial_2_14 NO_STORE 6221#define vstore_partial_2_15 NO_STORE 6222#define vstore_partial_2_16 NO_STORE 6223 6224#define vstore_partial_3_0 NO_STORE 6225#define vstore_partial_3_1 vstore_partial_1 6226#define vstore_partial_3_2 vstore_partial_2 6227#define vstore_partial_3_3 vstore_partial_3 6228#define vstore_partial_3_4 NO_STORE 6229#define vstore_partial_3_5 NO_STORE 6230#define vstore_partial_3_6 NO_STORE 6231#define vstore_partial_3_7 NO_STORE 6232#define vstore_partial_3_8 NO_STORE 6233#define vstore_partial_3_9 NO_STORE 6234#define vstore_partial_3_10 NO_STORE 6235#define vstore_partial_3_11 NO_STORE 6236#define vstore_partial_3_12 NO_STORE 6237#define vstore_partial_3_13 NO_STORE 6238#define vstore_partial_3_14 NO_STORE 6239#define vstore_partial_3_15 NO_STORE 6240#define vstore_partial_3_16 NO_STORE 6241 6242#define vstore_partial_4_0 NO_STORE 6243#define vstore_partial_4_1 vstore_partial_1 6244#define vstore_partial_4_2 vstore_partial_2 6245#define vstore_partial_4_3 vstore_partial_3 6246#define vstore_partial_4_4 vstore_partial_4 6247#define vstore_partial_4_5 NO_STORE 6248#define vstore_partial_4_6 NO_STORE 6249#define vstore_partial_4_7 NO_STORE 6250#define vstore_partial_4_8 NO_STORE 6251#define vstore_partial_4_9 NO_STORE 6252#define vstore_partial_4_10 NO_STORE 6253#define vstore_partial_4_11 NO_STORE 6254#define vstore_partial_4_12 NO_STORE 6255#define vstore_partial_4_13 NO_STORE 6256#define vstore_partial_4_14 NO_STORE 6257#define vstore_partial_4_15 NO_STORE 6258#define vstore_partial_4_16 NO_STORE 6259 6260#define vstore_partial_8_0 NO_STORE 6261#define vstore_partial_8_1 vstore_partial_1 6262#define vstore_partial_8_2 vstore_partial_2 6263#define vstore_partial_8_3 vstore_partial_3 6264#define vstore_partial_8_4 vstore_partial_4 6265#define vstore_partial_8_5 vstore_partial_5 6266#define vstore_partial_8_6 vstore_partial_6 6267#define vstore_partial_8_7 vstore_partial_7 6268#define vstore_partial_8_8 vstore_partial_8 6269#define vstore_partial_8_9 NO_STORE 6270#define vstore_partial_8_10 NO_STORE 6271#define vstore_partial_8_11 NO_STORE 6272#define vstore_partial_8_12 NO_STORE 6273#define vstore_partial_8_13 NO_STORE 6274#define vstore_partial_8_14 NO_STORE 6275#define vstore_partial_8_15 NO_STORE 6276#define vstore_partial_8_16 NO_STORE 6277 6278#define vstore_partial_16_0 NO_STORE 6279#define vstore_partial_16_1 vstore_partial_1 6280#define vstore_partial_16_2 vstore_partial_2 6281#define vstore_partial_16_3 vstore_partial_3 6282#define vstore_partial_16_4 vstore_partial_4 6283#define vstore_partial_16_5 vstore_partial_5 6284#define vstore_partial_16_6 vstore_partial_6 6285#define vstore_partial_16_7 vstore_partial_7 6286#define vstore_partial_16_8 vstore_partial_8 6287#define vstore_partial_16_9 vstore_partial_9 6288#define vstore_partial_16_10 vstore_partial_10 6289#define vstore_partial_16_11 vstore_partial_11 6290#define vstore_partial_16_12 vstore_partial_12 6291#define vstore_partial_16_13 vstore_partial_13 6292#define vstore_partial_16_14 vstore_partial_14 6293#define vstore_partial_16_15 vstore_partial_15 6294#define vstore_partial_16_16 vstore_partial_16 6295 6296 6297#define vstore_partial_1(DATA, OFFSET, PTR) \ 6298 vstore1(DATA.s0, OFFSET, PTR); 6299 6300#define vstore_partial_2(DATA, OFFSET, PTR) \ 6301 vstore2(DATA.s01, OFFSET, PTR); 6302 6303#define vstore_partial_3(DATA, OFFSET, PTR) \ 6304 vstore3(DATA.s012, OFFSET, PTR); 6305 6306#define vstore_partial_4(DATA, OFFSET, PTR) \ 6307 vstore4(DATA.s0123, OFFSET, PTR); 6308 6309#define vstore_partial_5(DATA, OFFSET, PTR) \ 6310 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 6311 vstore1(DATA.s4, OFFSET, PTR + 4); 6312 6313#define vstore_partial_6(DATA, OFFSET, PTR) \ 6314 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 6315 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 6316 6317#define vstore_partial_7(DATA, OFFSET, PTR) \ 6318 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 6319 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 6320 6321#define vstore_partial_8(DATA, OFFSET, PTR) \ 6322 vstore8(DATA.s01234567, OFFSET, PTR); 6323 6324#define vstore_partial_9(DATA, OFFSET, PTR) \ 6325 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6326 vstore1(DATA.s8, OFFSET, PTR + 8); 6327 6328#define vstore_partial_10(DATA, OFFSET, PTR) \ 6329 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6330 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 6331 6332#define vstore_partial_11(DATA, OFFSET, PTR) \ 6333 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6334 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 6335 6336#define vstore_partial_12(DATA, OFFSET, PTR) \ 6337 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6338 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 6339 6340#define vstore_partial_13(DATA, OFFSET, PTR) \ 6341 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6342 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 6343 6344#define vstore_partial_14(DATA, OFFSET, PTR) \ 6345 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6346 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 6347 6348#define vstore_partial_15(DATA, OFFSET, PTR) \ 6349 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 6350 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 6351 6352#define vstore_partial_16(DATA, OFFSET, PTR) \ 6353 vstore16(DATA, OFFSET, PTR); 6354 6355 6356 6357 6358 6359#define convert_float_sat convert_float 6360#define convert_float1_sat convert_float 6361#define convert_float2_sat convert_float2 6362#define convert_float3_sat convert_float3 6363#define convert_float4_sat convert_float4 6364#define convert_float8_sat convert_float8 6365#define convert_float16_sat convert_float16 6366#define convert_half_sat convert_float 6367#define convert_half1_sat convert_half 6368#define convert_half2_sat convert_half2 6369#define convert_half3_sat convert_half3 6370#define convert_half4_sat convert_half4 6371#define convert_half8_sat convert_half8 6372#define convert_half16_sat convert_half16 6373 6374#define convert_float1 convert_float 6375#define convert_half1 convert_half 6376#define convert_char1 convert_char 6377#define convert_uchar1 convert_uchar 6378#define convert_short1 convert_short 6379#define convert_ushort1 convert_ushort 6380#define convert_int1 convert_int 6381#define convert_uint1 convert_uint 6382#define convert_long1 convert_long 6383#define convert_ulong1 convert_ulong 6384#define convert_double1 convert_double 6385 6386#define convert_char1_sat convert_char_sat 6387#define convert_uchar1_sat convert_uchar_sat 6388#define convert_uchar2_sat convert_uchar2_sat 6389#define convert_uchar3_sat convert_uchar3_sat 6390#define convert_uchar4_sat convert_uchar4_sat 6391#define convert_uchar8_sat convert_uchar8_sat 6392#define convert_uchar16_sat convert_uchar16_sat 6393#define convert_short1_sat convert_short_sat 6394#define convert_ushort1_sat convert_ushort_sat 6395#define convert_int1_sat convert_int_sat 6396#define convert_uint1_sat convert_uint_sat 6397#define convert_long1_sat convert_long_sat 6398#define convert_ulong1_sat convert_ulong_sat 6399#define convert_double1_sat convert_double_sat 6400 6401#define VEC_DATA_TYPE_STR(type, size) type##size 6402#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 6403 6404#define CONVERT_STR(x, type) (convert_##type((x))) 6405#define CONVERT(x, type) CONVERT_STR(x, type) 6406 6407#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 6408#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 6409 6410#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 6411#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 6412 6413#define select_vec_dt_uchar(size) uchar##size 6414#define select_vec_dt_char(size) char##size 6415#define select_vec_dt_ushort(size) ushort##size 6416#define select_vec_dt_short(size) short##size 6417#define select_vec_dt_half(size) short##size 6418#define select_vec_dt_uint(size) uint##size 6419#define select_vec_dt_int(size) int##size 6420#define select_vec_dt_float(size) int##size 6421#define select_vec_dt_ulong(size) ulong##size 6422#define select_vec_dt_long(size) long##size 6423 6424#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 6425#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 6426#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 6427 6428#define signed_int_vec_dt_uchar(size) char##size 6429#define signed_int_vec_dt_char(size) char##size 6430#define signed_int_vec_dt_ushort(size) short##size 6431#define signed_int_vec_dt_short(size) short##size 6432#define signed_int_vec_dt_half(size) short##size 6433#define signed_int_vec_dt_uint(size) int##size 6434#define signed_int_vec_dt_int(size) int##size 6435#define signed_int_vec_dt_float(size) int##size 6436#define signed_int_vec_dt_ulong(size) long##size 6437#define signed_int_vec_dt_long(size) long##size 6438 6439#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 6440#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 6441#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 6442 6443#define sum_reduce_1(x) (x) 6444#define sum_reduce_2(x) ((x).s0) + ((x).s1) 6445#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 6446#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 6447#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 6448#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 6449 6450#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 6451#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 6452 6453#define prod_reduce_1(x) (x) 6454#define prod_reduce_2(x) ((x).s0) * ((x).s1) 6455#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 6456#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 6457#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 6458#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 6459 6460#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 6461#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 6462 6463#define max_reduce_1(x) (x) 6464#define max_reduce_2(x) max(((x).s0), ((x).s1)) 6465#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 6466#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 6467#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 6468#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 6469 6470#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 6471#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 6472 6473#define VECTOR_DECLARATION(name) \ 6474 __global uchar *name##_ptr, \ 6475 uint name##_stride_x, \ 6476 uint name##_step_x, \ 6477 uint name##_offset_first_element_in_bytes 6478 6479#define IMAGE_DECLARATION(name) \ 6480 __global uchar *name##_ptr, \ 6481 uint name##_stride_x, \ 6482 uint name##_step_x, \ 6483 uint name##_stride_y, \ 6484 uint name##_step_y, \ 6485 uint name##_offset_first_element_in_bytes 6486 6487#define TENSOR3D_DECLARATION(name) \ 6488 __global uchar *name##_ptr, \ 6489 uint name##_stride_x, \ 6490 uint name##_step_x, \ 6491 uint name##_stride_y, \ 6492 uint name##_step_y, \ 6493 uint name##_stride_z, \ 6494 uint name##_step_z, \ 6495 uint name##_offset_first_element_in_bytes 6496 6497#define TENSOR4D_DECLARATION(name) \ 6498 __global uchar *name##_ptr, \ 6499 uint name##_stride_x, \ 6500 uint name##_step_x, \ 6501 uint name##_stride_y, \ 6502 uint name##_step_y, \ 6503 uint name##_stride_z, \ 6504 uint name##_step_z, \ 6505 uint name##_stride_w, \ 6506 uint name##_step_w, \ 6507 uint name##_offset_first_element_in_bytes 6508 6509#define TENSOR5D_DECLARATION(name) \ 6510 __global uchar *name##_ptr, \ 6511 uint name##_stride_x, \ 6512 uint name##_step_x, \ 6513 uint name##_stride_y, \ 6514 uint name##_step_y, \ 6515 uint name##_stride_z, \ 6516 uint name##_step_z, \ 6517 uint name##_stride_w, \ 6518 uint name##_step_w, \ 6519 uint name##_stride_v, \ 6520 uint name##_step_v, \ 6521 uint name##_offset_first_element_in_bytes 6522 6523#define CONVERT_TO_VECTOR_STRUCT(name) \ 6524 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 6525 6526#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 6527 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 6528 6529#define CONVERT_TO_IMAGE_STRUCT(name) \ 6530 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 6531 6532#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 6533 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 6534 6535#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6536 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6537 6538#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 6539 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 6540 6541#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6542 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6543 6544#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 6545 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6546 name##_stride_z, name##_step_z) 6547 6548#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 6549 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 6550 6551#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 6552 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6553 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 6554 6555#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 6556 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 6557 6558#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 6559 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6560 name##_stride_z, name##_step_z) 6561 6562 6563typedef struct Vector 6564{ 6565 __global uchar *ptr; 6566 int offset_first_element_in_bytes; 6567 int stride_x; 6568} Vector; 6569 6570 6571typedef struct Image 6572{ 6573 __global uchar *ptr; 6574 int offset_first_element_in_bytes; 6575 int stride_x; 6576 int stride_y; 6577} Image; 6578 6579 6580typedef struct Tensor3D 6581{ 6582 __global uchar *ptr; 6583 int offset_first_element_in_bytes; 6584 int stride_x; 6585 int stride_y; 6586 int stride_z; 6587} Tensor3D; 6588 6589 6590typedef struct Tensor4D 6591{ 6592 __global uchar *ptr; 6593 int offset_first_element_in_bytes; 6594 int stride_x; 6595 int stride_y; 6596 int stride_z; 6597 int stride_w; 6598} Tensor4D; 6599 6600 6601inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 6602{ 6603 Vector vector = 6604 { 6605 .ptr = ptr, 6606 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6607 .stride_x = stride_x, 6608 }; 6609 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 6610 return vector; 6611} 6612 6613 6614inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 6615{ 6616 Image img = 6617 { 6618 .ptr = ptr, 6619 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6620 .stride_x = stride_x, 6621 .stride_y = stride_y 6622 }; 6623 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 6624 return img; 6625} 6626 6627 6628inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6629{ 6630 Image img = 6631 { 6632 .ptr = ptr, 6633 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6634 .stride_x = stride_x, 6635 .stride_y = stride_y 6636 }; 6637 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6638 return img; 6639} 6640 6641 6642inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6643{ 6644 Tensor3D tensor = 6645 { 6646 .ptr = ptr, 6647 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6648 .stride_x = stride_x, 6649 .stride_y = stride_y, 6650 .stride_z = stride_z 6651 }; 6652 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6653 return tensor; 6654} 6655 6656 6657inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6658{ 6659 Tensor3D tensor = 6660 { 6661 .ptr = ptr, 6662 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6663 .stride_x = stride_x, 6664 .stride_y = stride_y, 6665 .stride_z = stride_z 6666 }; 6667 return tensor; 6668} 6669 6670inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 6671 uint step_w, 6672 uint mod_size) 6673{ 6674 Tensor4D tensor = 6675 { 6676 .ptr = ptr, 6677 .offset_first_element_in_bytes = offset_first_element_in_bytes, 6678 .stride_x = stride_x, 6679 .stride_y = stride_y, 6680 .stride_z = stride_z, 6681 .stride_w = stride_w 6682 }; 6683 6684 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 6685 return tensor; 6686} 6687 6688 6689inline __global const uchar *vector_offset(const Vector *vec, int x) 6690{ 6691 return vec->ptr + x * vec->stride_x; 6692} 6693 6694 6695inline __global uchar *offset(const Image *img, int x, int y) 6696{ 6697 return img->ptr + x * img->stride_x + y * img->stride_y; 6698} 6699 6700 6701inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 6702{ 6703 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 6704} 6705 6706 6707inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 6708{ 6709 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 6710} 6711 6712 6713inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 6714{ 6715 uint num_elements = width * height; 6716 6717 const uint z = index / num_elements; 6718 6719 index %= num_elements; 6720 6721 const uint y = index / width; 6722 6723 index %= width; 6724 6725 const uint x = index; 6726 6727 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 6728} 6729 6730#endif 6731 6732 6733 6734#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) 6735#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ 6736 P_X##_DEF(1, P_A, P_B, P_C); \ 6737 REPEAT_3_1(P_X, P_A, P_B, P_C) 6738#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ 6739 P_X##_DEF(2, P_A, P_B, P_C); \ 6740 REPEAT_3_2(P_X, P_A, P_B, P_C) 6741#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ 6742 P_X##_DEF(3, P_A, P_B, P_C); \ 6743 REPEAT_3_3(P_X, P_A, P_B, P_C) 6744#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ 6745 P_X##_DEF(4, P_A, P_B, P_C); \ 6746 REPEAT_3_4(P_X, P_A, P_B, P_C) 6747#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ 6748 P_X##_DEF(5, P_A, P_B, P_C); \ 6749 REPEAT_3_5(P_X, P_A, P_B, P_C) 6750#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ 6751 P_X##_DEF(6, P_A, P_B, P_C); \ 6752 REPEAT_3_6(P_X, P_A, P_B, P_C) 6753#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ 6754 P_X##_DEF(7, P_A, P_B, P_C); \ 6755 REPEAT_3_7(P_X, P_A, P_B, P_C) 6756#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ 6757 P_X##_DEF(8, P_A, P_B, P_C); \ 6758 REPEAT_3_8(P_X, P_A, P_B, P_C) 6759#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 6760 P_X##_DEF(9, P_A, P_B, P_C); \ 6761 REPEAT_3_9(P_X, P_A, P_B, P_C) 6762#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 6763 P_X##_DEF(A, P_A, P_B, P_C); \ 6764 REPEAT_3_10(P_X, P_A, P_B, P_C) 6765#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 6766 P_X##_DEF(B, P_A, P_B, P_C); \ 6767 REPEAT_3_11(P_X, P_A, P_B, P_C) 6768#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 6769 P_X##_DEF(C, P_A, P_B, P_C); \ 6770 REPEAT_3_12(P_X, P_A, P_B, P_C) 6771#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 6772 P_X##_DEF(D, P_A, P_B, P_C); \ 6773 REPEAT_3_13(P_X, P_A, P_B, P_C) 6774#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 6775 P_X##_DEF(E, P_A, P_B, P_C); \ 6776 REPEAT_3_14(P_X, P_A, P_B, P_C) 6777#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 6778 P_X##_DEF(F, P_A, P_B, P_C); \ 6779 REPEAT_3_15(P_X, P_A, P_B, P_C) 6780 6781#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 6782#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 6783 6784 6785#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 6786#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 6787 P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 6788 REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 6789#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 6790 P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 6791 REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) 6792#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 6793 P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 6794 REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 6795#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ 6796 P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 6797 REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 6798#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ 6799 P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 6800 REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 6801#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 6802 P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 6803 REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 6804#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 6805 P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 6806 REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 6807#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 6808 P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 6809 REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 6810#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 6811 P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 6812 REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 6813#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 6814 P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 6815 REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 6816#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 6817 P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 6818 REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 6819#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 6820 P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 6821 REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 6822#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 6823 P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 6824 REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 6825#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 6826 P_X##_DEF(E, P_A, P_B, P_C, P_D); \ 6827 REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 6828#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 6829 P_X##_DEF(F, P_A, P_B, P_C, P_D); \ 6830 REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 6831 6832#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 6833#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 6834 6835 6836#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 6837#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 6838 6839 6840#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 6841#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 6842 6843 6844#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 6845#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 6846 6847 6848#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 6849#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 6850 6851 6852#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 6853#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 6854 6855 6856#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC 6857#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 6858 6859 6860#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 6861#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 6862 6863 6864#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 6865#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 6866 6867 6868#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 6869#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 6870 6871 6872#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6873#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6874 6875 6876#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6877#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6878 6879 6880#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ 6881 ({ \ 6882 VEC_DATA_TYPE(int, N0) \ 6883 VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6884 VEC_DATA_TYPE(int, N0) \ 6885 VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6886 VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 6887 }) 6888#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 6889 6890#endif 6891 6892#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6893#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6894 6895 6896 6897 6898#define TILE_VECTOR_SIZE1 1 6899#define TILE_VECTOR_SIZE2 2 6900#define TILE_VECTOR_SIZE3 3 6901#define TILE_VECTOR_SIZE4 4 6902#define TILE_VECTOR_SIZE5 8 6903#define TILE_VECTOR_SIZE6 8 6904#define TILE_VECTOR_SIZE7 8 6905#define TILE_VECTOR_SIZE8 8 6906#define TILE_VECTOR_SIZE9 16 6907#define TILE_VECTOR_SIZE10 16 6908#define TILE_VECTOR_SIZE11 16 6909#define TILE_VECTOR_SIZE12 16 6910#define TILE_VECTOR_SIZE13 16 6911#define TILE_VECTOR_SIZE14 16 6912#define TILE_VECTOR_SIZE15 16 6913#define TILE_VECTOR_SIZE16 16 6914 6915#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1 6916#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2 6917#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3 6918#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4 6919#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8 6920#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8 6921#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8 6922#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8 6923#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16 6924#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16 6925#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16 6926#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16 6927#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16 6928#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16 6929#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16 6930#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16 6931 6932 6933#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME) 6934#define TILE_STR(DATA_TYPE, H, W, BASENAME) \ 6935 union { \ 6936 DATA_TYPE s[TILE_VECTOR_SIZE##W]; \ 6937 TILE_VECTOR_TYPE##W(DATA_TYPE) v; \ 6938 } BASENAME[H] 6939 6940#define TENSOR4D_IMAGE(name) \ 6941 __read_only image2d_t name##_img, \ 6942 __global uchar *name##_ptr, \ 6943 uint name##_stride_x, \ 6944 uint name##_step_x, \ 6945 uint name##_stride_y, \ 6946 uint name##_step_y, \ 6947 uint name##_stride_z, \ 6948 uint name##_step_z, \ 6949 uint name##_stride_w, \ 6950 uint name##_step_w, \ 6951 uint name##_offset_first_element_in_bytes 6952 6953#define TENSOR4D_BUFFER(name) \ 6954 __global uchar *name##_ptr, \ 6955 uint name##_stride_x, \ 6956 uint name##_step_x, \ 6957 uint name##_stride_y, \ 6958 uint name##_step_y, \ 6959 uint name##_stride_z, \ 6960 uint name##_step_z, \ 6961 uint name##_stride_w, \ 6962 uint name##_step_w, \ 6963 uint name##_offset_first_element_in_bytes 6964 6965#define TENSOR4D_STR(name, type) TENSOR4D_##type(name) 6966#define TENSOR4D(name, type) TENSOR4D_STR(name, type) 6967 6968#define TENSOR4D_T_IMAGE(name) \ 6969 __read_only image2d_t name##_img, \ 6970 __global uchar *name##_ptr, \ 6971 uint name##_stride_y, \ 6972 uint name##_stride_z, \ 6973 uint name##_stride_w, \ 6974 uint name##_c, \ 6975 uint name##_w, \ 6976 uint name##_h, \ 6977 uint name##_n, \ 6978 uint name##_offset_first_element_in_bytes 6979 6980#define TENSOR4D_T_BUFFER(name) \ 6981 __global uchar *name##_ptr, \ 6982 uint name##_stride_y, \ 6983 uint name##_stride_z, \ 6984 uint name##_stride_w, \ 6985 uint name##_c, \ 6986 uint name##_w, \ 6987 uint name##_h, \ 6988 uint name##_n, \ 6989 uint name##_offset_first_element_in_bytes 6990 6991#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) 6992 6993 6994#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) 6995 6996#define TENSOR4D_RO_T_IMAGE(name) \ 6997 __read_only image2d_t name##_img, \ 6998 TENSOR4D_T_BUFFER(name) 6999 7000#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 7001 7002#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) 7003 7004 7005#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) 7006 7007#define TENSOR4D_WO_T_IMAGE(name) \ 7008 __write_only image2d_t name##_img, \ 7009 TENSOR4D_T_BUFFER(name) 7010 7011#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 7012 7013#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) 7014 7015 7016#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) 7017 7018#define TENSOR3D_T_IMAGE(name) \ 7019 __read_only image2d_t name##_img, \ 7020 __global uchar *name##_ptr, \ 7021 uint name##_stride_y, \ 7022 uint name##_stride_z, \ 7023 uint name##_w, \ 7024 uint name##_h, \ 7025 uint name##_n, \ 7026 uint name##_offset_first_element_in_bytes 7027 7028#define TENSOR3D_T_BUFFER(name) \ 7029 __global uchar *name##_ptr, \ 7030 uint name##_stride_y, \ 7031 uint name##_stride_z, \ 7032 uint name##_w, \ 7033 uint name##_h, \ 7034 uint name##_n, \ 7035 uint name##_offset_first_element_in_bytes 7036 7037#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name) 7038#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type) 7039 7040#if !defined(UNROLL_WITH_PRAGMA) 7041#define UNROLL_INCR(idx, step, macro) idx += (step); (macro) 7042 7043#define LOOP_UNROLLING_1(idx, step, macro) (macro) 7044#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro) 7045#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro) 7046#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro) 7047#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro) 7048#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro) 7049#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro) 7050#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro) 7051#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro) 7052#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro) 7053#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro) 7054#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro) 7055#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro) 7056#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro) 7057#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro) 7058#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro) 7059#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro) 7060#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro) 7061#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro) 7062#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro) 7063#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro) 7064#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro) 7065#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro) 7066#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro) 7067#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro) 7068#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro) 7069#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro) 7070#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro) 7071#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro) 7072#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro) 7073#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro) 7074#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro) 7075#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro) 7076#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro) 7077#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro) 7078#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro) 7079#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro) 7080#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro) 7081#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro) 7082#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro) 7083#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro) 7084#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro) 7085#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro) 7086#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro) 7087#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro) 7088#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro) 7089#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro) 7090#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro) 7091#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro) 7092#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro) 7093#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro) 7094#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro) 7095#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro) 7096#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro) 7097#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro) 7098#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro) 7099#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro) 7100#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro) 7101#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro) 7102#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro) 7103#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro) 7104#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro) 7105#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro) 7106#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro) 7107#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro) 7108#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro) 7109#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro) 7110#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro) 7111#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro) 7112#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro) 7113#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro) 7114#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro) 7115#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro) 7116#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro) 7117#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro) 7118#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro) 7119#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro) 7120#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro) 7121#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro) 7122#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro) 7123#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro) 7124#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro) 7125#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro) 7126#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro) 7127#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro) 7128#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro) 7129#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro) 7130#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro) 7131#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro) 7132#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro) 7133#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro) 7134#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro) 7135#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro) 7136#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro) 7137#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro) 7138#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro) 7139#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro) 7140#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro) 7141#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro) 7142#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro) 7143#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro) 7144#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro) 7145#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro) 7146#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro) 7147#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro) 7148#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro) 7149#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro) 7150#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro) 7151#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro) 7152#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro) 7153#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro) 7154#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro) 7155#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro) 7156#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro) 7157#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro) 7158#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro) 7159#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro) 7160#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro) 7161#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro) 7162#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro) 7163#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro) 7164#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro) 7165#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro) 7166#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro) 7167#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro) 7168#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro) 7169#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro) 7170#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro) 7171 7172#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \ 7173 { \ 7174 type idx = start; \ 7175 LOOP_UNROLLING_##num(idx, step, macro); \ 7176 } 7177#else 7178#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \ 7179 { \ 7180 _Pragma("unroll") \ 7181 for(type idx = start; idx < (num * step); idx += step) \ 7182 { \ 7183 (macro); \ 7184 } \ 7185 } 7186#endif 7187#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro) 7188 7189 7190#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0)) 7191 7192 7193#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) 7194#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) 7195#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7196 ({ \ 7197 c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \ 7198 }) 7199#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product) 7200#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0))); 7201#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0)); 7202#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b)); 7203#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 7204#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c)); 7205#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c)); 7206#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c)); 7207#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 7208#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0))); 7209#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0)); 7210#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b)); 7211#else 7212#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7213 ({ \ 7214 c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \ 7215 c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \ 7216 }) 7217#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7218 ({ \ 7219 DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \ 7220 c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \ 7221 }) 7222#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \ 7223 ({ \ 7224 val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \ 7225 val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \ 7226 val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \ 7227 val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \ 7228 }) 7229#endif 7230#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7231 ({ \ 7232 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \ 7233 DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \ 7234 }) 7235#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7236 ({ \ 7237 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \ 7238 DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \ 7239 }) 7240#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7241 ({ \ 7242 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \ 7243 DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \ 7244 }) 7245#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7246 ({ \ 7247 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \ 7248 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \ 7249 }) 7250#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7251 ({ \ 7252 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7253 DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \ 7254 }) 7255#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7256 ({ \ 7257 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7258 DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \ 7259 }) 7260#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7261 ({ \ 7262 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7263 DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \ 7264 }) 7265#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7266 ({ \ 7267 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7268 DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \ 7269 }) 7270#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7271 ({ \ 7272 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7273 DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \ 7274 }) 7275#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7276 ({ \ 7277 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7278 DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \ 7279 }) 7280#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7281 ({ \ 7282 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \ 7283 DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \ 7284 }) 7285#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \ 7286 ({ \ 7287 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \ 7288 DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \ 7289 }) 7290 7291 7292#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) 7293#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c) 7294 7295 7296#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) 7297#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) 7298#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \ 7299 VLOAD(WIDTH) \ 7300 (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) 7301#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y)) 7302 7303 7304#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) 7305#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) 7306#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \ 7307 VSTORE(WIDTH) \ 7308 (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) 7309#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES) 7310 7311 7312#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \ 7313 ({ \ 7314 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7315 { \ 7316 dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \ 7317 }) \ 7318 }) 7319 7320 7321#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \ 7322 ({ \ 7323 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7324 { \ 7325 dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \ 7326 }) \ 7327 }) 7328 7329 7330#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \ 7331 ({ \ 7332 if(WIDTH1_CONDITION) \ 7333 { \ 7334 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7335 { \ 7336 VLOAD_PARTIAL(WIDTH0, WIDTH1) \ 7337 (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \ 7338 }) \ 7339 } \ 7340 else \ 7341 { \ 7342 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7343 { \ 7344 dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \ 7345 }) \ 7346 } \ 7347 }) 7348 7349#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \ 7350 ({ \ 7351 LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \ 7352 { \ 7353 LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \ 7354 { \ 7355 int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \ 7356 _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \ 7357 int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \ 7358 if(_src_valid_y != 0) \ 7359 { \ 7360 dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \ 7361 } \ 7362 }) \ 7363 }) \ 7364 }) 7365 7366 7367#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \ 7368 ({ \ 7369 LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \ 7370 { \ 7371 LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \ 7372 { \ 7373 int _src_y = (X) + _xk * (DILATION_X); \ 7374 int _src_z = ((Y) + _yk * (DILATION_Y)); \ 7375 int _src_w = (B); \ 7376 bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \ 7377 if(!(BOUNDARY_CHECK)) \ 7378 { \ 7379 dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \ 7380 (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \ 7381 } \ 7382 else \ 7383 { \ 7384 if(_src_valid_y) \ 7385 { \ 7386 dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \ 7387 (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \ 7388 } \ 7389 } \ 7390 }) \ 7391 }) \ 7392 }) 7393 7394 7395#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \ 7396 ({ \ 7397 LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ 7398 { \ 7399 int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \ 7400 _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \ 7401 int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \ 7402 if(_src_valid_y != 0) \ 7403 { \ 7404 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \ 7405 } \ 7406 }) \ 7407 }) 7408 7409 7410#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) 7411#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) 7412#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ 7413 ({ \ 7414 LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ 7415 { \ 7416 if(yi[0].s[_i] >= 0) \ 7417 { \ 7418 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ 7419 } \ 7420 }) \ 7421 }) 7422 7423#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ 7424 ({ \ 7425 LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ 7426 { \ 7427 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ 7428 }) \ 7429 }) 7430 7431 7432#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \ 7433 ({ \ 7434 LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ 7435 { \ 7436 int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \ 7437 _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \ 7438 int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \ 7439 && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \ 7440 if(_src_valid_y != 0) \ 7441 { \ 7442 dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \ 7443 } \ 7444 }) \ 7445 }) 7446 7447 7448#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \ 7449 ({ \ 7450 if(WIDTH1_CONDITION) \ 7451 { \ 7452 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7453 { \ 7454 VSTORE_PARTIAL(WIDTH0, WIDTH1) \ 7455 (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \ 7456 }) \ 7457 } \ 7458 else \ 7459 { \ 7460 LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \ 7461 { \ 7462 VSTORE(WIDTH0) \ 7463 (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \ 7464 }) \ 7465 } \ 7466 }) 7467 7468 7469#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \ 7470 ({ \ 7471 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7472 { \ 7473 ACC_DATA_TYPE _tm = 0; \ 7474 LOOP_UNROLLING(int, _k0, 0, 1, K0, \ 7475 { \ 7476 _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \ 7477 }) \ 7478 LOOP_UNROLLING(int, _n0, 0, 1, N0, \ 7479 { \ 7480 dst[_m0].s[_n0] += _tm; \ 7481 LOOP_UNROLLING(int, _k0, 0, 1, K0, \ 7482 { \ 7483 dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \ 7484 }) \ 7485 }) \ 7486 }) \ 7487 }) 7488 7489 7490#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) 7491#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) 7492 7493 7494#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \ 7495 ({ \ 7496 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7497 { \ 7498 LOOP_UNROLLING(int, _n0, 0, 1, N0, \ 7499 { \ 7500 SRC_DATA_TYPE _tmp = 0; \ 7501 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \ 7502 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \ 7503 SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \ 7504 long a_64 = (long)(_src); \ 7505 long b_64 = (long)(DST_MULTIPLIER); \ 7506 long ab_64 = a_64 * b_64; \ 7507 long mask1 = 1 << 30; \ 7508 long mask2 = 1 - (1 << 30); \ 7509 long is_positive_or_zero = ab_64 >= 0; \ 7510 long nudge = select(mask2, mask1, is_positive_or_zero); \ 7511 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ 7512 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ 7513 if(DST_SHIFT >= 0) \ 7514 { \ 7515 long mask = ((((int)1) << DST_SHIFT) - (long)1); \ 7516 long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \ 7517 _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \ 7518 } \ 7519 _tmp += DST_OFFSET; \ 7520 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ 7521 }) \ 7522 }) \ 7523 }) 7524 7525 7526#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \ 7527 ({ \ 7528 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7529 { \ 7530 LOOP_UNROLLING(int, _n0, 0, 1, N0, \ 7531 { \ 7532 SRC_DATA_TYPE _tmp = 0; \ 7533 SRC_DATA_TYPE _tmp2 = 0; \ 7534 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \ 7535 SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \ 7536 SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \ 7537 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \ 7538 SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \ 7539 long a_64 = (long)(_src); \ 7540 long b_64 = (long)(_dst_multiplier); \ 7541 long ab_64 = a_64 * b_64; \ 7542 long mask1 = 1 << 30; \ 7543 long mask2 = 1 - (1 << 30); \ 7544 long is_positive_or_zero = ab_64 >= 0; \ 7545 long nudge = select(mask2, mask1, is_positive_or_zero); \ 7546 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ 7547 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ 7548 long mask = ((((int)1) << _dst_shift) - (int)1); \ 7549 long threshold = (mask >> 1) + any(_tmp); \ 7550 _tmp2 = _tmp >> _dst_shift; \ 7551 _tmp2 += select(0, 1, (_tmp & mask) > threshold); \ 7552 _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \ 7553 _tmp += DST_OFFSET; \ 7554 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ 7555 }) \ 7556 }) \ 7557 }) 7558 7559 7560#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \ 7561 ({ \ 7562 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7563 { \ 7564 LOOP_UNROLLING(int, _n0, 0, 1, N0, \ 7565 { \ 7566 SRC_DATA_TYPE _tmp = 0; \ 7567 SRC_DATA_TYPE _src = src[_m0].s[_n0]; \ 7568 _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \ 7569 SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \ 7570 long a_64 = (long)(_src); \ 7571 long b_64 = (long)(DST_MULTIPLIER); \ 7572 long ab_64 = a_64 * b_64; \ 7573 long mask1 = 1 << 30; \ 7574 long mask2 = 1 - (1 << 30); \ 7575 long is_positive_or_zero = ab_64 >= 0; \ 7576 long nudge = select(mask2, mask1, is_positive_or_zero); \ 7577 SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \ 7578 _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \ 7579 if(DST_SHIFT >= 0) \ 7580 { \ 7581 long mask = ((((int)1) << DST_SHIFT) - (int)1); \ 7582 long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \ 7583 _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \ 7584 } \ 7585 _tmp += DST_OFFSET; \ 7586 dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \ 7587 }) \ 7588 }) \ 7589 }) 7590 7591 7592#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \ 7593 ({ \ 7594 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7595 { \ 7596 LOOP_UNROLLING(int, _n0, 0, 1, N0, \ 7597 { \ 7598 a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \ 7599 }) \ 7600 }) \ 7601 }) 7602 7603 7604#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \ 7605 ({ \ 7606 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7607 { \ 7608 dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \ 7609 }) \ 7610 }) 7611 7612 7613#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x)) 7614 7615#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x))) 7616 7617#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 7618 7619#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f)) 7620 7621#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x) 7622 7623#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) 7624#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) 7625 7626#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL)) 7627#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL)) 7628#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL)) 7629#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL)) 7630 7631 7632#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \ 7633 ({ \ 7634 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7635 { \ 7636 dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \ 7637 }) \ 7638 }) 7639 7640 7641#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \ 7642 ({ \ 7643 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7644 { \ 7645 dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \ 7646 }) \ 7647 }) 7648 7649 7650#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \ 7651 ({ \ 7652 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7653 { \ 7654 dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \ 7655 }) \ 7656 }) 7657 7658#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7659#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7660#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7661 7662#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7663#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7664 7665#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7666 7667#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7668#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7669 7670 7671#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \ 7672 ({ \ 7673 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7674 { \ 7675 dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \ 7676 }) \ 7677 }) 7678 7679 7680#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ 7681 ({ \ 7682 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7683 { \ 7684 dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ 7685 }) \ 7686 }) 7687 7688 7689#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ 7690 ({ \ 7691 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7692 { \ 7693 dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ 7694 }) \ 7695 }) 7696 7697#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7698#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7699#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7700#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) 7701 7702 7703#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \ 7704 ({ \ 7705 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7706 { \ 7707 dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ 7708 }) \ 7709 }) 7710 7711 7712#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \ 7713 ({ \ 7714 LOOP_UNROLLING(int, _m0, 0, 1, M0, \ 7715 { \ 7716 dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \ 7717 }) \ 7718 }) 7719 7720 7721#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7722#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7723#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7724#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7725#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7726#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7727#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7728#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) 7729#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \ 7730 { \ 7731 LOOP_UNROLLING(int, _m, 0, 1, M0, \ 7732 { \ 7733 LOOP_UNROLLING(int, _n, 0, 1, N0, \ 7734 { \ 7735 LOOP_UNROLLING(int, _k, 0, 1, K0, \ 7736 { \ 7737 dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \ 7738 }) \ 7739 }) \ 7740 }) \ 7741 } 7742 7743#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \ 7744 ({ \ 7745 LOOP_UNROLLING(int, _m, 0, 1, M0, \ 7746 { \ 7747 LOOP_UNROLLING(int, _n, 0, 1, N0, \ 7748 { \ 7749 DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \ 7750 }) \ 7751 }) \ 7752 }) 7753 7754#endif 7755 7756#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) 7757 7758#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 7759#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 7760#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); 7761#else 7762#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); 7763#endif 7764#endif 7765 7766#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 7767 7768#define ARM_DOT1(a, b, c) \ 7769 ({ \ 7770 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ 7771 }) 7772#define ARM_DOT2(a, b, c) \ 7773 ({ \ 7774 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ 7775 }) 7776#define ARM_DOT3(a, b, c) \ 7777 ({ \ 7778 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ 7779 }) 7780#define ARM_DOT4(a, b, c) \ 7781 ({ \ 7782 ARM_DOT(a, b, c); \ 7783 }) 7784#define ARM_DOT8(a, b, c) \ 7785 ({ \ 7786 ARM_DOT4((a.lo), (b.lo), c); \ 7787 ARM_DOT4((a.hi), (b.hi), c); \ 7788 }) 7789#define ARM_DOT16(a, b, c) \ 7790 ({ \ 7791 ARM_DOT8((a.lo), (b.lo), c); \ 7792 ARM_DOT8((a.hi), (b.hi), c); \ 7793 }) 7794 7795#else 7796 7797 7798#define ARM_DOT1(a, b, c) \ 7799 ({ \ 7800 c += (ACC_DATA_TYPE)a * b; \ 7801 }) 7802#define ARM_DOT2(a, b, c) \ 7803 ({ \ 7804 c += (ACC_DATA_TYPE)a.s0 * b.s0; \ 7805 c += (ACC_DATA_TYPE)a.s1 * b.s1; \ 7806 }) 7807#define ARM_DOT3(a, b, c) \ 7808 ({ \ 7809 ARM_DOT2(a, b, c); \ 7810 c += (ACC_DATA_TYPE)a.s2 * b.s2; \ 7811 }) 7812#define ARM_DOT4(a, b, c) \ 7813 ({ \ 7814 ARM_DOT3(a, b, c); \ 7815 c += (ACC_DATA_TYPE)a.s3 * b.s3; \ 7816 }) 7817#define ARM_DOT8(a, b, c) \ 7818 ({ \ 7819 ARM_DOT4((a.lo), (b.lo), c); \ 7820 ARM_DOT4((a.hi), (b.hi), c); \ 7821 }) 7822#define ARM_DOT16(a, b, c) \ 7823 ({ \ 7824 ARM_DOT8((a.lo), (b.lo), c); \ 7825 ARM_DOT8((a.hi), (b.hi), c); \ 7826 }) 7827#endif 7828 7829 7830#define ARM_DOT_K0X1(k0, a, b, c) \ 7831 ({ \ 7832 ARM_DOT_K0(k0, (a), (b##0), (c)); \ 7833 }) 7834#define ARM_DOT_K0X2(k0, a, b, c) \ 7835 ({ \ 7836 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ 7837 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ 7838 }) 7839#define ARM_DOT_K0X3(k0, a, b, c) \ 7840 ({ \ 7841 ARM_DOT_K0X2(k0, a, b, c); \ 7842 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ 7843 }) 7844#define ARM_DOT_K0X4(k0, a, b, c) \ 7845 ({ \ 7846 ARM_DOT_K0X3(k0, a, b, c); \ 7847 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ 7848 }) 7849#define ARM_DOT_K0X8(k0, a, b, c) \ 7850 ({ \ 7851 ARM_DOT_K0X4(k0, a, b, c); \ 7852 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ 7853 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ 7854 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ 7855 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ 7856 }) 7857#define ARM_DOT_K0X16(k0, a, b, c) \ 7858 ({ \ 7859 ARM_DOT_K0X8(k0, a, b, c); \ 7860 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ 7861 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ 7862 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ 7863 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ 7864 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ 7865 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ 7866 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ 7867 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ 7868 }) 7869 7870 7871#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \ 7872 ({ \ 7873 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \ 7874 }) 7875#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ 7876 ({ \ 7877 ARM_MM_K0XN0X1(n0, k0, a, b, c); \ 7878 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ 7879 }) 7880#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ 7881 ({ \ 7882 ARM_MM_K0XN0X2(n0, k0, a, b, c); \ 7883 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ 7884 }) 7885#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ 7886 ({ \ 7887 ARM_MM_K0XN0X3(n0, k0, a, b, c); \ 7888 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ 7889 }) 7890#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ 7891 ({ \ 7892 ARM_MM_K0XN0X4(n0, k0, a, b, c); \ 7893 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ 7894 }) 7895#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ 7896 ({ \ 7897 ARM_MM_K0XN0X5(n0, k0, a, b, c); \ 7898 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ 7899 }) 7900#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ 7901 ({ \ 7902 ARM_MM_K0XN0X6(n0, k0, a, b, c); \ 7903 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ 7904 }) 7905#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ 7906 ({ \ 7907 ARM_MM_K0XN0X7(n0, k0, a, b, c); \ 7908 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ 7909 }) 7910 7911#define ARM_DOT_K0(k0, a, b, c) \ 7912 ({ \ 7913 CONCAT(ARM_DOT, k0) \ 7914 ((a), (b), (c)); \ 7915 }) 7916 7917#define ARM_DOT_K0XN0(n0, k0, a, b, c) \ 7918 ({ \ 7919 CONCAT(ARM_DOT_K0X, n0) \ 7920 (k0, (a), b, (c)); \ 7921 }) 7922 7923#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ 7924 ({ \ 7925 CONCAT(ARM_MM_K0XN0X, m0) \ 7926 (n0, k0, a, b, c); \ 7927 }) 7928 7929 7930#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \ 7931 ({ \ 7932 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \ 7933 }) 7934#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ 7935 ({ \ 7936 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ 7937 c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ 7938 }) 7939#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ 7940 ({ \ 7941 ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ 7942 c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ 7943 }) 7944#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ 7945 ({ \ 7946 ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ 7947 c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ 7948 }) 7949#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ 7950 ({ \ 7951 ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ 7952 c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ 7953 c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ 7954 c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ 7955 c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ 7956 }) 7957#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ 7958 ({ \ 7959 ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ 7960 c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ 7961 c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ 7962 c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ 7963 c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ 7964 c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ 7965 c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ 7966 c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ 7967 c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ 7968 }) 7969 7970#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ 7971 ({ \ 7972 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \ 7973 }) 7974#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ 7975 ({ \ 7976 ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ 7977 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ 7978 }) 7979#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ 7980 ({ \ 7981 ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ 7982 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ 7983 }) 7984#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ 7985 ({ \ 7986 ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ 7987 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ 7988 }) 7989#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ 7990 ({ \ 7991 ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ 7992 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ 7993 }) 7994#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ 7995 ({ \ 7996 ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ 7997 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ 7998 }) 7999#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ 8000 ({ \ 8001 ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ 8002 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ 8003 }) 8004#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ 8005 ({ \ 8006 ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ 8007 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ 8008 }) 8009#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ 8010 ({ \ 8011 CONCAT(ARM_MUL_N0X, k0) \ 8012 (VECTOR_ACC_TYPE, (a), b, (c)); \ 8013 }) 8014#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ 8015 ({ \ 8016 CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ 8017 (VECTOR_ACC_TYPE, k0, a, b, c); \ 8018 }) 8019 8020#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T) 8021 8022__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), 8023 IMAGE_DECLARATION(rhs), 8024 IMAGE_DECLARATION(dst), 8025 uint k, 8026 uint lhs_stride_z, 8027 uint rhs_stride_z, 8028 uint dst_stride_z 8029#if defined(REINTERPRET_OUTPUT_AS_3D) 8030 , 8031 uint dst_cross_plane_pad 8032#endif 8033 ) 8034{ 8035 8036#define LHS_BLOCK_SIZE ((K0) * (M0)) 8037 8038#if defined(LHS_INTERLEAVE) 8039#define LHS_OFFSET_X (K0) 8040#define LHS_STEP_X ((K0) * (V0)) 8041#define LHS_STEP_LOOP (1) 8042#else 8043#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 8044#define LHS_STEP_X (K0) 8045#define LHS_STEP_LOOP (V0) 8046#endif 8047 8048 8049#define RHS_BLOCK_SIZE ((K0) * (N0)) 8050 8051 8052#if defined(RHS_INTERLEAVE) 8053#define RHS_OFFSET_X (K0) 8054#define RHS_STEP_X ((K0) * (H0)) 8055#define RHS_STEP_LOOP (1) 8056#else 8057#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 8058#define RHS_STEP_X (K0) 8059#define RHS_STEP_LOOP (H0) 8060#endif 8061 8062 uint x = get_global_id(0); 8063 uint y = get_global_id(1); 8064 uint z = get_global_id(2); 8065 8066#if defined(DUMMY_WORK_ITEMS) 8067 if((x * N0 >= N) || (y * M0 >= M)) 8068 { 8069 return; 8070 } 8071#endif 8072 8073 8074 __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z)); 8075 8076 8077 __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y); 8078 8079#if defined(MATRIX_B_DEPTH) 8080 8081 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; 8082#else 8083 rhs_addr += z * rhs_stride_z; 8084#endif 8085 8086 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); 8087 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); 8088 8089 8090 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); 8091 8092 for(int i = 0; i < k; i += K0) 8093 { 8094 8095 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs); 8096 8097 8098 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs); 8099 8100 8101 ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); 8102 8103 8104 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP); 8105 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP); 8106 } 8107 8108 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y); 8109 8110 REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); 8111 8112#if defined(REINTERPRET_OUTPUT_AS_3D) 8113 8114 CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 8115 8116 8117 8118 dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 8119 8120#else 8121 8122 8123 dst_addr += z * dst_stride_z; 8124 8125#endif 8126 8127 8128 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); 8129 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); 8130 8131 8132 REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp); 8133 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 8134 8135#undef LHS_BLOCK_SIZE 8136#undef LHS_OFFSET_X 8137#undef LHS_STEP_X 8138#undef RHS_BLOCK_SIZE 8139#undef RHS_OFFSET_X 8140#undef RHS_STEP_X 8141} 8142#endif 8143 8144#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) 8145#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) 8146#define FUSED_OUTPUT_STAGE_FIXED_POINT 8147#endif 8148 8149 8150#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) 8151__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint 8152#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) 8153__kernel void gemmlowp_mm_reshaped_only_rhs_t 8154#endif 8155(IMAGE_DECLARATION(lhs), 8156 IMAGE_DECLARATION(rhs), 8157 IMAGE_DECLARATION(dst), 8158 uint lhs_stride_z, 8159 uint rhs_stride_z, 8160 uint dst_stride_z 8161#if defined(REINTERPRET_INPUT_AS_3D) 8162 , 8163 uint lhs_cross_plane_pad 8164#endif 8165#if defined(REINTERPRET_OUTPUT_AS_3D) 8166 , 8167 uint dst_cross_plane_pad 8168#endif 8169#if defined(A_OFFSET) 8170 , 8171 IMAGE_DECLARATION(sum_col) 8172#endif 8173#if defined(B_OFFSET) 8174 , 8175 IMAGE_DECLARATION(sum_row) 8176#endif 8177#if defined(ADD_BIAS) 8178 , 8179 VECTOR_DECLARATION(biases) 8180#endif 8181#if defined(PER_CHANNEL_QUANTIZATION) 8182 , 8183 VECTOR_DECLARATION(result_multipliers), 8184 VECTOR_DECLARATION(result_shifts) 8185#endif 8186) 8187{ 8188 8189#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y) 8190#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y) 8191 8192 8193#if defined(RHS_INTERLEAVE) 8194#define RHS_OFFSET_X (K0) 8195#define RHS_STEP_X (K0 * H0) 8196#else 8197#define RHS_OFFSET_X (K0 * N0) 8198#define RHS_STEP_X (K0) 8199#endif 8200#define RHS_STEP_LOOP (N0 * K0 * H0) 8201 8202 uint x = GET_SPATIAL_IDX(0, 1, 1); 8203 uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0); 8204 uint z = GET_SPATIAL_IDX(2, 1, 1); 8205 int xo = (x * N0); 8206 8207#if defined(DUMMY_WORK_ITEMS) 8208 if((xo >= N) || (y >= M)) 8209 { 8210 return; 8211 } 8212#endif 8213 8214 8215 uint lhs_y = y + z * FULL_LHS_HEIGHT; 8216 8217 8218 uint rhs_offset_x = (x % H0) * RHS_OFFSET_X; 8219 uint rhs_offset_y = (x / H0) * rhs_stride_y; 8220 8221#if defined(MATRIX_B_DEPTH) 8222 8223 rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z; 8224#else 8225 rhs_offset_y += z * rhs_stride_z; 8226#endif 8227 8228 8229 TILE(ACC_DATA_TYPE, M0, N0, c); 8230 LOOP_UNROLLING(int, i, 0, 1, M0, 8231 { 8232 c[i].v = 0; 8233 }) 8234 8235 int i = 0; 8236 for(; i <= (K - K0); i += K0) 8237 { 8238 TILE(DATA_TYPE, M0, K0, a); 8239 TILE(DATA_TYPE, N0, K0, b); 8240 8241 8242 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a); 8243 8244 8245 LOOP_UNROLLING(int, _i, 0, 1, N0, 8246 { 8247 b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X)); 8248 }) 8249 8250 8251 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); 8252 8253 rhs_offset_x += RHS_STEP_LOOP; 8254 } 8255 8256#if((K % K0) != 0) 8257 8258 8259 for(; i < K; ++i) 8260 { 8261 TILE(DATA_TYPE, M0, 1, a); 8262 TILE(DATA_TYPE, N0, 1, b); 8263 8264 8265 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a); 8266 8267 LOOP_UNROLLING(int, _i, 0, 1, N0, 8268 { 8269 b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X); 8270 }) 8271 8272 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); 8273 8274 rhs_offset_x += 1; 8275 } 8276#endif 8277 8278#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) 8279 8280 TILE(int, M0, N0, c_int); 8281 TILE(int, M0, N0, offset_s32); 8282 LOOP_UNROLLING(int, i, 0, 1, M0, 8283 { 8284 offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET; 8285 }) 8286 8287 LOOP_UNROLLING(int, i, 0, 1, M0, 8288 { 8289 c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); 8290 }) 8291 8292#if defined(A_OFFSET) 8293 8294#if defined(SUM_COL_HAS_BATCHES) 8295 int sum_col_y = z; 8296#else 8297 int sum_col_y = 0; 8298#endif 8299 TILE(int, 1, N0, a_offset_s32); 8300 8301 T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32); 8302 8303 a_offset_s32[0].v *= A_OFFSET; 8304 8305 T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32); 8306#endif 8307 8308#if defined(B_OFFSET) 8309 8310 8311 8312 8313 TILE(int, M0, N0, b_offset_s32); 8314 8315 T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32); 8316 8317 LOOP_UNROLLING(int, i, 0, 1, M0, 8318 { 8319 offset_s32[i].v += b_offset_s32[i].v *B_OFFSET; 8320 }) 8321 8322#endif 8323 8324#if defined(ADD_BIAS) 8325 8326 TILE(int, 1, N0, bias); 8327 8328 T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias); 8329 8330 T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32); 8331#endif 8332 8333 LOOP_UNROLLING(int, i, 0, 1, M0, 8334 { 8335 c_int[i].v += offset_s32[i].v; 8336 }) 8337 8338 TILE(DATA_TYPE, M0, N0, c_lp); 8339 8340 8341#if defined(PER_CHANNEL_QUANTIZATION) 8342 TILE(int, 1, N0, res_mul); 8343 TILE(int, 1, N0, res_shift); 8344 8345 T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul); 8346 T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift); 8347 8348 T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp); 8349#else 8350 T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp); 8351#endif 8352 8353#if defined(MIN_BOUND) 8354 LOOP_UNROLLING(int, i, 0, 1, M0, 8355 { 8356 c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND); 8357 }) 8358#endif 8359#if defined(MAX_BOUND) 8360 LOOP_UNROLLING(int, i, 0, 1, M0, 8361 { 8362 c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND); 8363 }) 8364#endif 8365 8366#else 8367 TILE(int, M0, N0, c_lp); 8368 8369 LOOP_UNROLLING(int, i, 0, 1, M0, 8370 { 8371 c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); 8372 }) 8373#endif 8374 8375 TILE(uint, M0, 1, dst_indirect_y); 8376 8377 LOOP_UNROLLING(int, i, 0, 1, M0, 8378 { 8379#if defined(REINTERPRET_OUTPUT_AS_3D) 8380 dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1); 8381 dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT; 8382 dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D; 8383#else 8384 dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z *FULL_DST_HEIGHT; 8385#endif 8386 }) 8387 8388 const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0); 8389 8390#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) 8391 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); 8392#else 8393 T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); 8394#endif 8395 8396#undef RHS_OFFSET_X 8397#undef RHS_STEP_X 8398#undef RHS_STEP_LOOP 8399} 8400#endif 8401 8402#if defined(GEMMLOWP_MM_NATIVE) 8403 8404 8405__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), 8406 IMAGE_DECLARATION(rhs), 8407 IMAGE_DECLARATION(dst), 8408 uint lhs_stride_z, 8409 uint rhs_stride_z, 8410 uint dst_stride_z 8411#if defined(REINTERPRET_INPUT_AS_3D) 8412 , 8413 uint lhs_cross_plane_pad 8414#endif 8415#if defined(REINTERPRET_OUTPUT_AS_3D) 8416 , 8417 uint dst_cross_plane_pad 8418#endif 8419 ) 8420{ 8421 uint x = get_global_id(0); 8422 uint y = get_global_id(1); 8423 uint z = get_global_id(2); 8424 8425#if defined(DUMMY_WORK_ITEMS) 8426 if((x * N0 >= N) || (y * M0 >= M)) 8427 { 8428 return; 8429 } 8430#endif 8431 8432 8433 uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; 8434 8435 8436 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); 8437 8438#if defined(MATRIX_B_DEPTH) 8439 8440 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; 8441#else 8442 rhs_offset += z * rhs_stride_z; 8443#endif 8444 8445 REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); 8446 REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); 8447 8448#if defined(REINTERPRET_INPUT_AS_3D) 8449 8450 CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); 8451 8452 8453 8454 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; 8455 8456#else 8457 8458 8459 lhs_offset += z * lhs_stride_z; 8460 8461#endif 8462 8463 8464 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); 8465 8466 int i = 0; 8467 8468 for(; i <= (K - K0); i += K0) 8469 { 8470 8471 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); 8472 8473 8474 LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); 8475 8476 8477#if(GPU_ARCH == GPU_ARCH_MIDGARD) 8478 ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); 8479#else 8480 8481 TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); 8482 8483 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); 8484#endif 8485 8486 8487 lhs_offset += K0; 8488 rhs_offset += K0 * rhs_stride_y; 8489 } 8490 8491 8492 for(; i < K; ++i) 8493 { 8494 8495 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); 8496 8497 8498 LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); 8499 8500 8501#if(GPU_ARCH == GPU_ARCH_MIDGARD) 8502 ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); 8503#else 8504 8505 TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); 8506 8507 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); 8508#endif 8509 8510 8511 lhs_offset += 1; 8512 rhs_offset += rhs_stride_y; 8513 } 8514 8515 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); 8516 8517 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); 8518 8519#if defined(REINTERPRET_OUTPUT_AS_3D) 8520 8521 CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 8522 8523 8524 8525 dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 8526 8527#else 8528 8529 8530 dst_addr += z * dst_stride_z; 8531 8532#endif 8533 const bool cond_y = y == 0; 8534 const bool cond_x = ((x + 1) * N0 >= N); 8535 8536 8537 REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); 8538 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 8539} 8540#endif 8541 8542#if defined(GEMMLOWP_MATRIX_A_REDUCTION) 8543 8544__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), 8545 IMAGE_DECLARATION(dst)) 8546{ 8547 8548 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); 8549 Image dst = CONVERT_TO_IMAGE_STRUCT(dst); 8550 8551 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) 8552 sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; 8553 ACC_DATA_TYPE sum_row = 0; 8554 8555 __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); 8556 8557 int i = 0; 8558 8559 8560 for(; i <= ((int)COLS_A - 16); i += 16) 8561 { 8562 const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); 8563 8564 sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF, 8565 VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); 8566 } 8567 8568 8569 for(; i < COLS_A; ++i) 8570 { 8571 sum_row += (ACC_DATA_TYPE)matrix_a[i]; 8572 } 8573 8574 sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; 8575 8576#if defined(SCALAR) 8577 sum_row *= (int)SCALAR; 8578#endif 8579 *((__global int *)dst.ptr) = (int)sum_row; 8580} 8581#endif 8582 8583#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8) 8584 8585__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), 8586 IMAGE_DECLARATION(dst)) 8587{ 8588 8589 Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); 8590 Image dst = CONVERT_TO_IMAGE_STRUCT(dst); 8591 8592 ACC_DATA_TYPE sum_row = 0; 8593 8594 __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); 8595 8596 int i = 0; 8597 8598 8599 for(; i <= ((int)COLS_A - 32); i += 32) 8600 { 8601 VEC_DATA_TYPE(DATA_TYPE, 16) 8602 a0 = vload16(0, matrix_a + i); 8603 8604 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8605 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8606 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8607 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8608 8609 a0 = vload16(1, matrix_a + i); 8610 8611 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8612 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8613 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8614 DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8615 } 8616 8617 8618 for(; i < COLS_A; ++i) 8619 { 8620 sum_row += (ACC_DATA_TYPE)matrix_a[i]; 8621 } 8622 8623#if defined(SCALAR) 8624 sum_row *= (int)SCALAR; 8625#endif 8626 *((__global int *)dst.ptr) = (int)sum_row; 8627} 8628#endif 8629 8630#if defined(GEMMLOWP_MATRIX_B_REDUCTION) 8631 8632__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), 8633 IMAGE_DECLARATION(dst)) 8634{ 8635 8636 const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8637 const uint y = get_global_id(1); 8638 8639 __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z); 8640 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y; 8641 8642 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) 8643 sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0; 8644 8645 int i = 0; 8646 8647 for(; i <= ((int)ROWS_B - 4); i += 4) 8648 { 8649 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8650 b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y); 8651 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8652 b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y); 8653 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8654 b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y); 8655 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8656 b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y); 8657 8658 sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, 8659 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); 8660 8661 matrix_b += 4 * src_stride_y; 8662 } 8663 8664 8665 for(; i < (int)ROWS_B; ++i) 8666 { 8667 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8668 b0 = VLOAD(VEC_SIZE)(0, matrix_b); 8669 8670 sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); 8671 8672 matrix_b += src_stride_y; 8673 } 8674 8675#if defined(SCALAR) 8676 sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR; 8677#endif 8678 VEC_DATA_TYPE(int, VEC_SIZE) 8679 res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE)); 8680 8681 STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8682} 8683#endif 8684 8685#endif 8686 8687#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 8688 8689#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) 8690 8691 8692inline VEC_INT offset_contribution( 8693 int x, 8694 int y, 8695 int z 8696#if defined(A_OFFSET) 8697 , 8698 IMAGE_DECLARATION(sum_col) 8699#endif 8700#if defined(B_OFFSET) 8701 , 8702 IMAGE_DECLARATION(sum_row) 8703#endif 8704#if defined(ADD_BIAS) 8705 , 8706 VECTOR_DECLARATION(biases) 8707#endif 8708) 8709{ 8710 VEC_INT a_offset_s32 = (VEC_INT)0; 8711 VEC_INT b_offset_s32 = (VEC_INT)0; 8712 8713 int batch_id = z; 8714#if defined(DEPTH_INPUT3D) 8715 batch_id /= (int)DEPTH_INPUT3D; 8716#endif 8717 8718#if defined(A_OFFSET) 8719 8720 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); 8721 8722 8723#if defined(SUM_COL_HAS_BATCHES) 8724 a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); 8725#else 8726 a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr); 8727#endif 8728 8729 a_offset_s32 *= (VEC_INT)A_OFFSET; 8730#endif 8731 8732#if defined(B_OFFSET) 8733 8734 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); 8735 8736 8737#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) 8738 b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); 8739#else 8740 b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); 8741#endif 8742 b_offset_s32 *= (VEC_INT)B_OFFSET; 8743#endif 8744 8745#if defined(ADD_BIAS) 8746 8747 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 8748 8749 VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 8750 b_offset_s32 += (VEC_INT)biases_values; 8751#endif 8752 8753 return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32; 8754} 8755 8756#if defined(GEMMLOWP_OFFSET_CONTRIBUTION) 8757 8758__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) 8759#if defined(A_OFFSET) 8760 , 8761 IMAGE_DECLARATION(sum_col) 8762#endif 8763#if defined(B_OFFSET) 8764 , 8765 IMAGE_DECLARATION(sum_row) 8766#endif 8767#if defined(ADD_BIAS) 8768 , 8769 VECTOR_DECLARATION(biases) 8770#endif 8771 ) 8772{ 8773 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8774 const int y = get_global_id(1); 8775 const int z = get_global_id(2); 8776 8777 8778 VEC_INT offset_term_s32 = offset_contribution( 8779 x, y, z 8780#if defined(A_OFFSET) 8781 , 8782 sum_col_ptr, 8783 sum_col_stride_x, 8784 sum_col_step_x, 8785 sum_col_stride_y, 8786 sum_col_step_y, 8787 sum_col_offset_first_element_in_bytes 8788#endif 8789#if defined(B_OFFSET) 8790 , 8791 sum_row_ptr, 8792 sum_row_stride_x, 8793 sum_row_step_x, 8794 sum_row_stride_y, 8795 sum_row_step_y, 8796 sum_row_offset_first_element_in_bytes 8797#endif 8798#if defined(ADD_BIAS) 8799 , 8800 biases_ptr, 8801 biases_stride_x, 8802 biases_step_x, 8803 biases_offset_first_element_in_bytes 8804#endif 8805 ); 8806 8807 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8808 8809 VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8810 8811 8812 in_s32_0 += offset_term_s32; 8813 8814 8815 STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8816} 8817#endif 8818 8819#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN) 8820 8821__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) 8822#if defined(A_OFFSET) 8823 , 8824 IMAGE_DECLARATION(sum_col) 8825#endif 8826#if defined(B_OFFSET) 8827 , 8828 IMAGE_DECLARATION(sum_row) 8829#endif 8830 , 8831#if defined(ADD_BIAS) 8832 VECTOR_DECLARATION(biases), 8833#endif 8834 TENSOR3D_DECLARATION(dst) 8835#if defined(PER_CHANNEL_QUANTIZATION) 8836 , 8837 VECTOR_DECLARATION(result_multipliers), 8838 VECTOR_DECLARATION(result_shifts) 8839#endif 8840 ) 8841{ 8842 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8843 const int y = get_global_id(1); 8844 const int z = get_global_id(2); 8845 8846 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 8847 8848 8849 VEC_INT offset_term_s32 = offset_contribution( 8850 x, y, z 8851#if defined(A_OFFSET) 8852 , 8853 sum_col_ptr, 8854 sum_col_stride_x, 8855 sum_col_step_x, 8856 sum_col_stride_y, 8857 sum_col_step_y, 8858 sum_col_offset_first_element_in_bytes 8859#endif 8860#if defined(B_OFFSET) 8861 , 8862 sum_row_ptr, 8863 sum_row_stride_x, 8864 sum_row_step_x, 8865 sum_row_stride_y, 8866 sum_row_step_y, 8867 sum_row_offset_first_element_in_bytes 8868#endif 8869#if defined(ADD_BIAS) 8870 , 8871 biases_ptr, 8872 biases_stride_x, 8873 biases_step_x, 8874 biases_offset_first_element_in_bytes 8875#endif 8876 ); 8877 8878 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8879 8880 VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8881 8882 8883 in_s32 += offset_term_s32; 8884 8885 8886 8887 8888 in_s32 += (VEC_INT)RESULT_OFFSET; 8889 8890 8891#if defined(PER_CHANNEL_QUANTIZATION) 8892 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); 8893 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); 8894 VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); 8895 VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); 8896 8897 in_s32 *= result_multipliers_values; 8898 in_s32 >>= result_shifts_values; 8899#else 8900 in_s32 *= RESULT_MULTIPLIER; 8901 8902 in_s32 >>= RESULT_SHIFT; 8903#endif 8904 8905 VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 8906 res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 8907 8908#if defined(MIN_BOUND) 8909 res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 8910#endif 8911#if defined(MAX_BOUND) 8912 res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 8913#endif 8914 8915 8916 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8917} 8918#endif 8919 8920#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT) 8921 8922__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) 8923#if defined(A_OFFSET) 8924 , 8925 IMAGE_DECLARATION(sum_col) 8926#endif 8927#if defined(B_OFFSET) 8928 , 8929 IMAGE_DECLARATION(sum_row) 8930#endif 8931 , 8932#if defined(ADD_BIAS) 8933 VECTOR_DECLARATION(biases), 8934#endif 8935 TENSOR3D_DECLARATION(dst) 8936#if defined(PER_CHANNEL_QUANTIZATION) 8937 , 8938 VECTOR_DECLARATION(result_multipliers), 8939 VECTOR_DECLARATION(result_shifts) 8940#endif 8941 ) 8942{ 8943 const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8944 const int y = get_global_id(1); 8945 const int z = get_global_id(2); 8946 8947 8948 VEC_INT offset_term_s32 = offset_contribution( 8949 x, y, z 8950#if defined(A_OFFSET) 8951 , 8952 sum_col_ptr, 8953 sum_col_stride_x, 8954 sum_col_step_x, 8955 sum_col_stride_y, 8956 sum_col_step_y, 8957 sum_col_offset_first_element_in_bytes 8958#endif 8959#if defined(B_OFFSET) 8960 , 8961 sum_row_ptr, 8962 sum_row_stride_x, 8963 sum_row_step_x, 8964 sum_row_stride_y, 8965 sum_row_step_y, 8966 sum_row_offset_first_element_in_bytes 8967#endif 8968#if defined(ADD_BIAS) 8969 , 8970 biases_ptr, 8971 biases_stride_x, 8972 biases_step_x, 8973 biases_offset_first_element_in_bytes 8974#endif 8975 ); 8976 8977 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8978 8979 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 8980 8981 VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8982 8983 8984 in_s32 += offset_term_s32; 8985 8986 8987 8988 8989#if defined(PER_CHANNEL_QUANTIZATION) 8990 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); 8991 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); 8992 VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); 8993 VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); 8994 8995 VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); 8996 VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); 8997 in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); 8998#else 8999 9000#if RESULT_SHIFT < 0 9001 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9002#else 9003 in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9004#endif 9005 9006#endif 9007 9008 9009 in_s32 += (VEC_INT)RESULT_OFFSET; 9010 9011 VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9012 res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9013 9014#if defined(MIN_BOUND) 9015 res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 9016#endif 9017#if defined(MAX_BOUND) 9018 res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 9019#endif 9020 9021 9022 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9023} 9024#endif 9025 9026#undef VEC_INT 9027 9028#endif 9029 9030#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN) 9031 9032__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), 9033#if defined(ADD_BIAS) 9034 VECTOR_DECLARATION(biases), 9035#endif 9036 TENSOR3D_DECLARATION(dst)) 9037{ 9038 9039 int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 9040 int y = get_global_id(1); 9041 int z = get_global_id(2); 9042 9043 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; 9044 9045 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 9046 9047 VEC_DATA_TYPE(int, VEC_SIZE) 9048 input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); 9049 9050#if defined(ADD_BIAS) 9051 9052 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 9053 9054 VEC_DATA_TYPE(int, VEC_SIZE) 9055 biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 9056 input_values += biases_values; 9057#endif 9058 9059 9060 input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET; 9061 9062 9063 input_values *= RESULT_MULT_INT; 9064 9065#if RESULT_SHIFT < 0 9066 input_values >>= -RESULT_SHIFT; 9067#else 9068 input_values >>= RESULT_SHIFT; 9069#endif 9070 9071 VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9072 res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9073 9074#if defined(MIN_BOUND) 9075 res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 9076#endif 9077#if defined(MAX_BOUND) 9078 res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 9079#endif 9080 9081 9082 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9083} 9084#endif 9085 9086#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT) 9087 9088__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src), 9089#if defined(ADD_BIAS) 9090 VECTOR_DECLARATION(biases), 9091#endif 9092 TENSOR3D_DECLARATION(dst)) 9093{ 9094 9095 int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 9096 int y = get_global_id(1); 9097 int z = get_global_id(2); 9098 9099 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; 9100 9101 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 9102 9103 VEC_DATA_TYPE(int, VEC_SIZE) 9104 input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); 9105 9106#if defined(ADD_BIAS) 9107 9108 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 9109 9110 VEC_DATA_TYPE(int, VEC_SIZE) 9111 biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 9112 input_values += biases_values; 9113#endif 9114 9115 9116#if RESULT_SHIFT < 0 9117 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9118#else 9119 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9120#endif 9121 9122 9123 input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT; 9124 9125 VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9126 res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9127 9128#if defined(MIN_BOUND) 9129 res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 9130#endif 9131#if defined(MAX_BOUND) 9132 res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 9133#endif 9134 9135 9136 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9137} 9138#endif 9139 9140#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16) 9141 9142__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src), 9143#if defined(ADD_BIAS) 9144 VECTOR_DECLARATION(biases), 9145#endif 9146 TENSOR3D_DECLARATION(dst)) 9147{ 9148 9149 int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 9150 int y = get_global_id(1); 9151 int z = get_global_id(2); 9152 9153 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; 9154 9155 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z; 9156 9157 VEC_DATA_TYPE(int, VEC_SIZE) 9158 input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); 9159 9160#if defined(ADD_BIAS) 9161 9162 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 9163 9164 VEC_DATA_TYPE(int, VEC_SIZE) 9165 biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 9166 input_values += biases_values; 9167#endif 9168 9169 9170#if RESULT_SHIFT < 0 9171 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9172#else 9173 input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9174#endif 9175 9176 VEC_DATA_TYPE(short, VEC_SIZE) 9177 res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE)); 9178 9179#if defined(MIN_BOUND) 9180 res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND); 9181#endif 9182#if defined(MAX_BOUND) 9183 res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND); 9184#endif 9185 9186 9187 STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9188} 9189#endif 9190 9191#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT) 9192 9193__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src), 9194#if defined(ADD_BIAS) 9195 VECTOR_DECLARATION(biases), 9196#endif 9197#if defined(DST_HEIGHT) 9198 TENSOR4D_DECLARATION(dst)) 9199#else 9200 TENSOR3D_DECLARATION(dst)) 9201#endif 9202{ 9203 9204 int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 9205 int y = get_global_id(1); 9206 int z = get_global_id(2); 9207 9208 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; 9209 9210 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 9211 9212 VEC_DATA_TYPE(int, VEC_SIZE) 9213 input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); 9214 9215#if defined(ADD_BIAS) 9216 9217 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 9218 9219 VEC_DATA_TYPE(int, VEC_SIZE) 9220 biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 9221 input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values; 9222#endif 9223 9224 9225 VEC_DATA_TYPE(float, VEC_SIZE) 9226 input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE)); 9227 input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); 9228 9229 VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9230 res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9231 9232#if defined(MIN_BOUND) 9233 res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 9234#endif 9235#if defined(MAX_BOUND) 9236 res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 9237#endif 9238 9239 9240 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9241} 9242#endif )"