1R"( 2 3 4 5 6#ifndef ARM_COMPUTE_HELPER_H 7#define ARM_COMPUTE_HELPER_H 8 9 10 11 /* STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z): recursively emits n VSTORE(N0) calls storing rows BASENAME##0..BASENAME##(n-1); row r goes to PTR + r * STRIDE_Y plus the per-row offset Z##r (row suffixes use hex digits A..F for rows 10..15). */ 12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13 VSTORE(N0) \ 14 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15 16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18 VSTORE(N0) \ 19 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20 21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23 VSTORE(N0) \ 24 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25 26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28 VSTORE(N0) \ 29 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30 31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33 VSTORE(N0) \ 34 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35 36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38 VSTORE(N0) \ 39 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40 41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43 VSTORE(N0) \ 44 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45 46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48 VSTORE(N0) \ 49 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50 51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53 VSTORE(N0) \ 54 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55 56#define 
STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58 VSTORE(N0) \ 59 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60 61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63 VSTORE(N0) \ 64 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65 66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68 VSTORE(N0) \ 69 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 70 71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73 VSTORE(N0) \ 74 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75 76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78 VSTORE(N0) \ 79 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80 81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83 VSTORE(N0) \ 84 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85 86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88 VSTORE(N0) \ 89 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90 91 92 /* CONVERT_STORE_ROW_n: same recursion as STORE_ROW_n, but each row is passed through CONVERT_SAT to DATA_TYPE before being stored. */ 93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94 VSTORE(N0) \ 95 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96 97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99 VSTORE(N0) \ 100 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE 
*)(PTR + 1 * STRIDE_Y + Z##1)); 101 102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104 VSTORE(N0) \ 105 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106 107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109 VSTORE(N0) \ 110 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111 112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 VSTORE(N0) \ 115 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116 117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 VSTORE(N0) \ 120 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121 122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 VSTORE(N0) \ 125 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126 127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 128 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 VSTORE(N0) \ 130 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 131 132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 133 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 VSTORE(N0) \ 135 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 
* STRIDE_Y + Z##8)); 136 /* FIX: CONVERT_STORE_ROW_10's second parameter was named DATA while the body references DATA_TYPE; the mismatch left DATA_TYPE unsubstituted, breaking every CONVERT_STORE_BLOCK expansion with 10 or more rows. Renamed to DATA_TYPE to match all sibling CONVERT_STORE_ROW_n macros. */ 137#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 138 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 VSTORE(N0) \ 140 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 141 142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 143 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 144 VSTORE(N0) \ 145 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 146 147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 148 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 149 VSTORE(N0) \ 150 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 151 152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 153 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 154 VSTORE(N0) \ 155 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 156 157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 158 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 159 VSTORE(N0) \ 160 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 161 162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 163 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 164 VSTORE(N0) \ 165 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 166 167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 168 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 169 VSTORE(N0) \ 170 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 
15 * STRIDE_Y + Z##F)); 171 172 173 174 /* STORE_BLOCK(M0, N0, ...): two-level expansion (via *_STR) so M0 is macro-expanded before being token-pasted into STORE_ROW_##M0. */ 175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 177 178 179 180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 182 183 184 /* STORE_ROW_PARTIAL_n: like STORE_ROW_n but uses VSTORE_PARTIAL(N0, STORE_N0) so only the first STORE_N0 of the N0 lanes of each row are written. */ 185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 186 VSTORE_PARTIAL(N0, STORE_N0) \ 187 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 188 189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 190 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 191 VSTORE_PARTIAL(N0, STORE_N0) \ 192 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 193 194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 195 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 196 VSTORE_PARTIAL(N0, STORE_N0) \ 197 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 198 199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 200 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 201 VSTORE_PARTIAL(N0, STORE_N0) \ 202 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 203 204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 206 VSTORE_PARTIAL(N0, STORE_N0) \ 207 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 208 209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \ 210 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 211 VSTORE_PARTIAL(N0, STORE_N0) \ 212 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 213 214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 216 VSTORE_PARTIAL(N0, STORE_N0) \ 217 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 218 219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 221 VSTORE_PARTIAL(N0, STORE_N0) \ 222 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 223 224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 226 VSTORE_PARTIAL(N0, STORE_N0) \ 227 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 228 229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 231 VSTORE_PARTIAL(N0, STORE_N0) \ 232 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 233 234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 235 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 236 VSTORE_PARTIAL(N0, STORE_N0) \ 237 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 238 239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 240 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 241 VSTORE_PARTIAL(N0, STORE_N0) \ 242 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 243 244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 245 STORE_ROW_PARTIAL_12(N0, 
STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 246 VSTORE_PARTIAL(N0, STORE_N0) \ 247 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 248 249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 250 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 251 VSTORE_PARTIAL(N0, STORE_N0) \ 252 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 253 254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 255 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 256 VSTORE_PARTIAL(N0, STORE_N0) \ 257 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 258 259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 260 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 261 VSTORE_PARTIAL(N0, STORE_N0) \ 262 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 263 264 265 266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 268 /* Runtime boundary handling: pick full, partial-rows, partial-columns, or partial-both stores based on the PARTIAL_COND_Y / PARTIAL_COND_X conditions. */ 269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 270 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 271 { \ 272 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 273 } \ 274 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 275 { \ 276 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 277 } \ 278 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 279 { \ 280 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 281 } 
\ 282 else \ 283 { \ 284 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 285 } 286 287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 288 if(!(PARTIAL_COND_X)) \ 289 { \ 290 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 291 } \ 292 else \ 293 { \ 294 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 295 } 296 297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 298 if(!(PARTIAL_COND_Y)) \ 299 { \ 300 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 301 } \ 302 else \ 303 { \ 304 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 305 } 306 307 /* STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest variant based on whether PARTIAL_STORE_M0 / PARTIAL_STORE_N0 are zero. */ 308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 309 310 311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 312 313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 314 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 315 316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 317 318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 319 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 320 321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 322 323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 324 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 325 326#else 327 328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, 
PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 329 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 330 331#endif 332 333#endif 334 335 336#if defined(PARTIAL_STORE_M0) 337 /* COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0): start row for block y; when PARTIAL_STORE_M0 is defined, shifts y * M0 back by (M0 - PARTIAL_STORE_M0) % M0, clamped at 0; otherwise it is plain y * M0. */ 338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 339 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 340#else 341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 342 ((uint)(y * M0)) 343#endif 344 345 346 /* STORE_VECTOR_SELECT: single-row (M0 = 1) partial-in-X store with zero stride/offset. */ 347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 348 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 349 350 351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 352#pragma OPENCL EXTENSION cl_khr_fp16 : enable 353#endif 354 355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 357#endif 358 359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 361#endif 362 363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 364#pragma OPENCL EXTENSION cl_arm_printf : enable 365#endif 366 367#define GPU_ARCH_MIDGARD 0x100 368#define GPU_ARCH_BIFROST 0x200 369#define GPU_ARCH_VALHALL 0x300 370 371 372#define CONCAT(a, b) a##b 373 374 375#define EXPAND(x) x 376 377 378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 379 380 /* REVn(x): reverse the lanes of an n-lane vector via component swizzles. */ 381#define REV1(x) ((x)) 382#define REV2(x) ((x).s10) 383#define REV3(x) ((x).s210) 384#define REV4(x) ((x).s3210) 385#define REV8(x) ((x).s76543210) 386#define REV16(x) ((x).sFEDCBA9876543210) 387 388 389 390#define REVERSE_STR(x, s) REV##s((x)) 391#define REVERSE(x, s) REVERSE_STR(x, s) 392 393 394 /* ROTs_n(x): rotate an s-lane vector right by n lanes via swizzles; _0 and _s (full rotation) are identity. */ 395#define ROT1_0(x) ((x)) 396#define ROT1_1(x) ((x)) 397 398#define ROT2_0(x) ((x)) 399#define 
ROT2_1(x) ((x).s10) 400#define ROT2_2(x) ((x)) 401 402#define ROT3_0(x) ((x)) 403#define ROT3_1(x) ((x).s201) 404#define ROT3_2(x) ((x).s120) 405#define ROT3_3(x) ((x)) 406 407#define ROT4_0(x) ((x)) 408#define ROT4_1(x) ((x).s3012) 409#define ROT4_2(x) ((x).s2301) 410#define ROT4_3(x) ((x).s1230) 411#define ROT4_4(x) ((x)) 412 413#define ROT8_0(x) ((x)) 414#define ROT8_1(x) ((x).s70123456) 415#define ROT8_2(x) ((x).s67012345) 416#define ROT8_3(x) ((x).s56701234) 417#define ROT8_4(x) ((x).s45670123) 418#define ROT8_5(x) ((x).s34567012) 419#define ROT8_6(x) ((x).s23456701) 420#define ROT8_7(x) ((x).s12345670) 421#define ROT8_8(x) ((x)) 422 423#define ROT16_0(x) ((x)) 424#define ROT16_1(x) ((x).sF0123456789ABCDE) 425#define ROT16_2(x) ((x).sEF0123456789ABCD) 426#define ROT16_3(x) ((x).sDEF0123456789ABC) 427#define ROT16_4(x) ((x).sCDEF0123456789AB) 428#define ROT16_5(x) ((x).sBCDEF0123456789A) 429#define ROT16_6(x) ((x).sABCDEF0123456789) 430#define ROT16_7(x) ((x).s9ABCDEF012345678) 431#define ROT16_8(x) ((x).s89ABCDEF01234567) 432#define ROT16_9(x) ((x).s789ABCDEF0123456) 433#define ROT16_10(x) ((x).s6789ABCDEF012345) 434#define ROT16_11(x) ((x).s56789ABCDEF01234) 435#define ROT16_12(x) ((x).s456789ABCDEF0123) 436#define ROT16_13(x) ((x).s3456789ABCDEF012) 437#define ROT16_14(x) ((x).s23456789ABCDEF01) 438#define ROT16_15(x) ((x).s123456789ABCDEF0) 439#define ROT16_16(x) ((x)) 440 441 442 443#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 444#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 445 446 447 /* V_OFFSn(dt): literal vector (0, 1, ..., n-1) with element type dt. */ 448#define V_OFFS1(dt) (dt##1)(0) 449#define V_OFFS2(dt) (dt##2)(0, 1) 450#define V_OFFS3(dt) (dt##3)(0, 1, 2) 451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 454 455 456 457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 459 460 /* VLOAD(size): expands size then pastes into the vload##size built-in. */ 461#define VLOAD_STR(size) vload##size 462#define VLOAD(size) 
VLOAD_STR(size) 463 464 /* VLOAD_PARTIAL(size, load_size) → vload_partial_size_load_size; the dispatch tables below map invalid combinations (load_size of 0, or greater than size) to NO_LOAD, an empty statement. */ 465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 467 468#define NO_LOAD(data, offs, ptr) \ 469 { \ 470 } 471 472 473#define vload_partial_1_0 NO_LOAD 474#define vload_partial_1_1 vload1 475#define vload_partial_1_2 NO_LOAD 476#define vload_partial_1_3 NO_LOAD 477#define vload_partial_1_4 NO_LOAD 478#define vload_partial_1_5 NO_LOAD 479#define vload_partial_1_6 NO_LOAD 480#define vload_partial_1_7 NO_LOAD 481#define vload_partial_1_8 NO_LOAD 482#define vload_partial_1_9 NO_LOAD 483#define vload_partial_1_10 NO_LOAD 484#define vload_partial_1_11 NO_LOAD 485#define vload_partial_1_12 NO_LOAD 486#define vload_partial_1_13 NO_LOAD 487#define vload_partial_1_14 NO_LOAD 488#define vload_partial_1_15 NO_LOAD 489#define vload_partial_1_16 NO_LOAD 490 491#define vload_partial_2_0 NO_LOAD 492#define vload_partial_2_1 vload_partial_1 493#define vload_partial_2_2 vload_partial_2 494#define vload_partial_2_3 NO_LOAD 495#define vload_partial_2_4 NO_LOAD 496#define vload_partial_2_5 NO_LOAD 497#define vload_partial_2_6 NO_LOAD 498#define vload_partial_2_7 NO_LOAD 499#define vload_partial_2_8 NO_LOAD 500#define vload_partial_2_9 NO_LOAD 501#define vload_partial_2_10 NO_LOAD 502#define vload_partial_2_11 NO_LOAD 503#define vload_partial_2_12 NO_LOAD 504#define vload_partial_2_13 NO_LOAD 505#define vload_partial_2_14 NO_LOAD 506#define vload_partial_2_15 NO_LOAD 507#define vload_partial_2_16 NO_LOAD 508 509#define vload_partial_3_0 NO_LOAD 510#define vload_partial_3_1 vload_partial_1 511#define vload_partial_3_2 vload_partial_2 512#define vload_partial_3_3 vload_partial_3 513#define vload_partial_3_4 NO_LOAD 514#define vload_partial_3_5 NO_LOAD 515#define vload_partial_3_6 NO_LOAD 516#define vload_partial_3_7 NO_LOAD 517#define vload_partial_3_8 NO_LOAD 518#define vload_partial_3_9 NO_LOAD 519#define vload_partial_3_10 NO_LOAD 520#define 
vload_partial_3_11 NO_LOAD 521#define vload_partial_3_12 NO_LOAD 522#define vload_partial_3_13 NO_LOAD 523#define vload_partial_3_14 NO_LOAD 524#define vload_partial_3_15 NO_LOAD 525#define vload_partial_3_16 NO_LOAD 526 527#define vload_partial_4_0 NO_LOAD 528#define vload_partial_4_1 vload_partial_1 529#define vload_partial_4_2 vload_partial_2 530#define vload_partial_4_3 vload_partial_3 531#define vload_partial_4_4 vload_partial_4 532#define vload_partial_4_5 NO_LOAD 533#define vload_partial_4_6 NO_LOAD 534#define vload_partial_4_7 NO_LOAD 535#define vload_partial_4_8 NO_LOAD 536#define vload_partial_4_9 NO_LOAD 537#define vload_partial_4_10 NO_LOAD 538#define vload_partial_4_11 NO_LOAD 539#define vload_partial_4_12 NO_LOAD 540#define vload_partial_4_13 NO_LOAD 541#define vload_partial_4_14 NO_LOAD 542#define vload_partial_4_15 NO_LOAD 543#define vload_partial_4_16 NO_LOAD 544 545#define vload_partial_8_0 NO_LOAD 546#define vload_partial_8_1 vload_partial_1 547#define vload_partial_8_2 vload_partial_2 548#define vload_partial_8_3 vload_partial_3 549#define vload_partial_8_4 vload_partial_4 550#define vload_partial_8_5 vload_partial_5 551#define vload_partial_8_6 vload_partial_6 552#define vload_partial_8_7 vload_partial_7 553#define vload_partial_8_8 vload_partial_8 554#define vload_partial_8_9 NO_LOAD 555#define vload_partial_8_10 NO_LOAD 556#define vload_partial_8_11 NO_LOAD 557#define vload_partial_8_12 NO_LOAD 558#define vload_partial_8_13 NO_LOAD 559#define vload_partial_8_14 NO_LOAD 560#define vload_partial_8_15 NO_LOAD 561#define vload_partial_8_16 NO_LOAD 562 563#define vload_partial_16_0 NO_LOAD 564#define vload_partial_16_1 vload_partial_1 565#define vload_partial_16_2 vload_partial_2 566#define vload_partial_16_3 vload_partial_3 567#define vload_partial_16_4 vload_partial_4 568#define vload_partial_16_5 vload_partial_5 569#define vload_partial_16_6 vload_partial_6 570#define vload_partial_16_7 vload_partial_7 571#define vload_partial_16_8 
vload_partial_8 572#define vload_partial_16_9 vload_partial_9 573#define vload_partial_16_10 vload_partial_10 574#define vload_partial_16_11 vload_partial_11 575#define vload_partial_16_12 vload_partial_12 576#define vload_partial_16_13 vload_partial_13 577#define vload_partial_16_14 vload_partial_14 578#define vload_partial_16_15 vload_partial_15 579#define vload_partial_16_16 vload_partial_16 580 581 /* vload_partial_k(DATA, OFFSET, PTR): fill the low k lanes of DATA, composed from the built-in vload1/2/3/4/8/16 (sizes 5..7 and 9..15 split into a power-of-two load plus a remainder load at PTR + 4 or PTR + 8). */ 582#define vload_partial_1(DATA, OFFSET, PTR) \ 583 DATA.s0 = vload1(OFFSET, PTR); 584 585#define vload_partial_2(DATA, OFFSET, PTR) \ 586 DATA.s01 = vload2(OFFSET, PTR); 587 588#define vload_partial_3(DATA, OFFSET, PTR) \ 589 DATA.s012 = vload3(OFFSET, PTR); 590 591#define vload_partial_4(DATA, OFFSET, PTR) \ 592 DATA.s0123 = vload4(OFFSET, PTR); 593 594#define vload_partial_5(DATA, OFFSET, PTR) \ 595 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 596 DATA.s4 = vload1(OFFSET, PTR + 4); 597 598#define vload_partial_6(DATA, OFFSET, PTR) \ 599 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 600 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 601 602#define vload_partial_7(DATA, OFFSET, PTR) \ 603 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 604 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 605 606#define vload_partial_8(DATA, OFFSET, PTR) \ 607 DATA.s01234567 = vload8(OFFSET, PTR); 608 609#define vload_partial_9(DATA, OFFSET, PTR) \ 610 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 611 DATA.s8 = vload1(OFFSET, PTR + 8); 612 613#define vload_partial_10(DATA, OFFSET, PTR) \ 614 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 615 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 616 617#define vload_partial_11(DATA, OFFSET, PTR) \ 618 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 619 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 620 621#define vload_partial_12(DATA, OFFSET, PTR) \ 622 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 623 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 624 625#define vload_partial_13(DATA, OFFSET, PTR) \ 626 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 627 
vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 628 629#define vload_partial_14(DATA, OFFSET, PTR) \ 630 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 631 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 632 633#define vload_partial_15(DATA, OFFSET, PTR) \ 634 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 635 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 636 637#define vload_partial_16(DATA, OFFSET, PTR) \ 638 DATA = vload16(OFFSET, PTR); 639 640 641 /* PIXEL_UNITn: number of RGBA texels covered by an n-lane vector (4 lanes per texel: 4 -> 1, 8 -> 2, 16 -> 4). */ 642#define PIXEL_UNIT4 1 643#define PIXEL_UNIT8 2 644#define PIXEL_UNIT16 4 645 646 647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 649 650 /* read_image2d_<type>xN: gather N horizontally adjacent texels starting at (x_coord, y_coord) into one 4N-lane vector. */ 651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 654 655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 659#endif 660 661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 662#define 
write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 664 665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 669#endif 670 671 /* READ_IMAGE2D / WRITE_IMAGE2D: dispatch to the read_/write_image2d_<data_type>x<n0> helpers above via token pasting. */ 672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 674 675 676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 678 /* VSTORE(size) → vstore##size; the scalar (size 1) forms and 1-lane type aliases are defined below. */ 679#define VSTORE_STR(size) vstore##size 680#define VSTORE(size) VSTORE_STR(size) 681 682#define float1 float 683#define half1 half 684#define char1 char 685#define uchar1 uchar 686#define short1 short 687#define ushort1 ushort 688#define int1 int 689#define uint1 uint 
690#define long1 long 691#define ulong1 ulong 692#define double1 double 693 /* Scalar load/store: vload1/vstore1 emulate the missing 1-lane vload/vstore built-ins with plain pointer dereference. */ 694#define vload1(OFFSET, PTR) *(OFFSET + PTR) 695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 696 697 /* VSTORE_PARTIAL(size, store_size) → vstore_partial_size_store_size; invalid combinations map to NO_STORE, an empty statement. */ 698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 700 701#define NO_STORE(data, offs, ptr) \ 702 { \ 703 } 704 705 706#define vstore_partial_1_0 NO_STORE 707#define vstore_partial_1_1 vstore1 708#define vstore_partial_1_2 NO_STORE 709#define vstore_partial_1_3 NO_STORE 710#define vstore_partial_1_4 NO_STORE 711#define vstore_partial_1_5 NO_STORE 712#define vstore_partial_1_6 NO_STORE 713#define vstore_partial_1_7 NO_STORE 714#define vstore_partial_1_8 NO_STORE 715#define vstore_partial_1_9 NO_STORE 716#define vstore_partial_1_10 NO_STORE 717#define vstore_partial_1_11 NO_STORE 718#define vstore_partial_1_12 NO_STORE 719#define vstore_partial_1_13 NO_STORE 720#define vstore_partial_1_14 NO_STORE 721#define vstore_partial_1_15 NO_STORE 722#define vstore_partial_1_16 NO_STORE 723 724#define vstore_partial_2_0 NO_STORE 725#define vstore_partial_2_1 vstore_partial_1 726#define vstore_partial_2_2 vstore_partial_2 727#define vstore_partial_2_3 NO_STORE 728#define vstore_partial_2_4 NO_STORE 729#define vstore_partial_2_5 NO_STORE 730#define vstore_partial_2_6 NO_STORE 731#define vstore_partial_2_7 NO_STORE 732#define vstore_partial_2_8 NO_STORE 733#define vstore_partial_2_9 NO_STORE 734#define vstore_partial_2_10 NO_STORE 735#define vstore_partial_2_11 NO_STORE 736#define vstore_partial_2_12 NO_STORE 737#define vstore_partial_2_13 NO_STORE 738#define vstore_partial_2_14 NO_STORE 739#define vstore_partial_2_15 NO_STORE 740#define vstore_partial_2_16 NO_STORE 741 742#define vstore_partial_3_0 NO_STORE 743#define vstore_partial_3_1 vstore_partial_1 744#define vstore_partial_3_2 vstore_partial_2 745#define vstore_partial_3_3 vstore_partial_3 746#define vstore_partial_3_4 
NO_STORE 747#define vstore_partial_3_5 NO_STORE 748#define vstore_partial_3_6 NO_STORE 749#define vstore_partial_3_7 NO_STORE 750#define vstore_partial_3_8 NO_STORE 751#define vstore_partial_3_9 NO_STORE 752#define vstore_partial_3_10 NO_STORE 753#define vstore_partial_3_11 NO_STORE 754#define vstore_partial_3_12 NO_STORE 755#define vstore_partial_3_13 NO_STORE 756#define vstore_partial_3_14 NO_STORE 757#define vstore_partial_3_15 NO_STORE 758#define vstore_partial_3_16 NO_STORE 759 760#define vstore_partial_4_0 NO_STORE 761#define vstore_partial_4_1 vstore_partial_1 762#define vstore_partial_4_2 vstore_partial_2 763#define vstore_partial_4_3 vstore_partial_3 764#define vstore_partial_4_4 vstore_partial_4 765#define vstore_partial_4_5 NO_STORE 766#define vstore_partial_4_6 NO_STORE 767#define vstore_partial_4_7 NO_STORE 768#define vstore_partial_4_8 NO_STORE 769#define vstore_partial_4_9 NO_STORE 770#define vstore_partial_4_10 NO_STORE 771#define vstore_partial_4_11 NO_STORE 772#define vstore_partial_4_12 NO_STORE 773#define vstore_partial_4_13 NO_STORE 774#define vstore_partial_4_14 NO_STORE 775#define vstore_partial_4_15 NO_STORE 776#define vstore_partial_4_16 NO_STORE 777 778#define vstore_partial_8_0 NO_STORE 779#define vstore_partial_8_1 vstore_partial_1 780#define vstore_partial_8_2 vstore_partial_2 781#define vstore_partial_8_3 vstore_partial_3 782#define vstore_partial_8_4 vstore_partial_4 783#define vstore_partial_8_5 vstore_partial_5 784#define vstore_partial_8_6 vstore_partial_6 785#define vstore_partial_8_7 vstore_partial_7 786#define vstore_partial_8_8 vstore_partial_8 787#define vstore_partial_8_9 NO_STORE 788#define vstore_partial_8_10 NO_STORE 789#define vstore_partial_8_11 NO_STORE 790#define vstore_partial_8_12 NO_STORE 791#define vstore_partial_8_13 NO_STORE 792#define vstore_partial_8_14 NO_STORE 793#define vstore_partial_8_15 NO_STORE 794#define vstore_partial_8_16 NO_STORE 795 796#define vstore_partial_16_0 NO_STORE 797#define 
vstore_partial_16_1 vstore_partial_1 798#define vstore_partial_16_2 vstore_partial_2 799#define vstore_partial_16_3 vstore_partial_3 800#define vstore_partial_16_4 vstore_partial_4 801#define vstore_partial_16_5 vstore_partial_5 802#define vstore_partial_16_6 vstore_partial_6 803#define vstore_partial_16_7 vstore_partial_7 804#define vstore_partial_16_8 vstore_partial_8 805#define vstore_partial_16_9 vstore_partial_9 806#define vstore_partial_16_10 vstore_partial_10 807#define vstore_partial_16_11 vstore_partial_11 808#define vstore_partial_16_12 vstore_partial_12 809#define vstore_partial_16_13 vstore_partial_13 810#define vstore_partial_16_14 vstore_partial_14 811#define vstore_partial_16_15 vstore_partial_15 812#define vstore_partial_16_16 vstore_partial_16 813 814 /* vstore_partial_k(DATA, OFFSET, PTR): store the low k lanes of DATA, composed from the built-in vstore1/2/3/4/8/16 (sizes 5..7 and 9..15 split into a power-of-two store plus a remainder store at PTR + 4 or PTR + 8). */ 815#define vstore_partial_1(DATA, OFFSET, PTR) \ 816 vstore1(DATA.s0, OFFSET, PTR); 817 818#define vstore_partial_2(DATA, OFFSET, PTR) \ 819 vstore2(DATA.s01, OFFSET, PTR); 820 821#define vstore_partial_3(DATA, OFFSET, PTR) \ 822 vstore3(DATA.s012, OFFSET, PTR); 823 824#define vstore_partial_4(DATA, OFFSET, PTR) \ 825 vstore4(DATA.s0123, OFFSET, PTR); 826 827#define vstore_partial_5(DATA, OFFSET, PTR) \ 828 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 829 vstore1(DATA.s4, OFFSET, PTR + 4); 830 831#define vstore_partial_6(DATA, OFFSET, PTR) \ 832 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 833 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 834 835#define vstore_partial_7(DATA, OFFSET, PTR) \ 836 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 837 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 838 839#define vstore_partial_8(DATA, OFFSET, PTR) \ 840 vstore8(DATA.s01234567, OFFSET, PTR); 841 842#define vstore_partial_9(DATA, OFFSET, PTR) \ 843 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 844 vstore1(DATA.s8, OFFSET, PTR + 8); 845 846#define vstore_partial_10(DATA, OFFSET, PTR) \ 847 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 848 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 849 850#define 
vstore_partial_11(DATA, OFFSET, PTR) \ 851 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 852 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 853 854#define vstore_partial_12(DATA, OFFSET, PTR) \ 855 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 856 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 857 858#define vstore_partial_13(DATA, OFFSET, PTR) \ 859 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 860 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 861 862#define vstore_partial_14(DATA, OFFSET, PTR) \ 863 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 864 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 865 866#define vstore_partial_15(DATA, OFFSET, PTR) \ 867 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 868 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 869 870#define vstore_partial_16(DATA, OFFSET, PTR) \ 871 vstore16(DATA, OFFSET, PTR); 872 873 874 875 876 /* convert_<type>[_sat] aliases: map 1-lane and saturated variants onto the available built-in convert_* functions. */ 877#define convert_float_sat convert_float 878#define convert_float1_sat convert_float 879#define convert_float2_sat convert_float2 880#define convert_float3_sat convert_float3 881#define convert_float4_sat convert_float4 882#define convert_float8_sat convert_float8 883#define convert_float16_sat convert_float16 884#define convert_half_sat convert_float /* NOTE(review): convert_half_sat maps to convert_float rather than convert_half — matches the surrounding pattern for float but looks asymmetric; confirm intentional. */ 885#define convert_half1_sat convert_half 886#define convert_half2_sat convert_half2 887#define convert_half3_sat convert_half3 888#define convert_half4_sat convert_half4 889#define convert_half8_sat convert_half8 890#define convert_half16_sat convert_half16 891 892#define convert_float1 convert_float 893#define convert_half1 convert_half 894#define convert_char1 convert_char 895#define convert_uchar1 convert_uchar 896#define convert_short1 convert_short 897#define convert_ushort1 convert_ushort 898#define convert_int1 convert_int 899#define convert_uint1 convert_uint 900#define convert_long1 convert_long 901#define convert_ulong1 convert_ulong 902#define convert_double1 convert_double 903 904#define convert_char1_sat convert_char_sat 
905#define convert_uchar1_sat convert_uchar_sat 906#define convert_uchar2_sat convert_uchar2_sat 907#define convert_uchar3_sat convert_uchar3_sat 908#define convert_uchar4_sat convert_uchar4_sat 909#define convert_uchar8_sat convert_uchar8_sat 910#define convert_uchar16_sat convert_uchar16_sat 911#define convert_short1_sat convert_short_sat 912#define convert_ushort1_sat convert_ushort_sat 913#define convert_int1_sat convert_int_sat 914#define convert_uint1_sat convert_uint_sat 915#define convert_long1_sat convert_long_sat 916#define convert_ulong1_sat convert_ulong_sat 917#define convert_double1_sat convert_double_sat 918 919#define VEC_DATA_TYPE_STR(type, size) type##size 920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 921 922#define CONVERT_STR(x, type) (convert_##type((x))) 923#define CONVERT(x, type) CONVERT_STR(x, type) 924 925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 927 928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 930 931#define select_vec_dt_uchar(size) uchar##size 932#define select_vec_dt_char(size) char##size 933#define select_vec_dt_ushort(size) ushort##size 934#define select_vec_dt_short(size) short##size 935#define select_vec_dt_half(size) short##size 936#define select_vec_dt_uint(size) uint##size 937#define select_vec_dt_int(size) int##size 938#define select_vec_dt_float(size) int##size 939#define select_vec_dt_ulong(size) ulong##size 940#define select_vec_dt_long(size) long##size 941 942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 945 946#define signed_int_vec_dt_uchar(size) char##size 947#define signed_int_vec_dt_char(size) char##size 948#define 
signed_int_vec_dt_ushort(size) short##size 949#define signed_int_vec_dt_short(size) short##size 950#define signed_int_vec_dt_half(size) short##size 951#define signed_int_vec_dt_uint(size) int##size 952#define signed_int_vec_dt_int(size) int##size 953#define signed_int_vec_dt_float(size) int##size 954#define signed_int_vec_dt_ulong(size) long##size 955#define signed_int_vec_dt_long(size) long##size 956 957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 960 961#define sum_reduce_1(x) (x) 962#define sum_reduce_2(x) ((x).s0) + ((x).s1) 963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 967 968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 970 971#define prod_reduce_1(x) (x) 972#define prod_reduce_2(x) ((x).s0) * ((x).s1) 973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 977 978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 980 981#define max_reduce_1(x) (x) 982#define max_reduce_2(x) max(((x).s0), ((x).s1)) 983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 986#define max_reduce_16(x) 
max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 987 988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990 991#define VECTOR_DECLARATION(name) \ 992 __global uchar *name##_ptr, \ 993 uint name##_stride_x, \ 994 uint name##_step_x, \ 995 uint name##_offset_first_element_in_bytes 996 997#define IMAGE_DECLARATION(name) \ 998 __global uchar *name##_ptr, \ 999 uint name##_stride_x, \ 1000 uint name##_step_x, \ 1001 uint name##_stride_y, \ 1002 uint name##_step_y, \ 1003 uint name##_offset_first_element_in_bytes 1004 1005#define TENSOR3D_DECLARATION(name) \ 1006 __global uchar *name##_ptr, \ 1007 uint name##_stride_x, \ 1008 uint name##_step_x, \ 1009 uint name##_stride_y, \ 1010 uint name##_step_y, \ 1011 uint name##_stride_z, \ 1012 uint name##_step_z, \ 1013 uint name##_offset_first_element_in_bytes 1014 1015#define TENSOR4D_DECLARATION(name) \ 1016 __global uchar *name##_ptr, \ 1017 uint name##_stride_x, \ 1018 uint name##_step_x, \ 1019 uint name##_stride_y, \ 1020 uint name##_step_y, \ 1021 uint name##_stride_z, \ 1022 uint name##_step_z, \ 1023 uint name##_stride_w, \ 1024 uint name##_step_w, \ 1025 uint name##_offset_first_element_in_bytes 1026 1027#define TENSOR5D_DECLARATION(name) \ 1028 __global uchar *name##_ptr, \ 1029 uint name##_stride_x, \ 1030 uint name##_step_x, \ 1031 uint name##_stride_y, \ 1032 uint name##_step_y, \ 1033 uint name##_stride_z, \ 1034 uint name##_step_z, \ 1035 uint name##_stride_w, \ 1036 uint name##_step_w, \ 1037 uint name##_stride_v, \ 1038 uint name##_step_v, \ 1039 uint name##_offset_first_element_in_bytes 1040 1041#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043 1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046 1047#define 
CONVERT_TO_IMAGE_STRUCT(name) \ 1048 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049 1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052 1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055 1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058 1059#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061 1062#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064 name##_stride_z, name##_step_z) 1065 1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068 1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072 1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, 
name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075 1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078 name##_stride_z, name##_step_z) 1079 1080 1081typedef struct Vector 1082{ 1083 __global uchar *ptr; 1084 int offset_first_element_in_bytes; 1085 int stride_x; 1086} Vector; 1087 1088 1089typedef struct Image 1090{ 1091 __global uchar *ptr; 1092 int offset_first_element_in_bytes; 1093 int stride_x; 1094 int stride_y; 1095} Image; 1096 1097 1098typedef struct Tensor3D 1099{ 1100 __global uchar *ptr; 1101 int offset_first_element_in_bytes; 1102 int stride_x; 1103 int stride_y; 1104 int stride_z; 1105} Tensor3D; 1106 1107 1108typedef struct Tensor4D 1109{ 1110 __global uchar *ptr; 1111 int offset_first_element_in_bytes; 1112 int stride_x; 1113 int stride_y; 1114 int stride_z; 1115 int stride_w; 1116} Tensor4D; 1117 1118 1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120{ 1121 Vector vector = 1122 { 1123 .ptr = ptr, 1124 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125 .stride_x = stride_x, 1126 }; 1127 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128 return vector; 1129} 1130 1131 1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133{ 1134 Image img = 1135 { 1136 .ptr = ptr, 1137 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138 .stride_x = stride_x, 1139 .stride_y = stride_y 1140 }; 1141 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142 return img; 1143} 1144 1145 1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint 
offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147{ 1148 Image img = 1149 { 1150 .ptr = ptr, 1151 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152 .stride_x = stride_x, 1153 .stride_y = stride_y 1154 }; 1155 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156 return img; 1157} 1158 1159 1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161{ 1162 Tensor3D tensor = 1163 { 1164 .ptr = ptr, 1165 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166 .stride_x = stride_x, 1167 .stride_y = stride_y, 1168 .stride_z = stride_z 1169 }; 1170 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171 return tensor; 1172} 1173 1174 1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176{ 1177 Tensor3D tensor = 1178 { 1179 .ptr = ptr, 1180 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181 .stride_x = stride_x, 1182 .stride_y = stride_y, 1183 .stride_z = stride_z 1184 }; 1185 return tensor; 1186} 1187 1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189 uint step_w, 1190 uint mod_size) 1191{ 1192 Tensor4D tensor = 1193 { 1194 .ptr = ptr, 1195 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196 .stride_x = stride_x, 1197 .stride_y = stride_y, 1198 .stride_z = stride_z, 1199 .stride_w = stride_w 1200 }; 1201 1202 tensor.ptr += 
tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203 return tensor; 1204} 1205 1206 1207inline __global const uchar *vector_offset(const Vector *vec, int x) 1208{ 1209 return vec->ptr + x * vec->stride_x; 1210} 1211 1212 1213inline __global uchar *offset(const Image *img, int x, int y) 1214{ 1215 return img->ptr + x * img->stride_x + y * img->stride_y; 1216} 1217 1218 1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220{ 1221 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222} 1223 1224 1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226{ 1227 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228} 1229 1230 1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232{ 1233 uint num_elements = width * height; 1234 1235 const uint z = index / num_elements; 1236 1237 index %= num_elements; 1238 1239 const uint y = index / width; 1240 1241 index %= width; 1242 1243 const uint x = index; 1244 1245 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246} 1247 1248#endif 1249 1250#if GPU_ARCH == GPU_ARCH_BIFROST 1251#define MLA(a, b, c) (fma(c, b, a)) 1252#else 1253#define MLA(a, b, c) ((b) * (c) + (a)) 1254#endif 1255 1256 1257#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 1258 1259 1260#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 1261 1262 1263#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * 
tanh((DATA_TYPE)B_VAL * x)) 1264 1265 1266#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 1267 1268 1269#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 1270 1271 1272#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 1273 1274 1275#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 1276 1277 1278#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 1279 1280 1281#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 1282 1283 1284#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 1285 1286 1287#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 1288 1289 1290#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 1291 1292 1293#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 1294 1295 1296#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) 1297 1298 1299#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 1300 1301#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1302 1303#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1304 1305#ifndef ARM_COMPUTE_HELPER_H 1306#define ARM_COMPUTE_HELPER_H 1307 1308 1309 1310 1311#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1312 VSTORE(N0) \ 1313 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1314 1315#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1316 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1317 
VSTORE(N0) \ 1318 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1319 1320#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1321 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1322 VSTORE(N0) \ 1323 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1324 1325#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1326 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1327 VSTORE(N0) \ 1328 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1329 1330#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1331 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1332 VSTORE(N0) \ 1333 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1334 1335#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1336 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1337 VSTORE(N0) \ 1338 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1339 1340#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1341 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1342 VSTORE(N0) \ 1343 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1344 1345#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1346 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1347 VSTORE(N0) \ 1348 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1349 1350#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1351 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1352 VSTORE(N0) \ 1353 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1354 1355#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1356 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1357 VSTORE(N0) \ 1358 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1359 1360#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1361 
STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1362 VSTORE(N0) \ 1363 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1364 1365#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1366 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1367 VSTORE(N0) \ 1368 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1369 1370#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1371 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1372 VSTORE(N0) \ 1373 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1374 1375#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1376 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1377 VSTORE(N0) \ 1378 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1379 1380#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1381 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1382 VSTORE(N0) \ 1383 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1384 1385#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1386 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387 VSTORE(N0) \ 1388 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1389 1390 1391 1392#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1393 VSTORE(N0) \ 1394 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1395 1396#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1398 VSTORE(N0) \ 1399 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1400 1401#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1402 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1403 VSTORE(N0) \ 1404 
(CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1405 1406#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1408 VSTORE(N0) \ 1409 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1410 1411#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1413 VSTORE(N0) \ 1414 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1415 1416#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1418 VSTORE(N0) \ 1419 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1420 1421#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1422 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1423 VSTORE(N0) \ 1424 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1425 1426#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1427 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1428 VSTORE(N0) \ 1429 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1430 1431#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1432 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1433 VSTORE(N0) \ 1434 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1435 1436#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1437 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 
1438 VSTORE(N0) \ 1439 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1440 1441#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1442 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443 VSTORE(N0) \ 1444 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1445 1446#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1447 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448 VSTORE(N0) \ 1449 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1450 1451#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1452 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1453 VSTORE(N0) \ 1454 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1455 1456#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1457 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1458 VSTORE(N0) \ 1459 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1460 1461#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1462 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1463 VSTORE(N0) \ 1464 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1465 1466#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1467 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1468 VSTORE(N0) \ 1469 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1470 1471 1472 1473 1474#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 
STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1475#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1476 1477 1478 1479#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1480#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1481 1482 1483 1484#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1485 VSTORE_PARTIAL(N0, STORE_N0) \ 1486 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1487 1488#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1489 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1490 VSTORE_PARTIAL(N0, STORE_N0) \ 1491 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1492 1493#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1494 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1495 VSTORE_PARTIAL(N0, STORE_N0) \ 1496 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1497 1498#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1499 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1500 VSTORE_PARTIAL(N0, STORE_N0) \ 1501 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1502 1503#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1504 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1505 VSTORE_PARTIAL(N0, STORE_N0) \ 1506 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1507 1508#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1509 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, 
PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

// STORE_ROW_PARTIAL_n: store the first STORE_N0 lanes of each of the n row
// vectors BASENAME##0..BASENAME##(n-1) to consecutive rows of PTR.  Each
// macro recursively expands the (n-1) variant, then emits one masked store
// for its own row.  Row indices above 9 use hex suffixes (A..F) to match the
// variable-name scheme used by the block-declaration macros.
#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

// Dispatch to STORE_ROW_PARTIAL_<STORE_M0>.  The two-level _STR indirection
// forces macro-expansion of STORE_M0 before token pasting.
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Boundary-aware block store when the tile may be partial in both X and Y.
// The run-time conditions PARTIAL_COND_Y/X select between the full tile
// (M0 x N0) and the partial sizes for the last row/column of tiles.
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

// Boundary-aware store when only the X (columns) dimension may be partial.
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

// Boundary-aware store when only the Y (rows) dimension may be partial.
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

// STORE_BLOCK_BOUNDARY_AWARE: compile-time selection of the cheapest store
// variant, driven by the build-time constants PARTIAL_STORE_M0/N0.  A value
// of 0 means that dimension never produces a partial tile, so the run-time
// branch for it is compiled out entirely.
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

// Case 1: no partial tiles in either dimension -> plain full-block store.
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

// Case 2: partial tiles only along Y.
#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

// Case 3: partial tiles only along X.
#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

// Case 4: partial tiles possible in both dimensions.
#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif // PARTIAL_STORE_M0/N0 combinations

#endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

// First output row handled by work-item y when the last tile along M is
// partial: shifts the final tile up so it overlaps the previous one instead
// of running past the buffer.  Without PARTIAL_STORE_M0 it degenerates to a
// plain y * M0.
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif // defined(PARTIAL_STORE_M0)

// Convenience wrapper: boundary-aware store of a single row vector.
// STRIDE_Y and Z are 0 because only one row is written.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

// Optional OpenCL extensions, enabled only when both the build flag and the
// device capability are present.
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)

// GPU architecture identifiers.
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

// Token concatenation (no argument pre-expansion).
#define CONCAT(a, b) a##b

// Identity macro, used to force an extra round of expansion.
#define EXPAND(x) x

// Clamp x into [min_val, max_val].
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

// REVn: reverse the lanes of an n-component vector via swizzles.
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

// REVERSE(x, s): reverse vector x of size s (s must expand to 1,2,3,4,8,16).
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)
// ROTs_n: rotate an s-component vector right by n lanes (pure swizzle, so it
// is free on the GPU).  n == 0 and n == s are identities.
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

// ROTATE(x, s, n): rotate vector x of size s by n lanes; the _STR indirection
// expands s and n before pasting.
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

// V_OFFSn(dt): literal of type dt##n holding the ramp 0,1,...,n-1.
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

// VEC_OFFS(dt, s): ramp vector 0..s-1 of element type dt.
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

// VLOAD(size): expands to the built-in vload<size>.
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

// VLOAD_PARTIAL(size, load_size): load load_size elements into a vector of
// width size.  Resolved through the vload_partial_<size>_<load_size>
// dispatch tables below.
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

// Empty statement used for invalid (size, load_size) combinations.
#define NO_LOAD(data, offs, ptr) \
    { \
    }

// Dispatch tables: vload_partial_<vector_size>_<elements_to_load>.
// Combinations with load_size > vector_size (or 0) expand to NO_LOAD.
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

// vload_partial_n: read exactly n elements from PTR into the first n lanes
// of DATA, leaving the remaining lanes untouched.  Non-power-of-two sizes
// are composed from a power-of-two prefix load plus a smaller tail load at
// PTR + prefix.
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// NOTE(review): 13..15 pass the 8-lane sub-vector DATA.s89ABCDEF, so the
// inner macro writes through a nested swizzle (e.g. DATA.s89ABCDEF.s0123);
// presumably relied-upon OpenCL swizzle-of-swizzle behavior — confirm on
// target compilers.
#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

// Number of 4-component texels covered by a vector of the given width.
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

// read_image2d_<type>x<n>: read n adjacent 4-component texels starting at
// (x_coord, y_coord) and pack them into one float4/8/16 (or half) vector.
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

// write_image2d_<type>x<n>: scatter one packed vector back to n adjacent
// texels starting at (x_coord, y_coord).
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)

// Generic image accessors dispatching on (data_type, n0).
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

// VSTORE(size): expands to the built-in vstore<size>.
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

// type1 aliases so the generic <type>##<size> pasting also works for size 1.
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

// Scalar load/store counterparts of vloadN/vstoreN.
#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

// VSTORE_PARTIAL(size, store_size): store store_size elements out of a
// vector of width size; resolved through the tables below.
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

// Empty statement used for invalid (size, store_size) combinations.
#define NO_STORE(data, offs, ptr) \
    { \
    }

// Dispatch tables: vstore_partial_<vector_size>_<elements_to_store>.
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

// vstore_partial_n: write exactly the first n lanes of DATA to PTR.
// Mirrors vload_partial_n; note the lowercase hex swizzles (s89a...) —
// equivalent to the uppercase form used by the loads.
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

// Saturating-convert aliases: float/half conversions have no _sat variant in
// OpenCL, so map them to the plain converts.
// NOTE(review): convert_half_sat maps to convert_float (not convert_half) —
// looks inconsistent with convert_half1_sat below; confirm intended.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Size-1 convert aliases so convert_<type>##<size> pasting works for size 1.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

// Size-1 saturating-convert aliases.  The convert_ucharN_sat entries are
// deliberate self-referential defines (no-ops; macros do not recurse).
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

// VEC_DATA_TYPE(type, size): the vector type type<size> (type for size 1).
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

// CONVERT / CONVERT_SAT / CONVERT_SAT_ROUND: expansion-safe wrappers around
// the OpenCL convert_<type>[_sat][_<round>] built-ins.
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// select_vec_dt_<type>: integer vector type of the same element width as
// <type>, suitable as the mask argument of select() (half -> short,
// float -> int).
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// signed_int_vec_dt_<type>: signed integer vector type of the same element
// width as <type>.
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

// Horizontal sum of the lanes of a vector (tree reduction via swizzles).
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

// Horizontal product of the lanes of a vector.
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

// Horizontal maximum of the lanes of a vector.
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

// *_DECLARATION(name): expand to the kernel parameter list describing an
// N-dimensional tensor called <name> (buffer pointer, per-dimension stride
// and step in bytes, and the byte offset of the first element).
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

// CONVERT_TO_*_STRUCT(name): build the matching accessor struct from the
// parameters declared by *_DECLARATION(name), advancing the pointer to this
// work-item's element.  The _NO_STEP variants pass 0 steps so the pointer is
// not advanced per work-item.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

// NOTE(review): the _NO_STEP variant still passes name##_step_z (unlike the
// other _NO_STEP macros, which pass 0) — presumably intentional so the z
// advance is kept; confirm against callers.
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

// NOTE(review): duplicate of the identical definition above (benign
// redefinition — same token sequence).
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

// 1D tensor accessor: base pointer plus byte strides.
typedef struct Vector
{
    __global uchar *ptr;                // current element for this work-item
    int offset_first_element_in_bytes;  // byte offset of the first element
    int stride_x;                       // bytes between adjacent x elements
} Vector;

// 2D tensor accessor.
typedef struct Image
{
    __global uchar *ptr;                // current element for this work-item
    int offset_first_element_in_bytes;  // byte offset of the first element
    int stride_x;                       // bytes between adjacent x elements
    int stride_y;                       // bytes between adjacent y elements
} Image;

// 3D tensor accessor.
typedef struct Tensor3D
{
    __global uchar *ptr;                // current element for this work-item
    int offset_first_element_in_bytes;  // byte offset of the first element
    int stride_x;                       // bytes between adjacent x elements
    int stride_y;                       // bytes between adjacent y elements
    int stride_z;                       // bytes between adjacent z elements
} Tensor3D;

// 4D tensor accessor.
typedef struct Tensor4D
{
    __global uchar *ptr;                // current element for this work-item
    int offset_first_element_in_bytes;  // byte offset of the first element
    int stride_x;                       // bytes between adjacent x elements
    int stride_y;                       // bytes between adjacent y elements
    int stride_z;                       // bytes between adjacent z elements
    int stride_w;                       // bytes between adjacent w elements
} Tensor4D;

// Build a Vector and advance its ptr to this work-item's element
// (global id 0 scaled by step_x, plus the first-element offset).
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

// Build an Image and advance its ptr using global ids 0 (x) and 1 (y).
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

// View a 3D tensor as an Image: the z advance (global id 2 * step_z) is
// folded into ptr, so stride_z is not kept in the struct.
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

// Build a Tensor3D and advance its ptr using global ids 0, 1 and 2.
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

// Build a Tensor3D without advancing ptr (steps are accepted but unused).
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

// Build a Tensor4D; z and w are both derived from global id 2, split by
// mod_size (id % mod_size -> z, id / mod_size -> w).
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

// Pointer to element x of a Vector (byte arithmetic on strides).
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

// Pointer to element (x, y) of an Image.
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

// Pointer to element (x, y, z) of a Tensor3D.
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

// Pointer to element (x, y, z, w) of a Tensor4D.
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

// Decompose a linear index into (x, y, z) for a dense width x height x depth
// tensor and return the element pointer.  (Definition continues past this
// chunk.)
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x *
tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 2545} 2546 2547#endif 2548 2549 2550#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x) 2551#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x) 2552 2553 2554#define scalar_access_0_1(x) ((x).s0) 2555#define scalar_access_0_2(x) ((x).s01) 2556#define scalar_access_0_3(x) ((x).s012) 2557#define scalar_access_0_4(x) ((x).s0123) 2558#define scalar_access_0_8(x) ((x).s01234567) 2559#define scalar_access_0_16(x) ((x).s0123456789ABCDEF) 2560 2561 2562#define scalar_access_1_1(x) ((x).s1) 2563#define scalar_access_1_2(x) ((x).s12) 2564#define scalar_access_1_3(x) ((x).s123) 2565#define scalar_access_1_4(x) ((x).s1234) 2566#define scalar_access_1_8(x) ((x).s12345678) 2567 2568 2569#define scalar_access_2_1(x) ((x).s2) 2570#define scalar_access_2_2(x) ((x).s23) 2571#define scalar_access_2_3(x) ((x).s234) 2572#define scalar_access_2_4(x) ((x).s2345) 2573#define scalar_access_2_8(x) ((x).s23456789) 2574 2575 2576#define scalar_access_3_1(x) ((x).s3) 2577#define scalar_access_3_2(x) ((x).s34) 2578#define scalar_access_3_3(x) ((x).s345) 2579#define scalar_access_3_4(x) ((x).s3456) 2580#define scalar_access_3_8(x) ((x).s3456789A) 2581 2582 2583#define scalar_access_4_1(x) ((x).s4) 2584#define scalar_access_4_2(x) ((x).s45) 2585#define scalar_access_4_3(x) ((x).s456) 2586#define scalar_access_4_4(x) ((x).s4567) 2587#define scalar_access_4_8(x) ((x).s456789AB) 2588 2589 2590#define scalar_access_8_1(x) ((x).s8) 2591#define scalar_access_8_2(x) ((x).s89) 2592#define scalar_access_8_3(x) ((x).s89A) 2593#define scalar_access_8_4(x) ((x).s89AB) 2594#define scalar_access_8_8(x) ((x).s89ABCDEF) 2595 2596 2597#define scalar_access_12_1(x) ((x).sC) 2598#define scalar_access_12_2(x) ((x).sCD) 2599#define scalar_access_12_3(x) ((x).sCDE) 2600#define scalar_access_12_4(x) ((x).sCDEF) 2601 2602 2603#define scalar_access_16_1(x) ((x).sF) 2604 2605 
/** LOAD_TENSOR_ROW_n(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
 *
 * Loads the first n rows of a tensor block. Row i is read with VLOAD(N0) from
 * (PTR + i * STRIDE_Y + Z##i) and assigned into the sub-vector selected by
 * SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##i), i.e. N0 components of the
 * destination variable BASENAME##i starting at component COL_OFFSET.
 * Rows 10..15 use hexadecimal variable suffixes (BASENAME##A .. BASENAME##F).
 * Each macro expands the previous one, so LOAD_TENSOR_ROW_n emits n loads.
 * LOAD_TENSOR_ROW_0 intentionally expands to an empty statement expression.
 */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* LOAD_TENSOR(M0, ...): dispatch to LOAD_TENSOR_ROW_<M0>.
 * The two-level STR indirection forces M0 to be macro-expanded before the ##
 * token paste, so LOAD_TENSOR works when M0 is itself a macro. */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)

/* LOAD_TENSOR_M0X<k>: load an M0 x k tile, decomposing the k columns into
 * power-of-two vector loads (e.g. k = 5 -> one 4-wide load plus one 1-wide
 * load at column offset 4, stepping input_ptr by 4 * sizeof(DATA_TYPE)).
 * For k in {1, 2, 3, 4} a single N0-wide load suffices.
 * LOAD_TENSOR_M0X0 intentionally expands to an empty statement expression. */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

/* LOAD_TENSOR_M0X<k> (continued): load an M0 x k tile, decomposing the k
 * columns into power-of-two vector loads (8 + 4 + remainder), with the column
 * offset of each partial load matching its byte offset from input_ptr.
 *
 * FIX: LOAD_TENSOR_M0X9 and LOAD_TENSOR_M0X14 were missing the comma between
 * input_ptr and the column offset 0 ("input_ptr 0"), which made the first
 * LOAD_TENSOR expansion receive too few arguments. All sibling macros
 * (M0X10..M0X13, M0X15) pass "input_ptr, 0"; the comma is restored here. */
#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* Was: "input_ptr 0" (missing comma) — broken expansion; fixed. */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

/* Was: "input_ptr 0" (missing comma) — broken expansion; fixed. */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* LOAD_TENSOR_M0XN0(M0, N0, ...): dispatch to LOAD_TENSOR_M0X<N0>.
 * Two-level indirection expands N0 before the ## token paste. */
#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/* LOAD_ROW_n(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z):
 * declare BASENAME##i as VEC_DATA_TYPE(DATA_TYPE, N0) and load it with
 * VLOAD(N0) from (PTR + OFFSET + i * STRIDE_Y + Z##i), for i = 0..n-1.
 * Unlike LOAD_TENSOR_ROW_n these declare fresh variables rather than
 * assigning into sub-vectors. Each macro expands the previous one. */
#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)     \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                      \
    BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)      \
    VEC_DATA_TYPE(DATA_TYPE, N0)                                       \
    BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

2798#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2799 LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2800 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2801 BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2802 2803#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2804 LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2805 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2806 BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2807 2808#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2809 LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2810 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2811 BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2812 2813#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2814 LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2815 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2816 BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2817 2818#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2819 LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2820 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2821 BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2822 2823#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2824 LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2825 VEC_DATA_TYPE(DATA_TYPE, N0) \ 2826 BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2827 2828 2829 2830 2831#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2832#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, 
Z) 2833 2834 2835 2836#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2837 VLOAD_PARTIAL(N0, LOAD_N0) \ 2838 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 2839 2840#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2841 LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2842 VLOAD_PARTIAL(N0, LOAD_N0) \ 2843 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2844 2845#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2846 LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2847 VLOAD_PARTIAL(N0, LOAD_N0) \ 2848 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2849 2850#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2851 LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2852 VLOAD_PARTIAL(N0, LOAD_N0) \ 2853 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2854 2855#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2856 LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2857 VLOAD_PARTIAL(N0, LOAD_N0) \ 2858 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2859 2860#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2861 LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2862 VLOAD_PARTIAL(N0, LOAD_N0) \ 2863 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2864 2865#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2866 LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2867 VLOAD_PARTIAL(N0, LOAD_N0) \ 2868 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR 
+ OFFSET + 6 * STRIDE_Y + Z##6)); 2869 2870#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2871 LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2872 VLOAD_PARTIAL(N0, LOAD_N0) \ 2873 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2874 2875#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2876 LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2877 VLOAD_PARTIAL(N0, LOAD_N0) \ 2878 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2879 2880#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2881 LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2882 VLOAD_PARTIAL(N0, LOAD_N0) \ 2883 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2884 2885#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2886 LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2887 VLOAD_PARTIAL(N0, LOAD_N0) \ 2888 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2889 2890#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2891 LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2892 VLOAD_PARTIAL(N0, LOAD_N0) \ 2893 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2894 2895#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2896 LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2897 VLOAD_PARTIAL(N0, LOAD_N0) \ 2898 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2899 2900#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2901 LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, 
BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2902 VLOAD_PARTIAL(N0, LOAD_N0) \ 2903 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2904 2905#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2906 LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2907 VLOAD_PARTIAL(N0, LOAD_N0) \ 2908 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 2909 2910#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2911 LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2912 VLOAD_PARTIAL(N0, LOAD_N0) \ 2913 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F)); 2914 2915 2916 2917#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2918#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2919 2920#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2921 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 2922 { \ 2923 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2924 } \ 2925 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 2926 { \ 2927 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2928 } \ 2929 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 2930 { \ 2931 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2932 } \ 2933 else \ 2934 { \ 2935 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2936 } 2937 2938#define 
LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 2939 if(!(PARTIAL_COND_X)) \ 2940 { \ 2941 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2942 } \ 2943 else \ 2944 { \ 2945 LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2946 } 2947 2948#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 2949 if(!(PARTIAL_COND_Y)) \ 2950 { \ 2951 LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2952 } \ 2953 else \ 2954 { \ 2955 LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \ 2956 } 2957 2958 2959#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 2960 2961#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2962 LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) 2963 2964#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 2965 2966#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2967 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2968 LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 2969 2970#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 2971 2972#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2973 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2974 LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 2975 2976#else 2977 2978#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, 
DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 2979 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \ 2980 LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 2981 2982#endif 2983 2984 2985#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2986 BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) 2987 2988#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2989 LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2990 BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) 2991 2992#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2993 LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2994 BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) 2995 2996#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2997 LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 2998 BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) 2999 3000#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3001 LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3002 BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) 3003 3004#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ 3005 
LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))

/* The two lines above are the tail of LOAD_TEXTURE2D_ROW_6 (its "#define"
 * head lies earlier in the file): expand the 5-row loader, then read row 5.
 *
 * LOAD_TEXTURE2D_ROW_n(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD,
 * X_STEP_ROW, Y_STEP_ROW): recursively-generated family. Each member expands
 * the (n-1)-row member and then reads one more row of the 2D image IMG into
 * the variable BASENAME##<row>, at texel (X_COORD + row * X_STEP_ROW,
 * Y_COORD + row * Y_STEP_ROW). Rows 10..15 use hex digit suffixes A..F in the
 * generated variable names. */
#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))

/* Dispatch: LOAD_TEXTURE2D(M0, ...) selects LOAD_TEXTURE2D_ROW_<M0>.
 * The _STR indirection forces macro-expansion of M0 before token pasting. */
#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)

/* LOAD_ROW_INDIRECT_n: declares BASENAME##<row> as an N0-wide vector of
 * DATA_TYPE and conditionally loads it from PTR + OFFSET + Y##<row> * STRIDE_Y.
 * When the per-row mask Y_MASK##<row> is zero the row is set to 0 instead of
 * being read (out-of-bounds guard for indirect/gather addressing). */
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0; \
    if(Y_MASK##0 != 0) \
        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
    else \
        BASENAME##0 = 0;

#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1; \
    if(Y_MASK##1 != 0) \
        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
    else \
        BASENAME##1 = 0;

#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2; \
    if(Y_MASK##2 != 0) \
        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
    else \
        BASENAME##2 = 0;

#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3; \
    if(Y_MASK##3 != 0) \
        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
    else \
        BASENAME##3 = 0;

#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4; \
    if(Y_MASK##4 != 0) \
        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
    else \
        BASENAME##4 = 0;

#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5; \
    if(Y_MASK##5 != 0) \
        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
    else \
        BASENAME##5 = 0;

#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6; \
    if(Y_MASK##6 != 0) \
        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
    else \
        BASENAME##6 = 0;

#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7; \
    if(Y_MASK##7 != 0) \
        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
    else \
        BASENAME##7 = 0;

#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8; \
    if(Y_MASK##8 != 0) \
        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
    else \
        BASENAME##8 = 0;

#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9; \
    if(Y_MASK##9 != 0) \
        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
    else \
        BASENAME##9 = 0;

#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A; \
    if(Y_MASK##A != 0) \
        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
    else \
        BASENAME##A = 0;

#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B; \
    if(Y_MASK##B != 0) \
        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
    else \
        BASENAME##B = 0;

#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C; \
    if(Y_MASK##C != 0) \
        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
    else \
        BASENAME##C = 0;

#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D; \
    if(Y_MASK##D != 0) \
        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
    else \
        BASENAME##D = 0;

#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E; \
    if(Y_MASK##E != 0) \
        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
    else \
        BASENAME##E = 0;

#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F; \
    if(Y_MASK##F != 0) \
        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
    else \
        BASENAME##F = 0;

/* Dispatch for the masked indirect-load family (M0 rows of N0 elements). */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)

/* LOAD_ELEMENT_n: declares BASENAME##<row> as an N0-wide vector but loads a
 * single scalar (broadcast happens via the vector declaration's assignment
 * semantics) from PTR + OFFSET + <row> * STRIDE_Y. */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));

#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));

#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));

#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));

#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));

#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));

#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));

#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));

#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));

#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));

#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));

#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));

#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) \
    BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));

/* Dispatch: load M0 scalars (one per row) as N0-wide vectors. */
#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)

/* CALCULATE_Z_OFFSET_n: per-row cross-plane z-offset for GEMM3D reshaping.
 * For each row r: Z##r = min(DEPTH_GEMM3D - 1, (r + Y) / HEIGHT_GEMM3D)
 * scaled by CROSS_PLANE_PAD * STRIDE_Y (byte offset added to the row address).
 * NOTE(review): the division is integer division via DATA_TYPE — presumably
 * DATA_TYPE is an integer type here; confirm against callers. */
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
    Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
    Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
    Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);

/* CALCULATE_Z_OFFSET_4..8 continue the family started above (rows 3..7). */
#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
    Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
    Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
    Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
    Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \
    Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
    Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);

/* Dispatch (this family only goes up to 8 rows). */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)

/* SCALE_ROW_n: in-place multiply of rows BASENAME##0..<n-1> by the scalar
 * SCALE cast to DATA_TYPE. */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##1 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##2 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##3 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##4 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##5 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##6 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##7 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##8 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##9 *= (DATA_TYPE)SCALE;

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##A *= (DATA_TYPE)SCALE;

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##B *= (DATA_TYPE)SCALE;

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##C *= (DATA_TYPE)SCALE;

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##D *= (DATA_TYPE)SCALE;

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##E *= (DATA_TYPE)SCALE;

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##F *= (DATA_TYPE)SCALE;

/* Dispatch: scale an N-row block. */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)

/* COLUMN_VECTOR<k>: gathers component .s<IDX_COL> from k row vectors
 * X##0..X##<k-1> into one k-wide column vector BASENAME##IDX_COL (transpose
 * building block). Only vector widths 1, 2, 3, 4, 8, 16 exist, matching the
 * OpenCL vector sizes. */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);

/* COLUMN_VECTOR_SCALAR<k>: same gather but the row values X##r are scalars
 * (used when the source row width is 1, so there is no .s<idx> component). */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));

/* TRANSPOSE_K0X<n>: builds n column vectors (columns 0..n-1) from a K0-row
 * block BS, i.e. transposes a K0 x n block. The X1 variant uses the scalar
 * gather since a 1-wide row has no components. */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/* Indirection helpers: CONCAT selects the width-specific gather; defined
 * after the TRANSPOSE_K0X* family but resolved at expansion time, which is
 * legal because macro bodies are only expanded at the point of use. */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0) \
    (IDX_COL, BASENAME, BS, TYPE);

#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0) \
    (IDX_COL, BASENAME, BS, TYPE);

/* Transpose a K0 x N0 block. */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0) \
    (K0, BASENAME, BS, TYPE);

/* ADD_ROW_n: element-wise add of per-row bias vectors BIAS##r to rows
 * BASENAME##r, r = 0..n-1. */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS) \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS) \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS) \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS) \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS) \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS) \
    BASENAME##F += BIAS##F;

/* Dispatch: add an N-row bias block. */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)

/* ADD_ROW_BROADCAST_n: adds the SAME bias value BIAS to every row
 * (broadcast along the M dimension), unlike ADD_ROW_n which uses BIAS##r. */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    BASENAME##F += BIAS;

/* Dispatch for the broadcast-add family. */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)

/* ACTIVATION_ROW_n: applies the ACTIVATION(...) helper (activation function
 * selected by ACTIVATION_TYPE, with parameters A_VAL/B_VAL) in place to rows
 * BASENAME##0..<n-1>, each a VEC_SIZE-wide vector of DATA_TYPE. */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);
/* ACTIVATION_ROW_11..16 continue the activation family (rows A..F). */
#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);

/* Dispatch: apply the activation to an N-row block. */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)

/* CONVERT_ROW_m: declares BASENAME_DST##r as an N-wide vector of DATA_TYPE
 * and converts BASENAME_SRC##r into it (non-saturating CONVERT), for rows
 * r = 0..m-1. */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));

/* Dispatch: convert an M-row block of N-wide vectors. */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)

/* NOTE(review): from here on the file re-includes another helper header
 * (repeat.h style concatenation). ARM_COMPUTE_HELPER_H is already defined at
 * the top of this file, so the nested copy of the STORE_ROW_* family below is
 * skipped by the inner include guard — it is redundant but harmless
 * duplication produced by the build-time header inlining. */
#ifndef ARM_COMPUTE_REPEAT_H
#define ARM_COMPUTE_REPEAT_H

#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

/* STORE_ROW_n: vstores rows BASENAME##0..<n-1> (N0-wide vectors) to
 * PTR + <row> * STRIDE_Y + Z##<row> (per-row cross-plane z offset). */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3882 VSTORE(N0) \ 3883 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3884 3885 3886 3887#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3888 VSTORE(N0) \ 3889 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3890 3891#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3892 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3893 VSTORE(N0) \ 3894 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3895 3896#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3897 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3898 VSTORE(N0) \ 3899 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3900 3901#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3902 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3903 VSTORE(N0) \ 3904 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3905 3906#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3907 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3908 VSTORE(N0) \ 3909 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 3910 3911#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3912 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3913 VSTORE(N0) \ 3914 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 3915 3916#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3917 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3918 VSTORE(N0) \ 3919 
(CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 3920 3921#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3922 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3923 VSTORE(N0) \ 3924 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 3925 3926#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3927 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3928 VSTORE(N0) \ 3929 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 3930 /* NOTE(review): fixed second parameter DATA -> DATA_TYPE; the body expands DATA_TYPE, so the old name left it unsubstituted and broke CONVERT_STORE_BLOCK for M0 >= 10 */ 3931#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3932 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3933 VSTORE(N0) \ 3934 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 3935 3936#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3937 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3938 VSTORE(N0) \ 3939 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 3940 3941#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3942 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3943 VSTORE(N0) \ 3944 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 3945 3946#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3947 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3948 VSTORE(N0) \ 3949 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 3950 3951#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3952 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z) \ 3953 VSTORE(N0) \ 3954 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 3955 3956#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3957 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3958 VSTORE(N0) \ 3959 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 3960 3961#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3962 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3963 VSTORE(N0) \ 3964 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 3965 3966 3967 3968 3969#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3970#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3971 3972 3973 3974#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3975#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 3976 3977 3978 3979#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3980 VSTORE_PARTIAL(N0, STORE_N0) \ 3981 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 3982 3983#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3984 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3985 VSTORE_PARTIAL(N0, STORE_N0) \ 3986 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 3987 3988#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3989 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, 
Z) \ 3990 VSTORE_PARTIAL(N0, STORE_N0) \ 3991 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 3992 3993#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3994 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3995 VSTORE_PARTIAL(N0, STORE_N0) \ 3996 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 3997 3998#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 3999 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4000 VSTORE_PARTIAL(N0, STORE_N0) \ 4001 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 4002 4003#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4004 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4005 VSTORE_PARTIAL(N0, STORE_N0) \ 4006 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 4007 4008#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4009 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4010 VSTORE_PARTIAL(N0, STORE_N0) \ 4011 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 4012 4013#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4014 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4015 VSTORE_PARTIAL(N0, STORE_N0) \ 4016 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 4017 4018#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4019 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4020 VSTORE_PARTIAL(N0, STORE_N0) \ 4021 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 4022 4023#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4024 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4025 
VSTORE_PARTIAL(N0, STORE_N0) \ 4026 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 4027 4028#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4029 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4030 VSTORE_PARTIAL(N0, STORE_N0) \ 4031 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 4032 4033#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4034 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4035 VSTORE_PARTIAL(N0, STORE_N0) \ 4036 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 4037 4038#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4039 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4040 VSTORE_PARTIAL(N0, STORE_N0) \ 4041 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 4042 4043#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4044 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4045 VSTORE_PARTIAL(N0, STORE_N0) \ 4046 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 4047 4048#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4049 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4050 VSTORE_PARTIAL(N0, STORE_N0) \ 4051 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 4052 4053#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4054 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 4055 VSTORE_PARTIAL(N0, STORE_N0) \ 4056 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 4057 4058 4059 4060#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, 
STRIDE_Y, Z) 4061#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4062 4063#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4064 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 4065 { \ 4066 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4067 } \ 4068 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 4069 { \ 4070 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4071 } \ 4072 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 4073 { \ 4074 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4075 } \ 4076 else \ 4077 { \ 4078 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4079 } 4080 4081#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 4082 if(!(PARTIAL_COND_X)) \ 4083 { \ 4084 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4085 } \ 4086 else \ 4087 { \ 4088 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4089 } 4090 4091#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 4092 if(!(PARTIAL_COND_Y)) \ 4093 { \ 4094 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4095 } \ 4096 else \ 4097 { \ 4098 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 4099 } 4100 4101 4102#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 4103 4104 4105#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 4106 4107#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, 
PARTIAL_COND_X) \ 4108 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 4109 4110#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 4111 4112#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4113 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 4114 4115#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 4116 4117#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4118 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 4119 4120#else 4121 4122#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 4123 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 4124 4125#endif 4126 4127#endif 4128 4129 4130#if defined(PARTIAL_STORE_M0) 4131 4132#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4133 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 4134#else 4135#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 4136 ((uint)(y * M0)) 4137#endif 4138 4139 4140 4141#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 4142 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 4143 4144 4145#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4146#pragma OPENCL EXTENSION cl_khr_fp16 : enable 4147#endif 4148 4149#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 4150#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 4151#endif 4152 4153#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && 
defined(cl_arm_integer_dot_product_accumulate_int8) 4154#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 4155#endif 4156 4157#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 4158#pragma OPENCL EXTENSION cl_arm_printf : enable 4159#endif 4160 4161#define GPU_ARCH_MIDGARD 0x100 4162#define GPU_ARCH_BIFROST 0x200 4163#define GPU_ARCH_VALHALL 0x300 4164 4165 4166#define CONCAT(a, b) a##b 4167 4168 4169#define EXPAND(x) x 4170 4171 4172#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 4173 4174 4175#define REV1(x) ((x)) 4176#define REV2(x) ((x).s10) 4177#define REV3(x) ((x).s210) 4178#define REV4(x) ((x).s3210) 4179#define REV8(x) ((x).s76543210) 4180#define REV16(x) ((x).sFEDCBA9876543210) 4181 4182 4183 4184#define REVERSE_STR(x, s) REV##s((x)) 4185#define REVERSE(x, s) REVERSE_STR(x, s) 4186 4187 4188 4189#define ROT1_0(x) ((x)) 4190#define ROT1_1(x) ((x)) 4191 4192#define ROT2_0(x) ((x)) 4193#define ROT2_1(x) ((x).s10) 4194#define ROT2_2(x) ((x)) 4195 4196#define ROT3_0(x) ((x)) 4197#define ROT3_1(x) ((x).s201) 4198#define ROT3_2(x) ((x).s120) 4199#define ROT3_3(x) ((x)) 4200 4201#define ROT4_0(x) ((x)) 4202#define ROT4_1(x) ((x).s3012) 4203#define ROT4_2(x) ((x).s2301) 4204#define ROT4_3(x) ((x).s1230) 4205#define ROT4_4(x) ((x)) 4206 4207#define ROT8_0(x) ((x)) 4208#define ROT8_1(x) ((x).s70123456) 4209#define ROT8_2(x) ((x).s67012345) 4210#define ROT8_3(x) ((x).s56701234) 4211#define ROT8_4(x) ((x).s45670123) 4212#define ROT8_5(x) ((x).s34567012) 4213#define ROT8_6(x) ((x).s23456701) 4214#define ROT8_7(x) ((x).s12345670) 4215#define ROT8_8(x) ((x)) 4216 4217#define ROT16_0(x) ((x)) 4218#define ROT16_1(x) ((x).sF0123456789ABCDE) 4219#define ROT16_2(x) ((x).sEF0123456789ABCD) 4220#define ROT16_3(x) ((x).sDEF0123456789ABC) 4221#define ROT16_4(x) ((x).sCDEF0123456789AB) 4222#define ROT16_5(x) ((x).sBCDEF0123456789A) 4223#define ROT16_6(x) ((x).sABCDEF0123456789) 4224#define ROT16_7(x) ((x).s9ABCDEF012345678) 
4225#define ROT16_8(x) ((x).s89ABCDEF01234567) 4226#define ROT16_9(x) ((x).s789ABCDEF0123456) 4227#define ROT16_10(x) ((x).s6789ABCDEF012345) 4228#define ROT16_11(x) ((x).s56789ABCDEF01234) 4229#define ROT16_12(x) ((x).s456789ABCDEF0123) 4230#define ROT16_13(x) ((x).s3456789ABCDEF012) 4231#define ROT16_14(x) ((x).s23456789ABCDEF01) 4232#define ROT16_15(x) ((x).s123456789ABCDEF0) 4233#define ROT16_16(x) ((x)) 4234 4235 4236 4237#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 4238#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 4239 4240 4241 4242#define V_OFFS1(dt) (dt##1)(0) 4243#define V_OFFS2(dt) (dt##2)(0, 1) 4244#define V_OFFS3(dt) (dt##3)(0, 1, 2) 4245#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 4246#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 4247#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 4248 4249 4250 4251#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 4252#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 4253 4254 4255#define VLOAD_STR(size) vload##size 4256#define VLOAD(size) VLOAD_STR(size) 4257 4258 4259#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 4260#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 4261 4262#define NO_LOAD(data, offs, ptr) \ 4263 { \ 4264 } 4265 4266 4267#define vload_partial_1_0 NO_LOAD 4268#define vload_partial_1_1 vload1 4269#define vload_partial_1_2 NO_LOAD 4270#define vload_partial_1_3 NO_LOAD 4271#define vload_partial_1_4 NO_LOAD 4272#define vload_partial_1_5 NO_LOAD 4273#define vload_partial_1_6 NO_LOAD 4274#define vload_partial_1_7 NO_LOAD 4275#define vload_partial_1_8 NO_LOAD 4276#define vload_partial_1_9 NO_LOAD 4277#define vload_partial_1_10 NO_LOAD 4278#define vload_partial_1_11 NO_LOAD 4279#define vload_partial_1_12 NO_LOAD 4280#define vload_partial_1_13 NO_LOAD 4281#define vload_partial_1_14 NO_LOAD 4282#define vload_partial_1_15 NO_LOAD 4283#define vload_partial_1_16 NO_LOAD 4284 4285#define vload_partial_2_0 NO_LOAD 4286#define 
vload_partial_2_1 vload_partial_1 4287#define vload_partial_2_2 vload_partial_2 4288#define vload_partial_2_3 NO_LOAD 4289#define vload_partial_2_4 NO_LOAD 4290#define vload_partial_2_5 NO_LOAD 4291#define vload_partial_2_6 NO_LOAD 4292#define vload_partial_2_7 NO_LOAD 4293#define vload_partial_2_8 NO_LOAD 4294#define vload_partial_2_9 NO_LOAD 4295#define vload_partial_2_10 NO_LOAD 4296#define vload_partial_2_11 NO_LOAD 4297#define vload_partial_2_12 NO_LOAD 4298#define vload_partial_2_13 NO_LOAD 4299#define vload_partial_2_14 NO_LOAD 4300#define vload_partial_2_15 NO_LOAD 4301#define vload_partial_2_16 NO_LOAD 4302 4303#define vload_partial_3_0 NO_LOAD 4304#define vload_partial_3_1 vload_partial_1 4305#define vload_partial_3_2 vload_partial_2 4306#define vload_partial_3_3 vload_partial_3 4307#define vload_partial_3_4 NO_LOAD 4308#define vload_partial_3_5 NO_LOAD 4309#define vload_partial_3_6 NO_LOAD 4310#define vload_partial_3_7 NO_LOAD 4311#define vload_partial_3_8 NO_LOAD 4312#define vload_partial_3_9 NO_LOAD 4313#define vload_partial_3_10 NO_LOAD 4314#define vload_partial_3_11 NO_LOAD 4315#define vload_partial_3_12 NO_LOAD 4316#define vload_partial_3_13 NO_LOAD 4317#define vload_partial_3_14 NO_LOAD 4318#define vload_partial_3_15 NO_LOAD 4319#define vload_partial_3_16 NO_LOAD 4320 4321#define vload_partial_4_0 NO_LOAD 4322#define vload_partial_4_1 vload_partial_1 4323#define vload_partial_4_2 vload_partial_2 4324#define vload_partial_4_3 vload_partial_3 4325#define vload_partial_4_4 vload_partial_4 4326#define vload_partial_4_5 NO_LOAD 4327#define vload_partial_4_6 NO_LOAD 4328#define vload_partial_4_7 NO_LOAD 4329#define vload_partial_4_8 NO_LOAD 4330#define vload_partial_4_9 NO_LOAD 4331#define vload_partial_4_10 NO_LOAD 4332#define vload_partial_4_11 NO_LOAD 4333#define vload_partial_4_12 NO_LOAD 4334#define vload_partial_4_13 NO_LOAD 4335#define vload_partial_4_14 NO_LOAD 4336#define vload_partial_4_15 NO_LOAD 4337#define vload_partial_4_16 NO_LOAD 4338 
4339#define vload_partial_8_0 NO_LOAD 4340#define vload_partial_8_1 vload_partial_1 4341#define vload_partial_8_2 vload_partial_2 4342#define vload_partial_8_3 vload_partial_3 4343#define vload_partial_8_4 vload_partial_4 4344#define vload_partial_8_5 vload_partial_5 4345#define vload_partial_8_6 vload_partial_6 4346#define vload_partial_8_7 vload_partial_7 4347#define vload_partial_8_8 vload_partial_8 4348#define vload_partial_8_9 NO_LOAD 4349#define vload_partial_8_10 NO_LOAD 4350#define vload_partial_8_11 NO_LOAD 4351#define vload_partial_8_12 NO_LOAD 4352#define vload_partial_8_13 NO_LOAD 4353#define vload_partial_8_14 NO_LOAD 4354#define vload_partial_8_15 NO_LOAD 4355#define vload_partial_8_16 NO_LOAD 4356 4357#define vload_partial_16_0 NO_LOAD 4358#define vload_partial_16_1 vload_partial_1 4359#define vload_partial_16_2 vload_partial_2 4360#define vload_partial_16_3 vload_partial_3 4361#define vload_partial_16_4 vload_partial_4 4362#define vload_partial_16_5 vload_partial_5 4363#define vload_partial_16_6 vload_partial_6 4364#define vload_partial_16_7 vload_partial_7 4365#define vload_partial_16_8 vload_partial_8 4366#define vload_partial_16_9 vload_partial_9 4367#define vload_partial_16_10 vload_partial_10 4368#define vload_partial_16_11 vload_partial_11 4369#define vload_partial_16_12 vload_partial_12 4370#define vload_partial_16_13 vload_partial_13 4371#define vload_partial_16_14 vload_partial_14 4372#define vload_partial_16_15 vload_partial_15 4373#define vload_partial_16_16 vload_partial_16 4374 4375 4376#define vload_partial_1(DATA, OFFSET, PTR) \ 4377 DATA.s0 = vload1(OFFSET, PTR); 4378 4379#define vload_partial_2(DATA, OFFSET, PTR) \ 4380 DATA.s01 = vload2(OFFSET, PTR); 4381 4382#define vload_partial_3(DATA, OFFSET, PTR) \ 4383 DATA.s012 = vload3(OFFSET, PTR); 4384 4385#define vload_partial_4(DATA, OFFSET, PTR) \ 4386 DATA.s0123 = vload4(OFFSET, PTR); 4387 4388#define vload_partial_5(DATA, OFFSET, PTR) \ 4389 vload_partial_4(DATA.s0123, OFFSET, PTR); 
\ 4390 DATA.s4 = vload1(OFFSET, PTR + 4); 4391 4392#define vload_partial_6(DATA, OFFSET, PTR) \ 4393 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4394 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 4395 4396#define vload_partial_7(DATA, OFFSET, PTR) \ 4397 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 4398 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 4399 4400#define vload_partial_8(DATA, OFFSET, PTR) \ 4401 DATA.s01234567 = vload8(OFFSET, PTR); 4402 4403#define vload_partial_9(DATA, OFFSET, PTR) \ 4404 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4405 DATA.s8 = vload1(OFFSET, PTR + 8); 4406 4407#define vload_partial_10(DATA, OFFSET, PTR) \ 4408 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4409 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 4410 4411#define vload_partial_11(DATA, OFFSET, PTR) \ 4412 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4413 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 4414 4415#define vload_partial_12(DATA, OFFSET, PTR) \ 4416 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4417 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 4418 4419#define vload_partial_13(DATA, OFFSET, PTR) \ 4420 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4421 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 4422 4423#define vload_partial_14(DATA, OFFSET, PTR) \ 4424 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4425 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 4426 4427#define vload_partial_15(DATA, OFFSET, PTR) \ 4428 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 4429 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 4430 4431#define vload_partial_16(DATA, OFFSET, PTR) \ 4432 DATA = vload16(OFFSET, PTR); 4433 4434 4435 4436#define PIXEL_UNIT4 1 4437#define PIXEL_UNIT8 2 4438#define PIXEL_UNIT16 4 4439 4440 4441#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 4442#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 4443 4444 4445#define read_image2d_floatx1(img, x_coord, y_coord) 
(float4)(read_imagef(img, (int2)(x_coord, y_coord))); 4446#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 4447#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 4448 4449#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4450#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 4451#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 4452#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 4453#endif 4454 4455#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 4456#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4457#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4458 4459#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 4460#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 4461#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, 
y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 4462#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 4463#endif 4464 4465 4466#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 4467#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 4468 4469 4470#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 4471#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 4472 4473#define VSTORE_STR(size) vstore##size 4474#define VSTORE(size) VSTORE_STR(size) 4475 4476#define float1 float 4477#define half1 half 4478#define char1 char 4479#define uchar1 uchar 4480#define short1 short 4481#define ushort1 ushort 4482#define int1 int 4483#define uint1 uint 4484#define long1 long 4485#define ulong1 ulong 4486#define double1 double 4487 4488#define vload1(OFFSET, PTR) *(OFFSET + PTR) 4489#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 4490 4491 4492#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 4493#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 4494 4495#define NO_STORE(data, offs, ptr) \ 4496 { \ 4497 } 4498 4499 4500#define vstore_partial_1_0 NO_STORE 4501#define vstore_partial_1_1 vstore1 4502#define vstore_partial_1_2 NO_STORE 4503#define vstore_partial_1_3 NO_STORE 4504#define vstore_partial_1_4 NO_STORE 4505#define vstore_partial_1_5 NO_STORE 4506#define vstore_partial_1_6 NO_STORE 4507#define vstore_partial_1_7 NO_STORE 
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

// Dispatch table vstore_partial_<vector_size>_<elements_to_store>.
// Entries whose store count is 0 or exceeds the vector size resolve to
// NO_STORE; valid entries forward to the vstore_partial_<count> implementation.
#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16


// Partial-store implementations: store exactly N leading lanes of DATA at
// PTR + OFFSET. Non-power-of-two counts are composed from smaller stores
// (e.g. 13 = 8 + 5), advancing PTR by the lanes already written.
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);




// Saturating-convert aliases. Floating-point destinations have no _sat
// variants in OpenCL C, so they alias the plain converts.
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
// NOTE(review): convert_half_sat aliases convert_float rather than
// convert_half -- looks suspicious but is kept as-is; confirm with callers
// before changing.
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

// Size-1 "vector" converts map onto the scalar convert built-ins.
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
// NOTE(review): the next five defines are self-referential no-ops, kept only
// for uniformity of the table.
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

// VEC_DATA_TYPE(float, 4) -> float4. The two-level expansion lets `type` and
// `size` be macros themselves.
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

// Vector type usable as the predicate operand of select() for a given element
// type: half/float map to integer types of matching element width.
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

// Signed integer vector type with the same element width as `type`.
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

// Horizontal reductions over the first `size` lanes of a vector.
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

// Kernel-parameter expansion helpers: declare the pointer, per-dimension
// stride/step pairs (in bytes) and first-element offset for a tensor of the
// given rank.
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes

// CONVERT_TO_*_STRUCT builds the per-work-item view; the NO_STEP variants
// pass 0 for the steps so the pointer is not advanced per work item.
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4849 4850#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 4851 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 4852 4853#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4854 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4855 4856#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 4857 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4858 name##_stride_z, name##_step_z) 4859 4860#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 4861 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 4862 4863#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 4864 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4865 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 4866 4867#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 4868 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 4869 4870#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 4871 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4872 name##_stride_z, name##_step_z) 4873 4874 4875typedef struct Vector 4876{ 4877 __global uchar *ptr; 4878 int 
offset_first_element_in_bytes;
    int stride_x;
} Vector;

// 2D tensor view: byte strides along x and y.
typedef struct Image
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
} Image;

// 3D tensor view: byte strides along x, y and z.
typedef struct Tensor3D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
} Tensor3D;

// 4D tensor view: byte strides along x, y, z and w.
typedef struct Tensor4D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
    int stride_w;
} Tensor4D;


// Builds a Vector view and advances ptr to this work item's position:
// ptr += offset_first_element_in_bytes + get_global_id(0) * step_x.
// step_x is the per-work-item advance in bytes (0 keeps every work item on
// the same base address).
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}


// Builds an Image view; ptr is advanced by global ids 0 and 1 scaled by the
// x/y steps (in bytes).
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}


// Collapses a 3D tensor into an Image view: global id 2 advances ptr by
// step_z, but the z stride itself is not kept in the returned struct.
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) *
step_z;
    return img;
}


// Builds a Tensor3D view; ptr is advanced by global ids 0/1/2 scaled by the
// x/y/z steps (in bytes).
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}


// Same metadata as update_tensor3D_workitem_ptr but leaves ptr untouched;
// the step arguments are accepted for signature symmetry and ignored.
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

// Builds a Tensor4D view. Global id 2 is split into z and w indices by
// mod_size: z = gid2 % mod_size advances by step_z, w = gid2 / mod_size
// advances by step_w.
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z,
        .stride_w                      = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}


// Byte address of element x within a Vector view.
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}


// Byte address of element x, y within an Image view.
inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x *
img->stride_x + y * img->stride_y;
}


// Byte address of element x, y, z within a Tensor3D view.
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}


// Byte address of element x, y, z, w within a Tensor4D view.
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}


// Converts a linear element index into a byte address inside a Tensor3D of
// the given width and height. `depth` is accepted but not needed by the math.
// Unlike the *_offset helpers above, this one also adds
// offset_first_element_in_bytes.
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif



// REPEAT_3_<N> expands P_X##_DEF(id, P_A, P_B, P_C) once per id, counting
// down from N-1 to 0 (ids above 9 use the hex digits A-F).
#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
    P_X##_DEF(1, P_A, P_B, P_C);       \
    REPEAT_3_1(P_X, P_A, P_B, P_C)
#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
    P_X##_DEF(2, P_A, P_B, P_C);       \
    REPEAT_3_2(P_X, P_A, P_B, P_C)
#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
    P_X##_DEF(3, P_A, P_B, P_C);       \
    REPEAT_3_3(P_X, P_A, P_B, P_C)
#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
    P_X##_DEF(4, P_A, P_B, P_C);       \
    REPEAT_3_4(P_X, P_A, P_B, P_C)
#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
    P_X##_DEF(5, P_A, P_B, P_C);       \
    REPEAT_3_5(P_X, P_A, P_B, P_C)
#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
    P_X##_DEF(6, P_A, P_B, P_C);       \
    REPEAT_3_6(P_X, P_A, P_B, P_C)
#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
    P_X##_DEF(7, P_A, P_B, P_C);       \
    REPEAT_3_7(P_X, P_A, P_B, P_C)
#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
    P_X##_DEF(8, P_A, P_B, P_C); \
5070 REPEAT_3_8(P_X, P_A, P_B, P_C) 5071#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 5072 P_X##_DEF(9, P_A, P_B, P_C); \ 5073 REPEAT_3_9(P_X, P_A, P_B, P_C) 5074#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 5075 P_X##_DEF(A, P_A, P_B, P_C); \ 5076 REPEAT_3_10(P_X, P_A, P_B, P_C) 5077#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 5078 P_X##_DEF(B, P_A, P_B, P_C); \ 5079 REPEAT_3_11(P_X, P_A, P_B, P_C) 5080#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 5081 P_X##_DEF(C, P_A, P_B, P_C); \ 5082 REPEAT_3_12(P_X, P_A, P_B, P_C) 5083#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 5084 P_X##_DEF(D, P_A, P_B, P_C); \ 5085 REPEAT_3_13(P_X, P_A, P_B, P_C) 5086#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 5087 P_X##_DEF(E, P_A, P_B, P_C); \ 5088 REPEAT_3_14(P_X, P_A, P_B, P_C) 5089#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 5090 P_X##_DEF(F, P_A, P_B, P_C); \ 5091 REPEAT_3_15(P_X, P_A, P_B, P_C) 5092 5093#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 5094#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 5095 5096 5097#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 5098#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 5099 P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 5100 REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 5101#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 5102 P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 5103 REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) 5104#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 5105 P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 5106 REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 5107#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ 5108 P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 5109 REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 5110#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ 5111 P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 5112 REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 5113#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 5114 P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 5115 REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 5116#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 5117 
P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 5118 REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 5119#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 5120 P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 5121 REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 5122#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 5123 P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 5124 REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 5125#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 5126 P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 5127 REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 5128#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 5129 P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 5130 REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 5131#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 5132 P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 5133 REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 5134#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 5135 P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 5136 REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 5137#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 5138 P_X##_DEF(E, P_A, P_B, P_C, P_D); \ 5139 REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 5140#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 5141 P_X##_DEF(F, P_A, P_B, P_C, P_D); \ 5142 REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 5143 5144#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 5145#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 5146 5147 5148#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 5149#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 5150 5151 5152#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 5153#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 5154 5155 5156#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 5157#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, 
VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 5158 5159 5160#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 5161#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 5162 5163 5164#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 5165#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 5166 5167 5168#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC 5169#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 5170 5171 5172#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 5173#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 5174 5175 5176#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 5177#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 5178 5179 5180#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 5181#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 5182 5183 5184#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 5185#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 5186 5187 5188#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 5189#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, 
RES_SHIFT) 5190 5191 5192#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ 5193 ({ \ 5194 VEC_DATA_TYPE(int, N0) \ 5195 VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 5196 VEC_DATA_TYPE(int, N0) \ 5197 VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 5198 VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 5199 }) 5200#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 5201 5202#endif 5203 5204#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) 5205 5206#define CONCAT(a, b) a##b 5207 5208#define ARM_DOT1(a, b, c) \ 5209 ({ \ 5210 c = fma(a, b, c); \ 5211 }) 5212#define ARM_DOT2(a, b, c) \ 5213 ({ \ 5214 c = fma(a.s0, b.s0, c); \ 5215 c = fma(a.s1, b.s1, c); \ 5216 }) 5217#define ARM_DOT3(a, b, c) \ 5218 ({ \ 5219 ARM_DOT2(a, b, c); \ 5220 c = fma((a.s2), (b.s2), c); \ 5221 }) 5222#define ARM_DOT4(a, b, c) \ 5223 ({ \ 5224 ARM_DOT3(a, b, c); \ 5225 c = fma((a.s3), (b.s3), c); \ 5226 }) 5227#define ARM_DOT8(a, b, c) \ 5228 ({ \ 5229 ARM_DOT4((a.lo), (b.lo), c); \ 5230 ARM_DOT4((a.hi), (b.hi), c); \ 5231 }) 5232#define ARM_DOT16(a, b, c) \ 5233 ({ \ 5234 ARM_DOT8((a.lo), (b.lo), c); \ 5235 ARM_DOT8((a.hi), (b.hi), c); \ 5236 }) 5237 5238#if N0 == 2 5239#define ARM_DOT_K0XN0(k0, a, b, c) \ 5240 ({ \ 5241 CONCAT(ARM_DOT, k0) \ 5242 ((a), (b##0), (c.s0)); \ 5243 CONCAT(ARM_DOT, k0) \ 5244 ((a), (b##1), (c.s1)); \ 5245 }) 5246#elif N0 == 3 5247#define ARM_DOT_K0XN0(k0, a, b, c) \ 5248 ({ \ 5249 CONCAT(ARM_DOT, k0) \ 5250 ((a), (b##0), (c.s0)); \ 5251 CONCAT(ARM_DOT, k0) \ 5252 ((a), (b##1), (c.s1)); \ 5253 CONCAT(ARM_DOT, k0) \ 5254 ((a), (b##2), (c.s2)); \ 5255 }) 5256#elif N0 == 4 5257#define ARM_DOT_K0XN0(k0, a, b, c) \ 5258 ({ \ 5259 
CONCAT(ARM_DOT, k0) \ 5260 ((a), (b##0), (c.s0)); \ 5261 CONCAT(ARM_DOT, k0) \ 5262 ((a), (b##1), (c.s1)); \ 5263 CONCAT(ARM_DOT, k0) \ 5264 ((a), (b##2), (c.s2)); \ 5265 CONCAT(ARM_DOT, k0) \ 5266 ((a), (b##3), (c.s3)); \ 5267 }) 5268#elif N0 == 8 5269#define ARM_DOT_K0XN0(k0, a, b, c) \ 5270 ({ \ 5271 CONCAT(ARM_DOT, k0) \ 5272 ((a), (b##0), (c.s0)); \ 5273 CONCAT(ARM_DOT, k0) \ 5274 ((a), (b##1), (c.s1)); \ 5275 CONCAT(ARM_DOT, k0) \ 5276 ((a), (b##2), (c.s2)); \ 5277 CONCAT(ARM_DOT, k0) \ 5278 ((a), (b##3), (c.s3)); \ 5279 CONCAT(ARM_DOT, k0) \ 5280 ((a), (b##4), (c.s4)); \ 5281 CONCAT(ARM_DOT, k0) \ 5282 ((a), (b##5), (c.s5)); \ 5283 CONCAT(ARM_DOT, k0) \ 5284 ((a), (b##6), (c.s6)); \ 5285 CONCAT(ARM_DOT, k0) \ 5286 ((a), (b##7), (c.s7)); \ 5287 }) 5288#elif N0 == 16 5289#define ARM_DOT_K0XN0(k0, a, b, c) \ 5290 ({ \ 5291 CONCAT(ARM_DOT, k0) \ 5292 ((a), (b##0), (c.s0)); \ 5293 CONCAT(ARM_DOT, k0) \ 5294 ((a), (b##1), (c.s1)); \ 5295 CONCAT(ARM_DOT, k0) \ 5296 ((a), (b##2), (c.s2)); \ 5297 CONCAT(ARM_DOT, k0) \ 5298 ((a), (b##3), (c.s3)); \ 5299 CONCAT(ARM_DOT, k0) \ 5300 ((a), (b##4), (c.s4)); \ 5301 CONCAT(ARM_DOT, k0) \ 5302 ((a), (b##5), (c.s5)); \ 5303 CONCAT(ARM_DOT, k0) \ 5304 ((a), (b##6), (c.s6)); \ 5305 CONCAT(ARM_DOT, k0) \ 5306 ((a), (b##7), (c.s7)); \ 5307 CONCAT(ARM_DOT, k0) \ 5308 ((a), (b##8), (c.s8)); \ 5309 CONCAT(ARM_DOT, k0) \ 5310 ((a), (b##9), (c.s9)); \ 5311 CONCAT(ARM_DOT, k0) \ 5312 ((a), (b##A), (c.sA)); \ 5313 CONCAT(ARM_DOT, k0) \ 5314 ((a), (b##B), (c.sB)); \ 5315 CONCAT(ARM_DOT, k0) \ 5316 ((a), (b##C), (c.sC)); \ 5317 CONCAT(ARM_DOT, k0) \ 5318 ((a), (b##D), (c.sD)); \ 5319 CONCAT(ARM_DOT, k0) \ 5320 ((a), (b##E), (c.sE)); \ 5321 CONCAT(ARM_DOT, k0) \ 5322 ((a), (b##F), (c.sF)); \ 5323 }) 5324#else 5325#error "N0 value not supported" 5326#endif 5327 5328#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T) 5329 5330__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), 5331 IMAGE_DECLARATION(rhs), 5332#if defined(BETA) 5333 
IMAGE_DECLARATION(bias),
#endif // defined(BETA)
    IMAGE_DECLARATION(dst),
    uint lhs_stride_z,
    uint rhs_stride_z,
#if defined(BETA)
    uint bias_stride_z,
#endif // defined(BETA)
    uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
    ,
    uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
    ,
    uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
    ,
    const int M,
    const int N,
    const int K)
{
    // Size in elements of one reshaped RHS block (K0 x N0).
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // X offset/step (in elements) used to walk the reshaped RHS buffer.
    // The stride pattern depends on whether the RHS blocks were interleaved
    // (H0 blocks interleaved per row) during the reshape.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Flags consumed by the boundary-aware load/store helpers below:
    // cond_y: first output row-block (may be a partial M tile);
    // cond_x: last output col-block (may be a partial N tile).
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items added only to round up the launch grid: nothing to compute.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS element processed by this work-item.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset into the reshaped RHS matrix.
    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Batched case where RHS has fewer slices than LHS: broadcast over z.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Per-row z-offsets for LHS rows and a zero vector for helpers that need one.
    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Compute per-row z offsets that skip the cross-plane padding of the
    // 3D-reinterpreted LHS tensor.
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    lhs_offset += z * lhs_stride_z;
#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators: M0 vectors of N0 elements, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0,
                             VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    // Main loop: consume K0 accumulations per iteration.
    for(; i <= (K - K0); i += K0)
    {
        // Load M0 x K0 block of LHS and N0 x K0 block of (transposed) RHS.
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        // c[m] += dot(a[m], b) for each of the M0 rows (compile-time unrolled).
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
    }

    // Leftover loop: the remaining K % K0 accumulations, one at a time.
    for(; i < K; ++i)
    {
        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

        ARM_DOT_K0XN0(1, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(1, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(1, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(1, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(1, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(1, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(1, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(1, a7, b, c7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += sizeof(DATA_TYPE);
    }

    // Address of the output tile for this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets skipping the cross-plane padding of the 3D output.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

#if defined(ALPHA)
    // c = alpha * c
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Bias is a single row broadcast across all M0 output rows.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA
    // c = c + beta * bias (row-broadcast)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#else // defined(BROADCAST_BIAS)
    // Full bias matrix: one bias tile per output tile.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA
    // c = c + beta * bias
    ADD_BLOCK(M0, c, bias);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // Fused activation applied to the accumulators before the store.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block, clamping partial tiles at the M/N borders.
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif // kernel guard (opened before this chunk)

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)

// Same GEMM variant as above (reshaped + transposed RHS), but the RHS is read
// through an OpenCL 2D image (texture path) instead of a buffer.
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t
rhs_img,
#if defined(BETA)
    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
    IMAGE_DECLARATION(dst),
    uint lhs_stride_z,
    uint rhs_stride_z,
#if defined(BETA)
    uint bias_stride_z,
#endif // defined(BETA)
    uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
    ,
    uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
    ,
    uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
    ,
    const int M,
    const int N,
    const int K)
{
    // Number of image pixels needed to hold K0 elements of DATA_TYPE.
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Remainder of K not covered by the K0-wide main loop.
    const uint LEFTOVER_K = K % K0;

    // Size in pixels of one reshaped RHS block.
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // X offset/step (in pixels) into the RHS texture, depending on interleave.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Border flags for the boundary-aware load/store helpers.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Grid round-up work-items: nothing to compute.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS element processed by this work-item.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Batched case where RHS has fewer slices than LHS: broadcast over z.
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Texture coordinates (in pixels) of this work-item's RHS block.
    uint x_rhs       = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Per-row z offsets that skip the cross-plane padding of the 3D LHS.
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    lhs_offset += z * lhs_stride_z;
#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators: M0 vectors of N0 elements, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    // Main loop: K0 accumulations per iteration, RHS fetched from the texture.
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // c[m] += dot(a[m], b) for each of the M0 rows.
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Texture reads always return full pixels, so the K leftover cannot be
    // read element-wise: read a whole K0 block and zero-pad the LHS instead.
    if(LEFTOVER_K != 0)
    {
        // Union giving both scalar (s[]) and vector (v) views of a K0 block,
        // so the leftover elements can be filled one-by-one.
        union UNION_VEC_TYPE
        {
            DATA_TYPE s[K0];
            VEC_DATA_TYPE(DATA_TYPE, K0)
            v;
        };

        union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
        union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
        union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
        union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
        union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
        union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
        union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
        union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

        // RHS of the leftover block: full K0-wide texture read (padded data).
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Fill only the first LEFTOVER_K lanes of each LHS row; the rest stay 0
        // so they contribute nothing to the dot product.
        for(int k = 0; k < LEFTOVER_K; ++k)
        {
            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

            lhs_offset += sizeof(DATA_TYPE);
        }

        // Full-width dot product; zero lanes of a are harmless.
        ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7
    }

    // Address of the output tile for this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets skipping the cross-plane padding of the 3D output.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else // defined(REINTERPRET_OUTPUT_AS_3D)
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

#if defined(ALPHA)
    // c = alpha * c
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Bias is a single row broadcast across all M0 output rows.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA
    // c = c + beta * bias (row-broadcast)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#else // defined(BROADCAST_BIAS)
    // Full bias matrix: one bias tile per output tile.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA
    // c = c + beta * bias
    ADD_BLOCK(M0, c, bias);
#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // Fused activation applied before the store.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block, clamping partial tiles at the M/N borders.
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)

// Fused multiply-accumulate helper: c = a * b + c.
#define VFMA(a, b, c) \
    ({                \
        c = fma(a, b, c); \
    })

// VFMA_M0xN0(i, a, b, c): for each of the M0 accumulator rows, multiply lane i
// of the corresponding LHS vector (broadcast to an N0-wide vector) by the RHS
// row vector b and accumulate into the row's accumulator. One ladder entry per
// supported M0.
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)

// GEMM with LHS non-reshaped and RHS reshaped (non-transposed blocks):
// dst = alpha * lhs * rhs (+ beta * bias), with optional fused activation.
__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
                                           IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                           IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                           IMAGE_DECLARATION(dst),
                                           uint lhs_stride_z,
                                           uint rhs_stride_z,
#if defined(BETA)
                                           uint bias_stride_z,
#endif // defined(BETA)
                                           uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                           ,
                                           uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                           ,
                                           uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                           ,
                                           const int M,
                                           const int N,
                                           const int K)
{
    // Size in elements of one reshaped RHS block (K0 x N0).
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // X offset/step (in elements) into the reshaped (non-transposed) RHS.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Border flags for the boundary-aware load/store helpers.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Grid round-up work-items: nothing to compute.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS element processed by this work-item.
    uint
lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset into the reshaped RHS matrix.
    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Batched case where RHS has fewer slices than LHS: broadcast over z.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Per-row z offsets that skip the cross-plane padding of the 3D LHS.
    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    lhs_offset += z * lhs_stride_z;
#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators: M0 vectors of N0 elements, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    // Main loop: K0 accumulations per iteration. For each k in [0, K0) the RHS
    // row is re-loaded into b0 and lane k of each LHS row is FMA'd against it.
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(0, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(4, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(5, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(6, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        // Lane indices 8..15 use OpenCL hex component suffixes (.s8 .. .sF).
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(8, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(9, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(A, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(B, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(C, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(D, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(E, a, b0, c);
        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
    }

    // Leftover loop: remaining K % K0 accumulations, one at a time.
    for(; i < K; ++i)
    {
        // NOTE(review): each a<n> is a 2-element vector initialised from a
        // scalar load (the scalar broadcasts to both lanes per OpenCL C rules);
        // only lane 0 is consumed by VFMA_M0xN0(0, ...).
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
    }

    // Address of the output tile for this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets skipping the cross-plane padding of the 3D output.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

#if defined(ALPHA)
    // c = alpha * c
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Bias is a single row broadcast across all M0 output rows.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + beta * bias (row-broadcast)
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    // Full bias matrix: one bias tile per output tile.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + beta * bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    // Fused activation applied before the store.
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store the output block, clamping partial tiles at the M/N borders.
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)

// Same GEMM variant as above (reshaped non-transposed RHS), but the RHS is
// read through an OpenCL 2D image (texture path) instead of a buffer.
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif // defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // defined(REINTERPRET_INPUT_AS_3D)
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   const int M,
                                                   const int N,
                                                   const int K)
{
    // Number of image pixels needed to hold N0 elements of DATA_TYPE.
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Size in pixels of one reshaped RHS block.
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // X offset/step (in pixels) into the RHS texture, depending on interleave.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP 1
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Border flags for the boundary-aware load/store helpers.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Grid round-up work-items: nothing to compute.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Byte offset of the first LHS element processed by this work-item.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Batched case where RHS has fewer slices than LHS: broadcast over z.
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Texture coordinates (in pixels) of this work-item's RHS block.
    uint x_rhs       = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Per-row z offsets that skip the cross-plane padding of the 3D LHS.
    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_INPUT_AS_3D)
    lhs_offset += z * lhs_stride_z;
#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Accumulators: M0 vectors of N0 elements, zero-initialised.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
    // Main loop: K0 accumulations per iteration; RHS rows come from the texture.
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        // Lane indices 8..15 use OpenCL hex component suffixes (.s8 .. .sF).
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Leftover loop: remaining K % K0 accumulations, one at a time.
    for(; i < K; ++i)
    {
        // NOTE(review): each a<n> is a 2-element vector initialised from a
        // scalar load (scalar broadcasts to both lanes per OpenCL C rules);
        // only lane 0 is consumed by VFMA_M0xN0(0, ...).
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    // Address of the output tile for this work-item.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // Per-row z offsets skipping the cross-plane padding of the 3D output.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else  // defined(REINTERPRET_OUTPUT_AS_3D)
    dst_addr += z * dst_stride_z;
#endif // defined(REINTERPRET_OUTPUT_AS_3D)

#if defined(ALPHA)
    // c = alpha * c
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

#if defined(BETA)
#if defined(BROADCAST_BIAS)
6427 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); 6428 6429 LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 6430 6431#ifndef UNIT_BETA 6432 SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 6433#endif 6434 6435 6436 ADD_BLOCK_BROADCAST(M0, c, bias0); 6437 6438#else 6439 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; 6440 6441 LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6442 6443#ifndef UNIT_BETA 6444 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 6445#endif 6446 6447 6448 ADD_BLOCK(M0, c, bias); 6449 6450#endif 6451#endif 6452 6453#if defined(ACTIVATION_TYPE) 6454 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 6455#endif 6456 6457 6458 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6459 6460#undef RHS_BLOCK_SIZE 6461#undef RHS_OFFSET_X 6462#undef RHS_STEP_X 6463#undef RHS_STEP_LOOP 6464} 6465#endif 6466#endif 6467 6468#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) 6469 6470#if defined(MIXED_PRECISION) 6471#if K0 == 2 6472#define ARM_DOT_K0(a, b, c) \ 6473 ({ \ 6474 c += a.s0 * b.s0; \ 6475 c += a.s1 * b.s1; \ 6476 }) 6477#elif K0 == 3 6478#define ARM_DOT_K0(a, b, c) \ 6479 ({ \ 6480 c += a.s0 * b.s0; \ 6481 c += a.s1 * b.s1; \ 6482 c += a.s2 * b.s2; \ 6483 }) 6484#elif K0 == 4 6485#define ARM_DOT_K0(a, b, c) \ 6486 ({ \ 6487 c += a.s0 * b.s0; \ 6488 c += a.s1 * b.s1; \ 6489 c += a.s2 * b.s2; \ 6490 c += a.s3 * b.s3; \ 6491 }) 6492#elif K0 == 8 6493#define ARM_DOT_K0(a, b, c) \ 6494 ({ \ 6495 c += 
a.s0 * b.s0; \ 6496 c += a.s1 * b.s1; \ 6497 c += a.s2 * b.s2; \ 6498 c += a.s3 * b.s3; \ 6499 c += a.s4 * b.s4; \ 6500 c += a.s5 * b.s5; \ 6501 c += a.s6 * b.s6; \ 6502 c += a.s7 * b.s7; \ 6503 }) 6504#elif K0 == 16 6505#define ARM_DOT_K0(a, b, c) \ 6506 ({ \ 6507 c += a.s0 * b.s0; \ 6508 c += a.s1 * b.s1; \ 6509 c += a.s2 * b.s2; \ 6510 c += a.s3 * b.s3; \ 6511 c += a.s4 * b.s4; \ 6512 c += a.s5 * b.s5; \ 6513 c += a.s6 * b.s6; \ 6514 c += a.s7 * b.s7; \ 6515 c += a.s8 * b.s8; \ 6516 c += a.s9 * b.s9; \ 6517 c += a.sA * b.sA; \ 6518 c += a.sB * b.sB; \ 6519 c += a.sC * b.sC; \ 6520 c += a.sD * b.sD; \ 6521 c += a.sE * b.sE; \ 6522 c += a.sF * b.sF; \ 6523 }) 6524#else 6525#error "K0 value not supported" 6526#endif 6527#else 6528#if K0 == 2 6529#define ARM_DOT_K0(a, b, c) \ 6530 ({ \ 6531 c = fma(a.s0, b.s0, c); \ 6532 c = fma(a.s1, b.s1, c); \ 6533 }) 6534#elif K0 == 3 6535#define ARM_DOT_K0(a, b, c) \ 6536 ({ \ 6537 c = fma(a.s0, b.s0, c); \ 6538 c = fma(a.s1, b.s1, c); \ 6539 c = fma(a.s2, b.s2, c); \ 6540 }) 6541#elif K0 == 4 6542#define ARM_DOT_K0(a, b, c) \ 6543 ({ \ 6544 c = fma(a.s0, b.s0, c); \ 6545 c = fma(a.s1, b.s1, c); \ 6546 c = fma(a.s2, b.s2, c); \ 6547 c = fma(a.s3, b.s3, c); \ 6548 }) 6549#elif K0 == 8 6550#define ARM_DOT_K0(a, b, c) \ 6551 ({ \ 6552 c = fma(a.s0, b.s0, c); \ 6553 c = fma(a.s1, b.s1, c); \ 6554 c = fma(a.s2, b.s2, c); \ 6555 c = fma(a.s3, b.s3, c); \ 6556 c = fma(a.s4, b.s4, c); \ 6557 c = fma(a.s5, b.s5, c); \ 6558 c = fma(a.s6, b.s6, c); \ 6559 c = fma(a.s7, b.s7, c); \ 6560 }) 6561#elif K0 == 16 6562#define ARM_DOT_K0(a, b, c) \ 6563 ({ \ 6564 c = fma(a.s0, b.s0, c); \ 6565 c = fma(a.s1, b.s1, c); \ 6566 c = fma(a.s2, b.s2, c); \ 6567 c = fma(a.s3, b.s3, c); \ 6568 c = fma(a.s4, b.s4, c); \ 6569 c = fma(a.s5, b.s5, c); \ 6570 c = fma(a.s6, b.s6, c); \ 6571 c = fma(a.s7, b.s7, c); \ 6572 c = fma(a.s8, b.s8, c); \ 6573 c = fma(a.s9, b.s9, c); \ 6574 c = fma(a.sA, b.sA, c); \ 6575 c = fma(a.sB, b.sB, c); \ 6576 c = fma(a.sC, 
b.sC, c); \ 6577 c = fma(a.sD, b.sD, c); \ 6578 c = fma(a.sE, b.sE, c); \ 6579 c = fma(a.sF, b.sF, c); \ 6580 }) 6581#else 6582#error "K0 value not supported" 6583#endif 6584#endif 6585 6586#if defined(ARM_DOT_K0XN0) 6587#undef ARM_DOT_K0XN0 6588#endif 6589 6590#if N0 == 2 6591#define ARM_DOT_K0XN0(a, b, c) \ 6592 ({ \ 6593 ARM_DOT_K0((a), (b##0), (c.s0)); \ 6594 ARM_DOT_K0((a), (b##1), (c.s1)); \ 6595 }) 6596#elif N0 == 3 6597#define ARM_DOT_K0XN0(a, b, c) \ 6598 ({ \ 6599 ARM_DOT_K0((a), (b##0), (c.s0)); \ 6600 ARM_DOT_K0((a), (b##1), (c.s1)); \ 6601 ARM_DOT_K0((a), (b##2), (c.s2)); \ 6602 }) 6603#elif N0 == 4 6604#define ARM_DOT_K0XN0(a, b, c) \ 6605 ({ \ 6606 ARM_DOT_K0((a), (b##0), (c.s0)); \ 6607 ARM_DOT_K0((a), (b##1), (c.s1)); \ 6608 ARM_DOT_K0((a), (b##2), (c.s2)); \ 6609 ARM_DOT_K0((a), (b##3), (c.s3)); \ 6610 }) 6611#elif N0 == 8 6612#define ARM_DOT_K0XN0(a, b, c) \ 6613 ({ \ 6614 ARM_DOT_K0((a), (b##0), (c.s0)); \ 6615 ARM_DOT_K0((a), (b##1), (c.s1)); \ 6616 ARM_DOT_K0((a), (b##2), (c.s2)); \ 6617 ARM_DOT_K0((a), (b##3), (c.s3)); \ 6618 ARM_DOT_K0((a), (b##4), (c.s4)); \ 6619 ARM_DOT_K0((a), (b##5), (c.s5)); \ 6620 ARM_DOT_K0((a), (b##6), (c.s6)); \ 6621 ARM_DOT_K0((a), (b##7), (c.s7)); \ 6622 }) 6623#elif N0 == 16 6624#define ARM_DOT_K0XN0(a, b, c) \ 6625 ({ \ 6626 ARM_DOT_K0((a), (b##0), (c.s0)); \ 6627 ARM_DOT_K0((a), (b##1), (c.s1)); \ 6628 ARM_DOT_K0((a), (b##2), (c.s2)); \ 6629 ARM_DOT_K0((a), (b##3), (c.s3)); \ 6630 ARM_DOT_K0((a), (b##4), (c.s4)); \ 6631 ARM_DOT_K0((a), (b##5), (c.s5)); \ 6632 ARM_DOT_K0((a), (b##6), (c.s6)); \ 6633 ARM_DOT_K0((a), (b##7), (c.s7)); \ 6634 ARM_DOT_K0((a), (b##8), (c.s8)); \ 6635 ARM_DOT_K0((a), (b##9), (c.s9)); \ 6636 ARM_DOT_K0((a), (b##A), (c.sA)); \ 6637 ARM_DOT_K0((a), (b##B), (c.sB)); \ 6638 ARM_DOT_K0((a), (b##C), (c.sC)); \ 6639 ARM_DOT_K0((a), (b##D), (c.sD)); \ 6640 ARM_DOT_K0((a), (b##E), (c.sE)); \ 6641 ARM_DOT_K0((a), (b##F), (c.sF)); \ 6642 }) 6643#else 6644#error "N0 value not supported" 6645#endif 
6646 6647#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T) 6648 6649__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), 6650 IMAGE_DECLARATION(rhs), 6651#if defined(BETA) 6652 IMAGE_DECLARATION(bias), 6653#endif 6654 IMAGE_DECLARATION(dst), 6655 uint lhs_stride_z, 6656 uint rhs_stride_z, 6657#if defined(BETA) 6658 uint bias_stride_z, 6659#endif 6660 uint dst_stride_z 6661#if defined(REINTERPRET_OUTPUT_AS_3D) 6662 , 6663 uint dst_cross_plane_pad 6664#endif 6665 , 6666 const int M, 6667 const int N, 6668 const int K) 6669{ 6670 6671#define LHS_BLOCK_SIZE ((K0) * (M0)) 6672 6673#if defined(LHS_INTERLEAVE) 6674#define LHS_OFFSET_X (K0) 6675#define LHS_STEP_X ((K0) * (V0)) 6676#define LHS_STEP_LOOP (1) 6677#else 6678#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 6679#define LHS_STEP_X (K0) 6680#define LHS_STEP_LOOP (V0) 6681#endif 6682 6683 6684#define RHS_BLOCK_SIZE ((K0) * (N0)) 6685 6686 6687#if defined(RHS_INTERLEAVE) 6688#define RHS_OFFSET_X (K0) 6689#define RHS_STEP_X ((K0) * (H0)) 6690#define RHS_STEP_LOOP (1) 6691#else 6692#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 6693#define RHS_STEP_X (K0) 6694#define RHS_STEP_LOOP (H0) 6695#endif 6696 6697#if defined(DUMMY_WORK_ITEMS) 6698 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) 6699 { 6700 return; 6701 } 6702#endif 6703 6704 6705 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + 6706 (get_global_id(2) * lhs_stride_z); 6707 6708 6709 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y; 6710 6711#if defined(MATRIX_B_DEPTH) 6712 6713 rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; 6714#else 6715 rhs_addr += get_global_id(2) * rhs_stride_z; 6716#endif 6717 6718 6719 REPEAT_VAR_INIT_TO_CONST(M0, 
VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); 6720 6721 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); 6722 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); 6723 6724 for(int i = 0; i < K; i += K0) 6725 { 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); 6737 6738 6739 LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); 6740 6741 6742 ARM_DOT_K0XN0(a0, b, c0); 6743#if M0 > 1 6744 ARM_DOT_K0XN0(a1, b, c1); 6745#endif 6746#if M0 > 2 6747 ARM_DOT_K0XN0(a2, b, c2); 6748#endif 6749#if M0 > 3 6750 ARM_DOT_K0XN0(a3, b, c3); 6751#endif 6752#if M0 > 4 6753 ARM_DOT_K0XN0(a4, b, c4); 6754#endif 6755#if M0 > 5 6756 ARM_DOT_K0XN0(a5, b, c5); 6757#endif 6758#if M0 > 6 6759 ARM_DOT_K0XN0(a6, b, c6); 6760#endif 6761#if M0 > 7 6762 ARM_DOT_K0XN0(a7, b, c7); 6763#endif 6764 6765 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); 6766 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); 6767 } 6768 6769 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); 6770 6771 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); 6772 6773 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); 6774 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); 6775 6776#if defined(REINTERPRET_OUTPUT_AS_3D) 6777 6778 6779 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 6780 6781 6782 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; 6783 6784#else 6785 6786 6787 dst_addr += get_global_id(2) * dst_stride_z; 6788 6789#endif 6790 6791 6792#if defined(ALPHA) 6793 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 6794#endif 6795 6796 6797#if defined(BETA) 6798#if defined(BROADCAST_BIAS) 6799 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + 
(get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); 6800 6801 LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 6802 6803#ifndef UNIT_BETA 6804 SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 6805#endif 6806 6807 6808#if defined(MIXED_PRECISION) 6809 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 6810 ADD_BLOCK_BROADCAST(M0, c, bias_hp0); 6811#else 6812 ADD_BLOCK_BROADCAST(M0, c, bias0); 6813#endif 6814 6815#else 6816 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( 6817 2) * bias_stride_z; 6818 6819 LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6820 6821#ifndef UNIT_BETA 6822 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 6823#endif 6824 6825 6826#if defined(MIXED_PRECISION) 6827 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 6828 ADD_BLOCK(M0, c, bias_hp); 6829#else 6830 ADD_BLOCK(M0, c, bias); 6831#endif 6832 6833#endif 6834#endif 6835 6836#if defined(ACTIVATION_TYPE) 6837#if defined(MIXED_PRECISION) 6838 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL); 6839#else 6840 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 6841#endif 6842#endif 6843 6844 6845#if defined(MIXED_PRECISION) 6846 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); 6847 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6848#else 6849 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 6850#endif 6851 6852#undef LHS_BLOCK_SIZE 6853#undef LHS_OFFSET_X 6854#undef LHS_STEP_X 6855#undef RHS_BLOCK_SIZE 6856#undef RHS_OFFSET_X 6857#undef RHS_STEP_X 6858#undef LHS_STEP_LOOP 
6859#undef RHS_STEP_LOOP 6860} 6861#endif 6862 6863#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE) 6864 6865__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), 6866 __read_only image2d_t rhs_img, 6867#if defined(BETA) 6868 IMAGE_DECLARATION(bias), 6869#endif 6870 IMAGE_DECLARATION(dst), 6871 uint lhs_stride_z, 6872 uint rhs_stride_z, 6873#if defined(BETA) 6874 uint bias_stride_z, 6875#endif 6876 uint dst_stride_z 6877#if defined(REINTERPRET_OUTPUT_AS_3D) 6878 , 6879 uint dst_cross_plane_pad 6880#endif 6881 , 6882 const int M, 6883 const int N, 6884 const int K) 6885{ 6886 6887#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) 6888 6889 6890#define LHS_BLOCK_SIZE ((K0) * (M0)) 6891 6892#if defined(LHS_INTERLEAVE) 6893#define LHS_OFFSET_X (K0) 6894#define LHS_STEP_X ((K0) * (V0)) 6895#define LHS_STEP_LOOP (1) 6896#else 6897#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 6898#define LHS_STEP_X (K0) 6899#define LHS_STEP_LOOP (V0) 6900#endif 6901 6902 6903#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) 6904 6905 6906#if defined(RHS_INTERLEAVE) 6907#define RHS_OFFSET_X (PIXEL_UNIT) 6908#define RHS_STEP_X (PIXEL_UNIT * (H0)) 6909#define RHS_STEP_LOOP (1) 6910#else 6911#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 6912#define RHS_STEP_X PIXEL_UNIT 6913#define RHS_STEP_LOOP (H0) 6914#endif 6915 6916#if defined(DUMMY_WORK_ITEMS) 6917 if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) 6918 { 6919 return; 6920 } 6921#endif 6922 6923 6924 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + 6925 (get_global_id(2) * lhs_stride_z); 6926 6927#if defined(MATRIX_B_DEPTH) 6928 6929 const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); 6930#else 6931 const uint z_rhs = get_global_id(2); 6932#endif 6933 6934 6935 uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; 6936 const uint y_rhs = 
(get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; 6937 6938 6939 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); 6940 6941 REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); 6942 REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); 6943 6944 for(int i = 0; i < K; i += K0) 6945 { 6946 6947 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); 6948 6949 6950 REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); 6951 LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); 6952 6953 6954 ARM_DOT_K0XN0(a0, b, c0); 6955#if M0 > 1 6956 ARM_DOT_K0XN0(a1, b, c1); 6957#endif 6958#if M0 > 2 6959 ARM_DOT_K0XN0(a2, b, c2); 6960#endif 6961#if M0 > 3 6962 ARM_DOT_K0XN0(a3, b, c3); 6963#endif 6964#if M0 > 4 6965 ARM_DOT_K0XN0(a4, b, c4); 6966#endif 6967#if M0 > 5 6968 ARM_DOT_K0XN0(a5, b, c5); 6969#endif 6970#if M0 > 6 6971 ARM_DOT_K0XN0(a6, b, c6); 6972#endif 6973#if M0 > 7 6974 ARM_DOT_K0XN0(a7, b, c7); 6975#endif 6976 6977 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); 6978 6979 x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; 6980 } 6981 6982 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); 6983 6984 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); 6985 6986 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); 6987 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); 6988 6989#if defined(REINTERPRET_OUTPUT_AS_3D) 6990 6991 6992 CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 6993 6994 6995 dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; 6996 6997#else 6998 6999 7000 dst_addr += get_global_id(2) * dst_stride_z; 7001 7002#endif 7003 7004 7005#if defined(ALPHA) 7006 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 7007#endif 7008 7009 7010#if defined(BETA) 7011#if 
defined(BROADCAST_BIAS) 7012 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); 7013 7014 LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 7015 7016#ifndef UNIT_BETA 7017 SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 7018#endif 7019 7020 7021#if defined(MIXED_PRECISION) 7022 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7023 ADD_BLOCK_BROADCAST(M0, c, bias_hp0); 7024#else 7025 ADD_BLOCK_BROADCAST(M0, c, bias0); 7026#endif 7027 7028#else 7029 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( 7030 2) * bias_stride_z; 7031 7032 LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7033 7034#ifndef UNIT_BETA 7035 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 7036#endif 7037 7038 7039#if defined(MIXED_PRECISION) 7040 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7041 ADD_BLOCK(M0, c, bias_hp); 7042#else 7043 ADD_BLOCK(M0, c, bias); 7044#endif 7045 7046#endif 7047#endif 7048 7049#if defined(ACTIVATION_TYPE) 7050#if defined(MIXED_PRECISION) 7051 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL); 7052#else 7053 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 7054#endif 7055#endif 7056 7057 7058#if defined(MIXED_PRECISION) 7059 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); 7060 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7061#else 7062 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7063#endif 7064 7065#undef LHS_BLOCK_SIZE 7066#undef LHS_OFFSET_X 7067#undef 
LHS_STEP_X 7068#undef RHS_BLOCK_SIZE 7069#undef RHS_OFFSET_X 7070#undef RHS_STEP_X 7071#undef PIXEL_UNIT 7072#undef LHS_STEP_LOOP 7073#undef RHS_STEP_LOOP 7074} 7075#endif 7076 7077#if defined(LHS_TRANSPOSE) 7078 7079#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) 7080 7081#if defined(MIXED_PRECISION) 7082 7083#if(GPU_ARCH == GPU_ARCH_MIDGARD) 7084#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); 7085#else 7086#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); 7087#endif 7088 7089#else 7090 7091#if(GPU_ARCH == GPU_ARCH_MIDGARD) 7092#define ARM_VFMA(N0, a, b, c) c += (a) * (b); 7093#else 7094#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); 7095#endif 7096 7097#endif 7098 7099#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \ 7100 ({ \ 7101 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \ 7102 }) 7103#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ 7104 ({ \ 7105 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ 7106 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ 7107 }) 7108#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ 7109 ({ \ 7110 ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ 7111 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ 7112 }) 7113#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ 7114 ({ \ 7115 ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ 7116 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ 7117 }) 7118#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ 7119 ({ \ 7120 ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ 7121 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ 7122 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ 7123 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ 7124 ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ 7125 }) 7126 7127 7128 7129 7130 7131 7132 7133#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) 
ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) 7134 7135#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ 7136 ({ \ 7137 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ 7138 }) 7139#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ 7140 ({ \ 7141 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ 7142 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ 7143 }) 7144#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ 7145 ({ \ 7146 ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ 7147 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ 7148 }) 7149#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ 7150 ({ \ 7151 ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ 7152 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ 7153 }) 7154#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ 7155 ({ \ 7156 ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ 7157 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ 7158 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ 7159 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ 7160 ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ 7161 }) 7162#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ 7163 ({ \ 7164 ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ 7165 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ 7166 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ 7167 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ 7168 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ 7169 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ 7170 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ 7171 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ 7172 ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ 7173 }) 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ 7184 CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ 7185 (M0, N0, TYPE, A, B, C) 7186 7187#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT) 7188 7189__kernel void 
gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), 7190 IMAGE_DECLARATION(rhs), 7191#if defined(BETA) 7192 IMAGE_DECLARATION(bias), 7193#endif 7194 IMAGE_DECLARATION(dst), 7195 uint lhs_stride_z, 7196 uint rhs_stride_z, 7197#if defined(BETA) 7198 uint bias_stride_z, 7199#endif 7200 uint dst_stride_z 7201#if defined(REINTERPRET_OUTPUT_AS_3D) 7202 , 7203 uint dst_cross_plane_pad 7204#endif 7205 , 7206 const int M, 7207 const int N, 7208 const int K) 7209{ 7210 7211#define LHS_BLOCK_SIZE ((K0) * (M0)) 7212 7213#if defined(LHS_INTERLEAVE) 7214#define LHS_OFFSET_X (M0) 7215#define LHS_STEP_X ((M0) * (V0)) 7216#define LHS_STEP_LOOP (1) 7217#else 7218#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 7219#define LHS_STEP_X (M0) 7220#define LHS_STEP_LOOP (V0) 7221#endif 7222 7223 7224#define RHS_BLOCK_SIZE ((K0) * (N0)) 7225 7226 7227#if defined(RHS_INTERLEAVE) 7228#define RHS_OFFSET_X (N0) 7229#define RHS_STEP_X ((N0) * (H0)) 7230#else 7231#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 7232#define RHS_STEP_X (N0) 7233#endif 7234 7235 const uint x = get_global_id(0); 7236 const uint y = get_global_id(1); 7237 const uint z = get_global_id(2); 7238 7239 const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); 7240 const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); 7241 7242#if defined(DUMMY_WORK_ITEMS) 7243 if((x * N0 >= N) || (y * M0 >= M)) 7244 { 7245 return; 7246 } 7247#endif 7248 7249 7250 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); 7251 7252 7253 __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; 7254 7255#if defined(MATRIX_B_DEPTH) 7256 7257 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; 7258#else 7259 rhs_addr += z * rhs_stride_z; 7260#endif 7261 7262 7263 REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); 7264 7265 
REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); 7266 7267 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); 7268 __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); 7269 7270 for(int i = 0; i < K; i += K0) 7271 { 7272 VEC_DATA_TYPE(DATA_TYPE, M0) 7273 a0; 7274 VEC_DATA_TYPE(DATA_TYPE, N0) 7275 b0; 7276 7277 a0 = VLOAD(M0)(0, lhs); 7278 b0 = VLOAD(N0)(0, rhs); 7279 7280 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7281 7282 lhs += LHS_STEP_X; 7283 rhs += RHS_STEP_X; 7284 7285#if K0 > 1 7286 a0 = VLOAD(M0)(0, lhs); 7287 b0 = VLOAD(N0)(0, rhs); 7288 7289 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7290 7291 lhs += LHS_STEP_X; 7292 rhs += RHS_STEP_X; 7293#endif 7294 7295#if K0 > 2 7296 a0 = VLOAD(M0)(0, lhs); 7297 b0 = VLOAD(N0)(0, rhs); 7298 7299 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7300 7301 lhs += LHS_STEP_X; 7302 rhs += RHS_STEP_X; 7303#endif 7304 7305#if K0 > 3 7306 a0 = VLOAD(M0)(0, lhs); 7307 b0 = VLOAD(N0)(0, rhs); 7308 7309 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7310 7311 lhs += LHS_STEP_X; 7312 rhs += RHS_STEP_X; 7313#endif 7314 7315#if K0 > 4 7316 a0 = VLOAD(M0)(0, lhs); 7317 b0 = VLOAD(N0)(0, rhs); 7318 7319 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7320 7321 lhs += LHS_STEP_X; 7322 rhs += RHS_STEP_X; 7323 7324 a0 = VLOAD(M0)(0, lhs); 7325 b0 = VLOAD(N0)(0, rhs); 7326 7327 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7328 7329 lhs += LHS_STEP_X; 7330 rhs += RHS_STEP_X; 7331 7332 a0 = VLOAD(M0)(0, lhs); 7333 b0 = VLOAD(N0)(0, rhs); 7334 7335 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7336 7337 lhs += LHS_STEP_X; 7338 rhs += RHS_STEP_X; 7339 7340 a0 = VLOAD(M0)(0, lhs); 7341 b0 = VLOAD(N0)(0, rhs); 7342 7343 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7344 7345 lhs += LHS_STEP_X; 7346 rhs += RHS_STEP_X; 7347#endif 7348 7349#if K0 > 8 7350 a0 = VLOAD(M0)(0, lhs); 7351 b0 = VLOAD(N0)(0, rhs); 7352 7353 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7354 7355 lhs += LHS_STEP_X; 7356 rhs += RHS_STEP_X; 7357 7358 a0 = VLOAD(M0)(0, lhs); 
7359 b0 = VLOAD(N0)(0, rhs); 7360 7361 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7362 7363 lhs += LHS_STEP_X; 7364 rhs += RHS_STEP_X; 7365 7366 a0 = VLOAD(M0)(0, lhs); 7367 b0 = VLOAD(N0)(0, rhs); 7368 7369 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7370 7371 lhs += LHS_STEP_X; 7372 rhs += RHS_STEP_X; 7373 7374 a0 = VLOAD(M0)(0, lhs); 7375 b0 = VLOAD(N0)(0, rhs); 7376 7377 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7378 7379 lhs += LHS_STEP_X; 7380 rhs += RHS_STEP_X; 7381 7382 a0 = VLOAD(M0)(0, lhs); 7383 b0 = VLOAD(N0)(0, rhs); 7384 7385 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7386 7387 lhs += LHS_STEP_X; 7388 rhs += RHS_STEP_X; 7389 7390 a0 = VLOAD(M0)(0, lhs); 7391 b0 = VLOAD(N0)(0, rhs); 7392 7393 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7394 7395 lhs += LHS_STEP_X; 7396 rhs += RHS_STEP_X; 7397 7398 a0 = VLOAD(M0)(0, lhs); 7399 b0 = VLOAD(N0)(0, rhs); 7400 7401 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7402 7403 lhs += LHS_STEP_X; 7404 rhs += RHS_STEP_X; 7405 7406 a0 = VLOAD(M0)(0, lhs); 7407 b0 = VLOAD(N0)(0, rhs); 7408 7409 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7410 7411 lhs += LHS_STEP_X; 7412 rhs += RHS_STEP_X; 7413#endif 7414 7415#ifndef LHS_INTERLEAVE 7416 lhs += (M0 * K0 * (V0 - 1)); 7417#endif 7418 7419#ifndef RHS_INTERLEAVE 7420 rhs += (N0 * K0 * (H0 - 1)); 7421#endif 7422 } 7423 7424 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); 7425 7426 REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); 7427 7428#if defined(REINTERPRET_OUTPUT_AS_3D) 7429 7430 7431 CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); 7432 7433 7434 dst_addr += z * dst_stride_z * DEPTH_GEMM3D; 7435 7436#else 7437 7438 7439 dst_addr += z * dst_stride_z; 7440 7441#endif 7442 7443 7444#if defined(ALPHA) 7445 SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); 7446#endif 7447 7448 7449#if defined(BETA) 7450#if defined(BROADCAST_BIAS) 
7451 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); 7452 7453 LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x); 7454 7455#ifndef UNIT_BETA 7456 SCALE_BLOCK(1, DATA_TYPE, bias, BETA); 7457#endif 7458 7459 7460#if defined(MIXED_PRECISION) 7461 CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7462 ADD_BLOCK_BROADCAST(M0, c, bias_hp0); 7463#else 7464 ADD_BLOCK_BROADCAST(M0, c, bias0); 7465#endif 7466 7467#else 7468 __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( 7469 2) * bias_stride_z; 7470 7471 LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7472 7473#ifndef UNIT_BETA 7474 SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); 7475#endif 7476 7477#if defined(MIXED_PRECISION) 7478 CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); 7479 ADD_BLOCK(M0, c, bias_hp); 7480#else 7481 ADD_BLOCK(M0, c, bias); 7482#endif 7483 7484#endif 7485#endif 7486 7487#if defined(ACTIVATION_TYPE) 7488#if defined(MIXED_PRECISION) 7489 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL); 7490#else 7491 ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL); 7492#endif 7493#endif 7494 7495 7496#if defined(MIXED_PRECISION) 7497 CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); 7498 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7499#else 7500 STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); 7501#endif 7502 7503#undef LHS_BLOCK_SIZE 7504#undef LHS_OFFSET_X 7505#undef LHS_STEP_X 7506#undef RHS_BLOCK_SIZE 7507#undef 
RHS_OFFSET_X 7508#undef RHS_STEP_X 7509} 7510#endif 7511 7512#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE) 7513 7514__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), 7515 __read_only image2d_t rhs_img, 7516#if defined(BETA) 7517 IMAGE_DECLARATION(bias), 7518#endif 7519 IMAGE_DECLARATION(dst), 7520 uint lhs_stride_z, 7521 uint rhs_stride_z, 7522#if defined(BETA) 7523 uint bias_stride_z, 7524#endif 7525 uint dst_stride_z 7526#if defined(REINTERPRET_OUTPUT_AS_3D) 7527 , 7528 uint dst_cross_plane_pad 7529#endif 7530 , 7531 const int M, 7532 const int N, 7533 const int K) 7534{ 7535 7536#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) 7537 7538 7539#define LHS_BLOCK_SIZE ((K0) * (M0)) 7540 7541#if defined(LHS_INTERLEAVE) 7542#define LHS_OFFSET_X (M0) 7543#define LHS_STEP_X ((M0) * (V0)) 7544#define LHS_STEP_LOOP (1) 7545#else 7546#define LHS_OFFSET_X (LHS_BLOCK_SIZE) 7547#define LHS_STEP_X (M0) 7548#define LHS_STEP_LOOP (V0) 7549#endif 7550 7551 7552#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) 7553 7554 7555#if defined(RHS_INTERLEAVE) 7556#define RHS_OFFSET_X (PIXEL_UNIT) 7557#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) 7558#else 7559#define RHS_OFFSET_X (RHS_BLOCK_SIZE) 7560#define RHS_STEP_X (PIXEL_UNIT) 7561#endif 7562 7563 const uint x = get_global_id(0); 7564 const uint y = get_global_id(1); 7565 const uint z = get_global_id(2); 7566 7567#if defined(DUMMY_WORK_ITEMS) 7568 if((x * N0 >= N) || (y * M0 >= M)) 7569 { 7570 return; 7571 } 7572#endif 7573 7574 7575 __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); 7576 7577#if defined(MATRIX_B_DEPTH) 7578 7579 const uint z_rhs = (z % MATRIX_B_DEPTH); 7580#else 7581 const uint z_rhs = z; 7582#endif 7583 7584 7585 uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; 7586 const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; 7587 7588 7589 
REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); 7590 7591 REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); 7592 7593 __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); 7594 7595 for(int i = 0; i < K; i += K0) 7596 { 7597 VEC_DATA_TYPE(DATA_TYPE, M0) 7598 a0; 7599 VEC_DATA_TYPE(DATA_TYPE, N0) 7600 b0; 7601 7602 a0 = VLOAD(M0)(0, lhs); 7603 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); 7604 7605 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7606 7607 lhs += LHS_STEP_X; 7608 7609#if K0 > 1 7610 a0 = VLOAD(M0)(0, lhs); 7611 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); 7612 7613 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7614 7615 lhs += LHS_STEP_X; 7616#endif 7617 7618#if K0 > 2 7619 a0 = VLOAD(M0)(0, lhs); 7620 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); 7621 7622 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7623 7624 lhs += LHS_STEP_X; 7625#endif 7626 7627#if K0 > 3 7628 a0 = VLOAD(M0)(0, lhs); 7629 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); 7630 7631 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7632 7633 lhs += LHS_STEP_X; 7634#endif 7635 7636#if K0 > 4 7637 a0 = VLOAD(M0)(0, lhs); 7638 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); 7639 7640 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7641 7642 lhs += LHS_STEP_X; 7643 7644 a0 = VLOAD(M0)(0, lhs); 7645 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); 7646 7647 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7648 7649 lhs += LHS_STEP_X; 7650 7651 a0 = VLOAD(M0)(0, lhs); 7652 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); 7653 7654 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7655 7656 lhs += LHS_STEP_X; 7657 7658 a0 = VLOAD(M0)(0, lhs); 7659 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * 
RHS_STEP_X), (y_rhs)); 7660 7661 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7662 7663 lhs += LHS_STEP_X; 7664#endif 7665 7666#if K0 > 8 7667 a0 = VLOAD(M0)(0, lhs); 7668 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); 7669 7670 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7671 7672 lhs += LHS_STEP_X; 7673 7674 a0 = VLOAD(M0)(0, lhs); 7675 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); 7676 7677 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7678 7679 lhs += LHS_STEP_X; 7680 7681 a0 = VLOAD(M0)(0, lhs); 7682 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); 7683 7684 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7685 7686 lhs += LHS_STEP_X; 7687 7688 a0 = VLOAD(M0)(0, lhs); 7689 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); 7690 7691 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7692 7693 lhs += LHS_STEP_X; 7694 7695 a0 = VLOAD(M0)(0, lhs); 7696 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); 7697 7698 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7699 7700 lhs += LHS_STEP_X; 7701 7702 a0 = VLOAD(M0)(0, lhs); 7703 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); 7704 7705 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7706 7707 lhs += LHS_STEP_X; 7708 7709 a0 = VLOAD(M0)(0, lhs); 7710 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); 7711 7712 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7713 7714 lhs += LHS_STEP_X; 7715 7716 a0 = VLOAD(M0)(0, lhs); 7717 b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); 7718 7719 ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); 7720 7721 lhs += LHS_STEP_X; 7722#endif 7723 7724#ifndef LHS_INTERLEAVE 7725 lhs += (M0 * K0 * (V0 - 1)); 7726#endif 7727 7728 x_rhs += K0 * RHS_STEP_X; 7729#ifndef RHS_INTERLEAVE 7730 x_rhs += (PIXEL_UNIT * K0 * (H0 - 1)); 
#endif
    }

    // Compute destination address for this work-item's M0 x N0 output block.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // Boundary conditions: true when this work-item covers the last (possibly partial) tile.
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The output is a 2D tensor reinterpreted as 3D: add per-row cross-plane offsets.
    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM: the batch dimension spans DEPTH_GEMM3D planes.
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM.
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply the accumulators by ALPHA.
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add BETA * bias.
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Bias is a single row broadcast over all M0 rows.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BIAS

#if defined(MIXED_PRECISION)
    // Promote the bias to the accumulator precision before adding it.
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    // Full M0 x N0 bias block, one row per output row (with batch offset).
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BIAS

#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Apply the fused activation, at accumulator precision when MIXED_PRECISION is set.
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store the output block, demoting to DATA_TYPE first when accumulating in higher precision.
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif

#endif

#endif

#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)

/** Fused multiply-accumulate: c = fma(a, b, c). */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** RHS_VFMA_M0xN0(i, a, b, c): for each of the M0 accumulator rows c##m, perform
 *  c##m = fma(broadcast((a##m).s##i), b, c##m), i.e. multiply column i of the LHS
 *  block by the RHS row vector b and accumulate. One variant per supported M0.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

#if defined(GEMM_MM_NATIVE)

/** GEMM kernel on non-reshaped matrices: dst = alpha * lhs * rhs (+ beta * bias),
 *  with optional fused activation. Each work-item computes an M0 x N0 block of dst,
 *  iterating over K in steps of K0 plus a scalar leftover loop.
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif // defined(BETA)
                             uint dst_stride_z,
                             const int M,
                             const int N,
                             const int K
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                             )
{
    // Block size and offset along the X (N) dimension of the RHS matrix.
#define RHS_BLOCK_SIZE ((K0) * (N0))
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items spawned only for dispatch-size rounding do no work.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address; the start row is clamped for the last partial tile.
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS matrix address.
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Broadcast RHS across batches when its depth is smaller than the LHS batch count.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The input is a 2D tensor reinterpreted as 3D: add per-row cross-plane offsets.
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM: the batch dimension spans DEPTH_GEMM3D planes.
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM.
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the M0 accumulator rows (each an N0-wide vector) to zero.
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    int i = 0;
#if K0 > 1
    // Main loop: process K0 columns of LHS / K0 rows of RHS per iteration.
    for(; i <= (K - K0); i += K0)
    {
        // Load an M0 x K0 block from the LHS matrix.
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load a K0 x N0 block from the RHS matrix.
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        // Accumulate: one RHS row per LHS column index.
        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif // K0 > 2
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif // K0 > 3
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif // K0 > 4
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }
#endif // K0 > 1

    // Leftover loop: one column of LHS / one row of RHS at a time, for K not a multiple of K0.
    for(; i < K; ++i)
    {
        // Load one scalar per LHS row; the scalar is broadcast into a 2-wide vector,
        // of which only lane .s0 is consumed by RHS_VFMA_M0xN0(0, ...).
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif // M0 > 7

        // Load one N0-wide row from the RHS matrix and accumulate.
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    // Compute destination address for this work-item's M0 x N0 output block.
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The output is a 2D tensor reinterpreted as 3D: add per-row cross-plane offsets.
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM: the batch dimension spans DEPTH_GEMM3D planes.
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM.
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply the accumulators by ALPHA.
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add BETA * bias.
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Bias is a single row broadcast over all M0 rows.
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BIAS

    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    // Full M0 x N0 bias block, one row per output row (with batch offset).
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BIAS

    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Apply the fused activation.
#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Boundary conditions for the store: only the first row tile (y == 0) can be
    // partial in M here, because COMPUTE_M0_START_ROW shifts later tiles back.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store the M0 x N0 output block.
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif // defined(GEMM_MM_NATIVE)
#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)

#if defined(BETA)

/** Computes dst = dst + BETA * src elementwise on FP32 data, 4 elements per work-item.
 *  dst already holds alpha * A * B; src holds the matrix C to be accumulated.
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses for this work-item.
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Load 4 elements of alpha * A * B from the destination.
    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

    // Load 4 elements of matrix C.
    float4 c = vload4(0, (__global float *)src.ptr);

    // Compute out = alpha * A * B + beta * C.
    float4 out = alpha_ab + (float4)BETA * c;

    // Store the result in place.
    vstore4(out, 0, (__global float *)dst.ptr);
}

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
/** Computes dst = dst + BETA * src elementwise on FP16 data, 8 elements per work-item.
 *  dst already holds alpha * A * B; src holds the matrix C to be accumulated.
 */
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses for this work-item.
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Load 8 elements of alpha * A * B from the destination.
    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

    // Load 8 elements of matrix C.
    half8 c = vload8(0, (__global half *)src.ptr);

    // Compute out = alpha * A * B + beta * C.
    half8 out = alpha_ab + (half8)BETA * c;

    // Store the result in place.
    vstore8(out, 0, (__global half *)dst.ptr);
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
#endif // defined(BETA)
)"