1R"( 2#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X 3#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y 4 5 6#ifndef ARM_COMPUTE_HELPER_H 7#define ARM_COMPUTE_HELPER_H 8 9 10 11 12#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13 VSTORE(N0) \ 14 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15 16#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18 VSTORE(N0) \ 19 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20 21#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23 VSTORE(N0) \ 24 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25 26#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28 VSTORE(N0) \ 29 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30 31#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33 VSTORE(N0) \ 34 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35 36#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38 VSTORE(N0) \ 39 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40 41#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43 VSTORE(N0) \ 44 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45 46#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48 VSTORE(N0) \ 49 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50 51#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53 VSTORE(N0) \ 54 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55 56#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58 VSTORE(N0) \ 59 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60 61#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63 VSTORE(N0) \ 64 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65 66#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68 VSTORE(N0) \ 69 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 70 71#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73 VSTORE(N0) \ 74 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75 76#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78 VSTORE(N0) \ 79 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80 81#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83 VSTORE(N0) \ 84 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85 86#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88 VSTORE(N0) \ 89 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90 91 92 93#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94 VSTORE(N0) \ 95 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96 97#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99 VSTORE(N0) \ 100 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 101 102#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104 VSTORE(N0) \ 105 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106 107#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109 VSTORE(N0) \ 110 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111 112#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114 VSTORE(N0) \ 115 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116 117#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119 VSTORE(N0) \ 120 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121 122#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124 VSTORE(N0) \ 125 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126 127#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 128 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 129 VSTORE(N0) \ 130 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 131 132#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 133 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 134 VSTORE(N0) \ 135 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 136 137#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 138 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 139 VSTORE(N0) \ 140 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 141 142#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 143 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 144 VSTORE(N0) \ 145 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 146 147#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 148 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 149 VSTORE(N0) \ 150 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 151 152#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 153 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 154 VSTORE(N0) \ 155 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 156 157#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 158 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 159 VSTORE(N0) \ 160 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 161 162#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 163 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 164 VSTORE(N0) \ 165 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 166 167#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 168 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 169 VSTORE(N0) \ 170 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 171 172 173 174 175#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 176#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 177 178 179 180#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 181#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 182 183 184 185#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 186 VSTORE_PARTIAL(N0, STORE_N0) \ 187 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 188 189#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 190 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 191 VSTORE_PARTIAL(N0, STORE_N0) \ 192 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 193 194#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 195 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 196 VSTORE_PARTIAL(N0, STORE_N0) \ 197 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 198 199#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 200 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 201 VSTORE_PARTIAL(N0, STORE_N0) \ 202 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 203 204#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 205 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 206 VSTORE_PARTIAL(N0, STORE_N0) \ 207 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 208 209#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 210 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 211 VSTORE_PARTIAL(N0, STORE_N0) \ 212 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 213 214#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 215 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 216 VSTORE_PARTIAL(N0, STORE_N0) \ 217 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 218 219#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 220 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 221 VSTORE_PARTIAL(N0, STORE_N0) \ 222 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 223 224#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 225 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 226 VSTORE_PARTIAL(N0, STORE_N0) \ 227 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 228 229#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 230 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 231 VSTORE_PARTIAL(N0, STORE_N0) \ 232 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 233 234#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 235 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 236 VSTORE_PARTIAL(N0, STORE_N0) \ 237 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 238 239#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 240 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 241 VSTORE_PARTIAL(N0, STORE_N0) \ 242 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 243 244#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 245 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 246 VSTORE_PARTIAL(N0, STORE_N0) \ 247 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 248 249#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 250 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 251 VSTORE_PARTIAL(N0, STORE_N0) \ 252 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 253 254#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 255 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 256 VSTORE_PARTIAL(N0, STORE_N0) \ 257 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 258 259#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 260 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 261 VSTORE_PARTIAL(N0, STORE_N0) \ 262 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 263 264 265 266#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 267#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 268 269#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 270 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 271 { \ 272 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 273 } \ 274 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 275 { \ 276 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 277 } \ 278 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 279 { \ 280 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 281 } \ 282 else \ 283 { \ 284 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 285 } 286 287#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 288 if(!(PARTIAL_COND_X)) \ 289 { \ 290 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 291 } \ 292 else \ 293 { \ 294 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 295 } 296 297#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 298 if(!(PARTIAL_COND_Y)) \ 299 { \ 300 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 301 } \ 302 else \ 303 { \ 304 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 305 } 306 307 308#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 309 310 311#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 312 313#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 314 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 315 316#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 317 318#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 319 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 320 321#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 322 323#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 324 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 325 326#else 327 328#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 329 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 330 331#endif 332 333#endif 334 335 336#if defined(PARTIAL_STORE_M0) 337 338#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 339 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 340#else 341#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 342 ((uint)(y * M0)) 343#endif 344 345 346 347#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 348 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 349 350 351#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 352#pragma OPENCL EXTENSION cl_khr_fp16 : enable 353#endif 354 355#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 356#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 357#endif 358 359#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 360#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 361#endif 362 363#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 364#pragma OPENCL EXTENSION cl_arm_printf : enable 365#endif 366 367#define GPU_ARCH_MIDGARD 0x100 368#define GPU_ARCH_BIFROST 0x200 369#define GPU_ARCH_VALHALL 0x300 370 371 372#define CONCAT(a, b) a##b 373 374 375#define EXPAND(x) x 376 377 378#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 379 380 381#define REV1(x) ((x)) 382#define REV2(x) ((x).s10) 383#define REV3(x) ((x).s210) 384#define REV4(x) ((x).s3210) 385#define REV8(x) ((x).s76543210) 386#define REV16(x) ((x).sFEDCBA9876543210) 387 388 389 390#define REVERSE_STR(x, s) REV##s((x)) 391#define REVERSE(x, s) REVERSE_STR(x, s) 392 393 394 395#define ROT1_0(x) ((x)) 396#define ROT1_1(x) ((x)) 397 398#define ROT2_0(x) ((x)) 399#define ROT2_1(x) ((x).s10) 400#define ROT2_2(x) ((x)) 401 402#define ROT3_0(x) ((x)) 403#define ROT3_1(x) ((x).s201) 404#define ROT3_2(x) ((x).s120) 405#define ROT3_3(x) ((x)) 406 407#define ROT4_0(x) ((x)) 408#define ROT4_1(x) ((x).s3012) 409#define ROT4_2(x) ((x).s2301) 410#define ROT4_3(x) ((x).s1230) 411#define ROT4_4(x) ((x)) 412 413#define ROT8_0(x) ((x)) 414#define ROT8_1(x) ((x).s70123456) 415#define ROT8_2(x) ((x).s67012345) 416#define ROT8_3(x) ((x).s56701234) 417#define ROT8_4(x) ((x).s45670123) 418#define ROT8_5(x) ((x).s34567012) 419#define ROT8_6(x) ((x).s23456701) 420#define ROT8_7(x) ((x).s12345670) 421#define ROT8_8(x) ((x)) 422 423#define ROT16_0(x) ((x)) 424#define ROT16_1(x) ((x).sF0123456789ABCDE) 425#define ROT16_2(x) ((x).sEF0123456789ABCD) 426#define ROT16_3(x) ((x).sDEF0123456789ABC) 427#define ROT16_4(x) ((x).sCDEF0123456789AB) 428#define ROT16_5(x) ((x).sBCDEF0123456789A) 429#define ROT16_6(x) ((x).sABCDEF0123456789) 430#define ROT16_7(x) ((x).s9ABCDEF012345678) 431#define ROT16_8(x) ((x).s89ABCDEF01234567) 432#define ROT16_9(x) ((x).s789ABCDEF0123456) 433#define ROT16_10(x) ((x).s6789ABCDEF012345) 434#define ROT16_11(x) ((x).s56789ABCDEF01234) 435#define ROT16_12(x) ((x).s456789ABCDEF0123) 436#define ROT16_13(x) ((x).s3456789ABCDEF012) 437#define ROT16_14(x) ((x).s23456789ABCDEF01) 438#define ROT16_15(x) ((x).s123456789ABCDEF0) 439#define ROT16_16(x) ((x)) 440 441 442 443#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 444#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 445 446 447 448#define V_OFFS1(dt) (dt##1)(0) 449#define V_OFFS2(dt) (dt##2)(0, 1) 450#define V_OFFS3(dt) (dt##3)(0, 1, 2) 451#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 452#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 453#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 454 455 456 457#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 458#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 459 460 461#define VLOAD_STR(size) vload##size 462#define VLOAD(size) VLOAD_STR(size) 463 464 465#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 466#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 467 468#define NO_LOAD(data, offs, ptr) \ 469 { \ 470 } 471 472 473#define vload_partial_1_0 NO_LOAD 474#define vload_partial_1_1 vload1 475#define vload_partial_1_2 NO_LOAD 476#define vload_partial_1_3 NO_LOAD 477#define vload_partial_1_4 NO_LOAD 478#define vload_partial_1_5 NO_LOAD 479#define vload_partial_1_6 NO_LOAD 480#define vload_partial_1_7 NO_LOAD 481#define vload_partial_1_8 NO_LOAD 482#define vload_partial_1_9 NO_LOAD 483#define vload_partial_1_10 NO_LOAD 484#define vload_partial_1_11 NO_LOAD 485#define vload_partial_1_12 NO_LOAD 486#define vload_partial_1_13 NO_LOAD 487#define vload_partial_1_14 NO_LOAD 488#define vload_partial_1_15 NO_LOAD 489#define vload_partial_1_16 NO_LOAD 490 491#define vload_partial_2_0 NO_LOAD 492#define vload_partial_2_1 vload_partial_1 493#define vload_partial_2_2 vload_partial_2 494#define vload_partial_2_3 NO_LOAD 495#define vload_partial_2_4 NO_LOAD 496#define vload_partial_2_5 NO_LOAD 497#define vload_partial_2_6 NO_LOAD 498#define vload_partial_2_7 NO_LOAD 499#define vload_partial_2_8 NO_LOAD 500#define vload_partial_2_9 NO_LOAD 501#define vload_partial_2_10 NO_LOAD 502#define vload_partial_2_11 NO_LOAD 503#define vload_partial_2_12 NO_LOAD 504#define vload_partial_2_13 NO_LOAD 505#define vload_partial_2_14 NO_LOAD 506#define vload_partial_2_15 NO_LOAD 507#define vload_partial_2_16 NO_LOAD 508 509#define vload_partial_3_0 NO_LOAD 510#define vload_partial_3_1 vload_partial_1 511#define vload_partial_3_2 vload_partial_2 512#define vload_partial_3_3 vload_partial_3 513#define vload_partial_3_4 NO_LOAD 514#define vload_partial_3_5 NO_LOAD 515#define vload_partial_3_6 NO_LOAD 516#define vload_partial_3_7 NO_LOAD 517#define vload_partial_3_8 NO_LOAD 518#define vload_partial_3_9 NO_LOAD 519#define vload_partial_3_10 NO_LOAD 520#define vload_partial_3_11 NO_LOAD 521#define vload_partial_3_12 NO_LOAD 522#define vload_partial_3_13 NO_LOAD 523#define vload_partial_3_14 NO_LOAD 524#define vload_partial_3_15 NO_LOAD 525#define vload_partial_3_16 NO_LOAD 526 527#define vload_partial_4_0 NO_LOAD 528#define vload_partial_4_1 vload_partial_1 529#define vload_partial_4_2 vload_partial_2 530#define vload_partial_4_3 vload_partial_3 531#define vload_partial_4_4 vload_partial_4 532#define vload_partial_4_5 NO_LOAD 533#define vload_partial_4_6 NO_LOAD 534#define vload_partial_4_7 NO_LOAD 535#define vload_partial_4_8 NO_LOAD 536#define vload_partial_4_9 NO_LOAD 537#define vload_partial_4_10 NO_LOAD 538#define vload_partial_4_11 NO_LOAD 539#define vload_partial_4_12 NO_LOAD 540#define vload_partial_4_13 NO_LOAD 541#define vload_partial_4_14 NO_LOAD 542#define vload_partial_4_15 NO_LOAD 543#define vload_partial_4_16 NO_LOAD 544 545#define vload_partial_8_0 NO_LOAD 546#define vload_partial_8_1 vload_partial_1 547#define vload_partial_8_2 vload_partial_2 548#define vload_partial_8_3 vload_partial_3 549#define vload_partial_8_4 vload_partial_4 550#define vload_partial_8_5 vload_partial_5 551#define vload_partial_8_6 vload_partial_6 552#define vload_partial_8_7 vload_partial_7 553#define vload_partial_8_8 vload_partial_8 554#define vload_partial_8_9 NO_LOAD 555#define vload_partial_8_10 NO_LOAD 556#define vload_partial_8_11 NO_LOAD 557#define vload_partial_8_12 NO_LOAD 558#define vload_partial_8_13 NO_LOAD 559#define vload_partial_8_14 NO_LOAD 560#define vload_partial_8_15 NO_LOAD 561#define vload_partial_8_16 NO_LOAD 562 563#define vload_partial_16_0 NO_LOAD 564#define vload_partial_16_1 vload_partial_1 565#define vload_partial_16_2 vload_partial_2 566#define vload_partial_16_3 vload_partial_3 567#define vload_partial_16_4 vload_partial_4 568#define vload_partial_16_5 vload_partial_5 569#define vload_partial_16_6 vload_partial_6 570#define vload_partial_16_7 vload_partial_7 571#define vload_partial_16_8 vload_partial_8 572#define vload_partial_16_9 vload_partial_9 573#define vload_partial_16_10 vload_partial_10 574#define vload_partial_16_11 vload_partial_11 575#define vload_partial_16_12 vload_partial_12 576#define vload_partial_16_13 vload_partial_13 577#define vload_partial_16_14 vload_partial_14 578#define vload_partial_16_15 vload_partial_15 579#define vload_partial_16_16 vload_partial_16 580 581 582#define vload_partial_1(DATA, OFFSET, PTR) \ 583 DATA.s0 = vload1(OFFSET, PTR); 584 585#define vload_partial_2(DATA, OFFSET, PTR) \ 586 DATA.s01 = vload2(OFFSET, PTR); 587 588#define vload_partial_3(DATA, OFFSET, PTR) \ 589 DATA.s012 = vload3(OFFSET, PTR); 590 591#define vload_partial_4(DATA, OFFSET, PTR) \ 592 DATA.s0123 = vload4(OFFSET, PTR); 593 594#define vload_partial_5(DATA, OFFSET, PTR) \ 595 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 596 DATA.s4 = vload1(OFFSET, PTR + 4); 597 598#define vload_partial_6(DATA, OFFSET, PTR) \ 599 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 600 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 601 602#define vload_partial_7(DATA, OFFSET, PTR) \ 603 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 604 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 605 606#define vload_partial_8(DATA, OFFSET, PTR) \ 607 DATA.s01234567 = vload8(OFFSET, PTR); 608 609#define vload_partial_9(DATA, OFFSET, PTR) \ 610 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 611 DATA.s8 = vload1(OFFSET, PTR + 8); 612 613#define vload_partial_10(DATA, OFFSET, PTR) \ 614 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 615 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 616 617#define vload_partial_11(DATA, OFFSET, PTR) \ 618 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 619 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 620 621#define vload_partial_12(DATA, OFFSET, PTR) \ 622 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 623 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 624 625#define vload_partial_13(DATA, OFFSET, PTR) \ 626 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 627 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 628 629#define vload_partial_14(DATA, OFFSET, PTR) \ 630 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 631 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 632 633#define vload_partial_15(DATA, OFFSET, PTR) \ 634 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 635 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 636 637#define vload_partial_16(DATA, OFFSET, PTR) \ 638 DATA = vload16(OFFSET, PTR); 639 640 641 642#define PIXEL_UNIT4 1 643#define PIXEL_UNIT8 2 644#define PIXEL_UNIT16 4 645 646 647#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 648#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 649 650 651#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 652#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 653#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 654 655#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 656#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 657#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 658#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 659#endif 660 661#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 662#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 663#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 664 665#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 666#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 667#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 668#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 669#endif 670 671 672#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 673#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 674 675 676#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 677#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 678 679#define VSTORE_STR(size) vstore##size 680#define VSTORE(size) VSTORE_STR(size) 681 682#define float1 float 683#define half1 half 684#define char1 char 685#define uchar1 uchar 686#define short1 short 687#define ushort1 ushort 688#define int1 int 689#define uint1 uint 690#define long1 long 691#define ulong1 ulong 692#define double1 double 693 694#define vload1(OFFSET, PTR) *(OFFSET + PTR) 695#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 696 697 698#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 699#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 700 701#define NO_STORE(data, offs, ptr) \ 702 { \ 703 } 704 705 706#define vstore_partial_1_0 NO_STORE 707#define vstore_partial_1_1 vstore1 708#define vstore_partial_1_2 NO_STORE 709#define vstore_partial_1_3 NO_STORE 710#define vstore_partial_1_4 NO_STORE 711#define vstore_partial_1_5 NO_STORE 712#define vstore_partial_1_6 NO_STORE 713#define vstore_partial_1_7 NO_STORE 714#define vstore_partial_1_8 NO_STORE 715#define vstore_partial_1_9 NO_STORE 716#define vstore_partial_1_10 NO_STORE 717#define vstore_partial_1_11 NO_STORE 718#define vstore_partial_1_12 NO_STORE 719#define vstore_partial_1_13 NO_STORE 720#define vstore_partial_1_14 NO_STORE 721#define vstore_partial_1_15 NO_STORE 722#define vstore_partial_1_16 NO_STORE 723 724#define vstore_partial_2_0 NO_STORE 725#define vstore_partial_2_1 vstore_partial_1 726#define vstore_partial_2_2 vstore_partial_2 727#define vstore_partial_2_3 NO_STORE 728#define vstore_partial_2_4 NO_STORE 729#define vstore_partial_2_5 NO_STORE 730#define vstore_partial_2_6 NO_STORE 731#define vstore_partial_2_7 NO_STORE 732#define vstore_partial_2_8 NO_STORE 733#define vstore_partial_2_9 NO_STORE 734#define vstore_partial_2_10 NO_STORE 735#define vstore_partial_2_11 NO_STORE 736#define vstore_partial_2_12 NO_STORE 737#define vstore_partial_2_13 NO_STORE 738#define vstore_partial_2_14 NO_STORE 739#define vstore_partial_2_15 NO_STORE 740#define vstore_partial_2_16 NO_STORE 741 742#define vstore_partial_3_0 NO_STORE 743#define vstore_partial_3_1 vstore_partial_1 744#define vstore_partial_3_2 vstore_partial_2 745#define vstore_partial_3_3 vstore_partial_3 746#define vstore_partial_3_4 NO_STORE 747#define vstore_partial_3_5 NO_STORE 748#define vstore_partial_3_6 NO_STORE 749#define vstore_partial_3_7 NO_STORE 750#define vstore_partial_3_8 NO_STORE 751#define vstore_partial_3_9 NO_STORE 752#define vstore_partial_3_10 NO_STORE 753#define vstore_partial_3_11 NO_STORE 754#define vstore_partial_3_12 NO_STORE 755#define vstore_partial_3_13 NO_STORE 756#define vstore_partial_3_14 NO_STORE 757#define vstore_partial_3_15 NO_STORE 758#define vstore_partial_3_16 NO_STORE 759 760#define vstore_partial_4_0 NO_STORE 761#define vstore_partial_4_1 vstore_partial_1 762#define vstore_partial_4_2 vstore_partial_2 763#define vstore_partial_4_3 vstore_partial_3 764#define vstore_partial_4_4 vstore_partial_4 765#define vstore_partial_4_5 NO_STORE 766#define vstore_partial_4_6 NO_STORE 767#define vstore_partial_4_7 NO_STORE 768#define vstore_partial_4_8 NO_STORE 769#define vstore_partial_4_9 NO_STORE 770#define vstore_partial_4_10 NO_STORE 771#define vstore_partial_4_11 NO_STORE 772#define vstore_partial_4_12 NO_STORE 773#define vstore_partial_4_13 NO_STORE 774#define vstore_partial_4_14 NO_STORE 775#define vstore_partial_4_15 NO_STORE 776#define vstore_partial_4_16 NO_STORE 777 778#define vstore_partial_8_0 NO_STORE 779#define vstore_partial_8_1 vstore_partial_1 780#define vstore_partial_8_2 vstore_partial_2 781#define vstore_partial_8_3 vstore_partial_3 782#define vstore_partial_8_4 vstore_partial_4 783#define vstore_partial_8_5 vstore_partial_5 784#define vstore_partial_8_6 vstore_partial_6 785#define vstore_partial_8_7 vstore_partial_7 786#define vstore_partial_8_8 vstore_partial_8 787#define vstore_partial_8_9 NO_STORE 788#define vstore_partial_8_10 NO_STORE 789#define vstore_partial_8_11 NO_STORE 790#define vstore_partial_8_12 NO_STORE 791#define vstore_partial_8_13 NO_STORE 792#define vstore_partial_8_14 NO_STORE 793#define vstore_partial_8_15 NO_STORE 794#define vstore_partial_8_16 NO_STORE 795 796#define vstore_partial_16_0 NO_STORE 797#define vstore_partial_16_1 vstore_partial_1 798#define vstore_partial_16_2 vstore_partial_2 799#define vstore_partial_16_3 vstore_partial_3 800#define vstore_partial_16_4 vstore_partial_4 801#define vstore_partial_16_5 vstore_partial_5 802#define vstore_partial_16_6 vstore_partial_6 803#define vstore_partial_16_7 vstore_partial_7 804#define vstore_partial_16_8 vstore_partial_8 805#define vstore_partial_16_9 vstore_partial_9 806#define vstore_partial_16_10 vstore_partial_10 807#define vstore_partial_16_11 vstore_partial_11 808#define vstore_partial_16_12 vstore_partial_12 809#define vstore_partial_16_13 vstore_partial_13 810#define vstore_partial_16_14 vstore_partial_14 811#define vstore_partial_16_15 vstore_partial_15 812#define vstore_partial_16_16 vstore_partial_16 813 814 815#define vstore_partial_1(DATA, OFFSET, PTR) \ 816 vstore1(DATA.s0, OFFSET, PTR); 817 818#define vstore_partial_2(DATA, OFFSET, PTR) \ 819 vstore2(DATA.s01, OFFSET, PTR); 820 821#define vstore_partial_3(DATA, OFFSET, PTR) \ 822 vstore3(DATA.s012, OFFSET, PTR); 823 824#define vstore_partial_4(DATA, OFFSET, PTR) \ 825 vstore4(DATA.s0123, OFFSET, PTR); 826 827#define vstore_partial_5(DATA, OFFSET, PTR) \ 828 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 829 vstore1(DATA.s4, OFFSET, PTR + 4); 830 831#define vstore_partial_6(DATA, OFFSET, PTR) \ 832 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 833 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 834 835#define vstore_partial_7(DATA, OFFSET, PTR) \ 836 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 837 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 838 839#define vstore_partial_8(DATA, OFFSET, PTR) \ 840 vstore8(DATA.s01234567, OFFSET, PTR); 841 842#define vstore_partial_9(DATA, OFFSET, PTR) \ 843 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 844 vstore1(DATA.s8, OFFSET, PTR + 8); 845 846#define vstore_partial_10(DATA, OFFSET, PTR) \ 847 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 848 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 849 850#define vstore_partial_11(DATA, OFFSET, PTR) \ 851 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 852 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 853 854#define vstore_partial_12(DATA, OFFSET, PTR) \ 855 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 856 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 857 858#define vstore_partial_13(DATA, OFFSET, PTR) \ 859 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 860 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 861 862#define vstore_partial_14(DATA, OFFSET, PTR) \ 863 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 864 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 865 866#define vstore_partial_15(DATA, OFFSET, PTR) \ 867 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 868 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 869 870#define vstore_partial_16(DATA, OFFSET, PTR) \ 871 vstore16(DATA, OFFSET, PTR); 872 873 874 875 876 877#define convert_float_sat convert_float 878#define convert_float1_sat convert_float 879#define convert_float2_sat convert_float2 880#define convert_float3_sat convert_float3 881#define convert_float4_sat convert_float4 882#define convert_float8_sat convert_float8 883#define convert_float16_sat convert_float16 884#define convert_half_sat convert_float 885#define convert_half1_sat convert_half 886#define convert_half2_sat convert_half2 887#define convert_half3_sat convert_half3 888#define convert_half4_sat convert_half4 889#define convert_half8_sat convert_half8 890#define convert_half16_sat convert_half16 891 892#define convert_float1 convert_float 893#define convert_half1 convert_half 894#define convert_char1 convert_char 895#define convert_uchar1 convert_uchar 896#define convert_short1 convert_short 897#define convert_ushort1 convert_ushort 898#define convert_int1 convert_int 899#define convert_uint1 convert_uint 900#define convert_long1 convert_long 901#define convert_ulong1 convert_ulong 902#define convert_double1 convert_double 903 904#define convert_char1_sat convert_char_sat 905#define convert_uchar1_sat convert_uchar_sat 906#define convert_uchar2_sat convert_uchar2_sat 907#define convert_uchar3_sat convert_uchar3_sat 908#define convert_uchar4_sat convert_uchar4_sat 909#define convert_uchar8_sat convert_uchar8_sat 910#define convert_uchar16_sat convert_uchar16_sat 911#define convert_short1_sat convert_short_sat 912#define convert_ushort1_sat convert_ushort_sat 913#define convert_int1_sat convert_int_sat 914#define convert_uint1_sat convert_uint_sat 915#define convert_long1_sat convert_long_sat 916#define convert_ulong1_sat convert_ulong_sat 917#define convert_double1_sat convert_double_sat 918 919#define VEC_DATA_TYPE_STR(type, size) type##size 920#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 921 922#define CONVERT_STR(x, type) (convert_##type((x))) 923#define CONVERT(x, type) CONVERT_STR(x, type) 924 925#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 926#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 927 928#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 929#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 930 931#define select_vec_dt_uchar(size) uchar##size 932#define select_vec_dt_char(size) char##size 933#define select_vec_dt_ushort(size) ushort##size 934#define select_vec_dt_short(size) short##size 935#define select_vec_dt_half(size) short##size 936#define select_vec_dt_uint(size) uint##size 937#define select_vec_dt_int(size) int##size 938#define select_vec_dt_float(size) int##size 939#define select_vec_dt_ulong(size) ulong##size 940#define select_vec_dt_long(size) long##size 941 942#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 943#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 944#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 945 946#define signed_int_vec_dt_uchar(size) char##size 947#define signed_int_vec_dt_char(size) char##size 948#define signed_int_vec_dt_ushort(size) short##size 949#define signed_int_vec_dt_short(size) short##size 950#define signed_int_vec_dt_half(size) short##size 951#define signed_int_vec_dt_uint(size) int##size 952#define signed_int_vec_dt_int(size) int##size 953#define signed_int_vec_dt_float(size) int##size 954#define signed_int_vec_dt_ulong(size) long##size 955#define signed_int_vec_dt_long(size) long##size 956 957#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 958#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 959#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 960 961#define sum_reduce_1(x) (x) 962#define sum_reduce_2(x) ((x).s0) + ((x).s1) 963#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 964#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 965#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 966#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 967 968#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 969#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 970 971#define prod_reduce_1(x) (x) 972#define prod_reduce_2(x) ((x).s0) * ((x).s1) 973#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 974#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 975#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 976#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 977 978#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 979#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 980 981#define max_reduce_1(x) (x) 982#define max_reduce_2(x) max(((x).s0), ((x).s1)) 983#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 984#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 985#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 986#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 987 988#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 989#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990 991#define VECTOR_DECLARATION(name) \ 992 __global uchar *name##_ptr, \ 993 uint name##_stride_x, \ 994 uint name##_step_x, \ 995 uint name##_offset_first_element_in_bytes 996 997#define IMAGE_DECLARATION(name) \ 998 __global uchar *name##_ptr, \ 999 uint name##_stride_x, \ 1000 uint name##_step_x, \ 1001 uint name##_stride_y, \ 1002 uint name##_step_y, \ 1003 uint name##_offset_first_element_in_bytes 1004 1005#define TENSOR3D_DECLARATION(name) \ 1006 __global uchar *name##_ptr, \ 1007 uint name##_stride_x, \ 1008 uint name##_step_x, \ 1009 uint name##_stride_y, \ 1010 uint name##_step_y, \ 1011 uint name##_stride_z, \ 1012 uint name##_step_z, \ 1013 uint name##_offset_first_element_in_bytes 1014 1015#define TENSOR4D_DECLARATION(name) \ 1016 __global uchar *name##_ptr, \ 1017 uint name##_stride_x, \ 1018 uint name##_step_x, \ 1019 uint name##_stride_y, \ 1020 uint name##_step_y, \ 1021 uint name##_stride_z, \ 1022 uint name##_step_z, \ 1023 uint name##_stride_w, \ 1024 uint name##_step_w, \ 1025 uint name##_offset_first_element_in_bytes 1026 1027#define TENSOR5D_DECLARATION(name) \ 1028 __global uchar *name##_ptr, \ 1029 uint name##_stride_x, \ 1030 uint name##_step_x, \ 1031 uint name##_stride_y, \ 1032 uint name##_step_y, \ 1033 uint name##_stride_z, \ 1034 uint name##_step_z, \ 1035 uint name##_stride_w, \ 1036 uint name##_step_w, \ 1037 uint name##_stride_v, \ 1038 uint name##_step_v, \ 1039 uint name##_offset_first_element_in_bytes 1040 1041#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043 1044#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046 1047#define CONVERT_TO_IMAGE_STRUCT(name) \ 1048 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049 1050#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052 1053#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055 1056#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058 1059#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061 1062#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064 name##_stride_z, name##_step_z) 1065 1066#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068 1069#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072 1073#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075 1076#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078 name##_stride_z, name##_step_z) 1079 1080 1081typedef struct Vector 1082{ 1083 __global uchar *ptr; 1084 int offset_first_element_in_bytes; 1085 int stride_x; 1086} Vector; 1087 1088 1089typedef struct Image 1090{ 1091 __global uchar *ptr; 1092 int offset_first_element_in_bytes; 1093 int stride_x; 1094 int stride_y; 1095} Image; 1096 1097 1098typedef struct Tensor3D 1099{ 1100 __global uchar *ptr; 1101 int offset_first_element_in_bytes; 1102 int stride_x; 1103 int stride_y; 1104 int stride_z; 1105} Tensor3D; 1106 1107 1108typedef struct Tensor4D 1109{ 1110 __global uchar *ptr; 1111 int offset_first_element_in_bytes; 1112 int stride_x; 1113 int stride_y; 1114 int stride_z; 1115 int stride_w; 1116} Tensor4D; 1117 1118 1119inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120{ 1121 Vector vector = 1122 { 1123 .ptr = ptr, 1124 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125 .stride_x = stride_x, 1126 }; 1127 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128 return vector; 1129} 1130 1131 1132inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133{ 1134 Image img = 1135 { 1136 .ptr = ptr, 1137 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138 .stride_x = stride_x, 1139 .stride_y = stride_y 1140 }; 1141 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142 return img; 1143} 1144 1145 1146inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147{ 1148 Image img = 1149 { 1150 .ptr = ptr, 1151 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152 .stride_x = stride_x, 1153 .stride_y = stride_y 1154 }; 1155 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156 return img; 1157} 1158 1159 1160inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161{ 1162 Tensor3D tensor = 1163 { 1164 .ptr = ptr, 1165 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166 .stride_x = stride_x, 1167 .stride_y = stride_y, 1168 .stride_z = stride_z 1169 }; 1170 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171 return tensor; 1172} 1173 1174 1175inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176{ 1177 Tensor3D tensor = 1178 { 1179 .ptr = ptr, 1180 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181 .stride_x = stride_x, 1182 .stride_y = stride_y, 1183 .stride_z = stride_z 1184 }; 1185 return tensor; 1186} 1187 1188inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189 uint step_w, 1190 uint mod_size) 1191{ 1192 Tensor4D tensor = 1193 { 1194 .ptr = ptr, 1195 .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196 .stride_x = stride_x, 1197 .stride_y = stride_y, 1198 .stride_z = stride_z, 1199 .stride_w = stride_w 1200 }; 1201 1202 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203 return tensor; 1204} 1205 1206 1207inline __global const uchar *vector_offset(const Vector *vec, int x) 1208{ 1209 return vec->ptr + x * vec->stride_x; 1210} 1211 1212 1213inline __global uchar *offset(const Image *img, int x, int y) 1214{ 1215 return img->ptr + x * img->stride_x + y * img->stride_y; 1216} 1217 1218 1219inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220{ 1221 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222} 1223 1224 1225inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226{ 1227 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228} 1229 1230 1231inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232{ 1233 uint num_elements = width * height; 1234 1235 const uint z = index / num_elements; 1236 1237 index %= num_elements; 1238 1239 const uint y = index / width; 1240 1241 index %= width; 1242 1243 const uint x = index; 1244 1245 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246} 1247 1248#endif 1249 1250#ifndef ARM_COMPUTE_REPEAT_H 1251#define ARM_COMPUTE_REPEAT_H 1252 1253 1254#ifndef ARM_COMPUTE_HELPER_H 1255#define ARM_COMPUTE_HELPER_H 1256 1257 1258 1259 1260#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1261 VSTORE(N0) \ 1262 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1263 1264#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1265 STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1266 VSTORE(N0) \ 1267 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1268 1269#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1270 STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1271 VSTORE(N0) \ 1272 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1273 1274#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1275 STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1276 VSTORE(N0) \ 1277 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1278 1279#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1280 STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1281 VSTORE(N0) \ 1282 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1283 1284#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1285 STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1286 VSTORE(N0) \ 1287 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1288 1289#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1290 STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1291 VSTORE(N0) \ 1292 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1293 1294#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1295 STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1296 VSTORE(N0) \ 1297 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1298 1299#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1300 STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1301 VSTORE(N0) \ 1302 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1303 1304#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1305 STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1306 VSTORE(N0) \ 1307 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1308 1309#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1310 STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1311 VSTORE(N0) \ 1312 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1313 1314#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1315 STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1316 VSTORE(N0) \ 1317 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1318 1319#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1320 STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1321 VSTORE(N0) \ 1322 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1323 1324#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1325 STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1326 VSTORE(N0) \ 1327 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1328 1329#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1330 STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1331 VSTORE(N0) \ 1332 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1333 1334#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1335 STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1336 VSTORE(N0) \ 1337 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1338 1339 1340 1341#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1342 VSTORE(N0) \ 1343 (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1344 1345#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1346 CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1347 VSTORE(N0) \ 1348 (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1349 1350#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1351 CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1352 VSTORE(N0) \ 1353 (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1354 1355#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1356 CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1357 VSTORE(N0) \ 1358 (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1359 1360#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1361 CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1362 VSTORE(N0) \ 1363 (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1364 1365#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1366 CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1367 VSTORE(N0) \ 1368 (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1369 1370#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1371 CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1372 VSTORE(N0) \ 1373 (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1374 1375#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1376 CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1377 VSTORE(N0) \ 1378 (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1379 1380#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1381 CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1382 VSTORE(N0) \ 1383 (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1384 1385#define CONVERT_STORE_ROW_10(N0, DATA, BASENAME, PTR, STRIDE_Y, Z) \ 1386 CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1387 VSTORE(N0) \ 1388 (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1389 1390#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1391 CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1392 VSTORE(N0) \ 1393 (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1394 1395#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1396 CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1397 VSTORE(N0) \ 1398 (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1399 1400#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1401 CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1402 VSTORE(N0) \ 1403 (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1404 1405#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1406 CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1407 VSTORE(N0) \ 1408 (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1409 1410#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1411 CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1412 VSTORE(N0) \ 1413 (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1414 1415#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1416 CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1417 VSTORE(N0) \ 1418 (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1419 1420 1421 1422 1423#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1424#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1425 1426 1427 1428#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1429#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1430 1431 1432 1433#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1434 VSTORE_PARTIAL(N0, STORE_N0) \ 1435 (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 1436 1437#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1438 STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1439 VSTORE_PARTIAL(N0, STORE_N0) \ 1440 (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 1441 1442#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1443 STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1444 VSTORE_PARTIAL(N0, STORE_N0) \ 1445 (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 1446 1447#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1448 STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1449 VSTORE_PARTIAL(N0, STORE_N0) \ 1450 (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 1451 1452#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1453 STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1454 VSTORE_PARTIAL(N0, STORE_N0) \ 1455 (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 1456 1457#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1458 STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1459 VSTORE_PARTIAL(N0, STORE_N0) \ 1460 (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 1461 1462#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1463 STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1464 VSTORE_PARTIAL(N0, STORE_N0) \ 1465 (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 1466 1467#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1468 STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1469 VSTORE_PARTIAL(N0, STORE_N0) \ 1470 (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 1471 1472#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1473 STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1474 VSTORE_PARTIAL(N0, STORE_N0) \ 1475 (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 1476 1477#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1478 STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1479 VSTORE_PARTIAL(N0, STORE_N0) \ 1480 (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 1481 1482#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1483 STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1484 VSTORE_PARTIAL(N0, STORE_N0) \ 1485 (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1486 1487#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1488 STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1489 VSTORE_PARTIAL(N0, STORE_N0) \ 1490 (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1491 1492#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1493 STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1494 VSTORE_PARTIAL(N0, STORE_N0) \ 1495 (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1496 1497#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1498 STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1499 VSTORE_PARTIAL(N0, STORE_N0) \ 1500 (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1501 1502#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1503 STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1504 VSTORE_PARTIAL(N0, STORE_N0) \ 1505 (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1506 1507#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1508 STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1509 VSTORE_PARTIAL(N0, STORE_N0) \ 1510 (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1511 1512 1513 1514#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1515#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1516 1517#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1518 if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1519 { \ 1520 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1521 } \ 1522 else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1523 { \ 1524 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1525 } \ 1526 else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1527 { \ 1528 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1529 } \ 1530 else \ 1531 { \ 1532 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1533 } 1534 1535#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1536 if(!(PARTIAL_COND_X)) \ 1537 { \ 1538 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1539 } \ 1540 else \ 1541 { \ 1542 STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1543 } 1544 1545#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1546 if(!(PARTIAL_COND_Y)) \ 1547 { \ 1548 STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1549 } \ 1550 else \ 1551 { \ 1552 STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1553 } 1554 1555 1556#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1557 1558 1559#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1560 1561#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1562 STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1563 1564#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1565 1566#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1567 STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) 1568 1569#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1570 1571#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1572 STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1573 1574#else 1575 1576#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1577 STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1578 1579#endif 1580 1581#endif 1582 1583 1584#if defined(PARTIAL_STORE_M0) 1585 1586#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1587 ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1588#else 1589#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1590 ((uint)(y * M0)) 1591#endif 1592 1593 1594 1595#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1596 STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1597 1598 1599#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1600#pragma OPENCL EXTENSION cl_khr_fp16 : enable 1601#endif 1602 1603#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1604#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1605#endif 1606 1607#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1608#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1609#endif 1610 1611#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1612#pragma OPENCL EXTENSION cl_arm_printf : enable 1613#endif 1614 1615#define GPU_ARCH_MIDGARD 0x100 1616#define GPU_ARCH_BIFROST 0x200 1617#define GPU_ARCH_VALHALL 0x300 1618 1619 1620#define CONCAT(a, b) a##b 1621 1622 1623#define EXPAND(x) x 1624 1625 1626#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1627 1628 1629#define REV1(x) ((x)) 1630#define REV2(x) ((x).s10) 1631#define REV3(x) ((x).s210) 1632#define REV4(x) ((x).s3210) 1633#define REV8(x) ((x).s76543210) 1634#define REV16(x) ((x).sFEDCBA9876543210) 1635 1636 1637 1638#define REVERSE_STR(x, s) REV##s((x)) 1639#define REVERSE(x, s) REVERSE_STR(x, s) 1640 1641 1642 1643#define ROT1_0(x) ((x)) 1644#define ROT1_1(x) ((x)) 1645 1646#define ROT2_0(x) ((x)) 1647#define ROT2_1(x) ((x).s10) 1648#define ROT2_2(x) ((x)) 1649 1650#define ROT3_0(x) ((x)) 1651#define ROT3_1(x) ((x).s201) 1652#define ROT3_2(x) ((x).s120) 1653#define ROT3_3(x) ((x)) 1654 1655#define ROT4_0(x) ((x)) 1656#define ROT4_1(x) ((x).s3012) 1657#define ROT4_2(x) ((x).s2301) 1658#define ROT4_3(x) ((x).s1230) 1659#define ROT4_4(x) ((x)) 1660 1661#define ROT8_0(x) ((x)) 1662#define ROT8_1(x) ((x).s70123456) 1663#define ROT8_2(x) ((x).s67012345) 1664#define ROT8_3(x) ((x).s56701234) 1665#define ROT8_4(x) ((x).s45670123) 1666#define ROT8_5(x) ((x).s34567012) 1667#define ROT8_6(x) ((x).s23456701) 1668#define ROT8_7(x) ((x).s12345670) 1669#define ROT8_8(x) ((x)) 1670 1671#define ROT16_0(x) ((x)) 1672#define ROT16_1(x) ((x).sF0123456789ABCDE) 1673#define ROT16_2(x) ((x).sEF0123456789ABCD) 1674#define ROT16_3(x) ((x).sDEF0123456789ABC) 1675#define ROT16_4(x) ((x).sCDEF0123456789AB) 1676#define ROT16_5(x) ((x).sBCDEF0123456789A) 1677#define ROT16_6(x) ((x).sABCDEF0123456789) 1678#define ROT16_7(x) ((x).s9ABCDEF012345678) 1679#define ROT16_8(x) ((x).s89ABCDEF01234567) 1680#define ROT16_9(x) ((x).s789ABCDEF0123456) 1681#define ROT16_10(x) ((x).s6789ABCDEF012345) 1682#define ROT16_11(x) ((x).s56789ABCDEF01234) 1683#define ROT16_12(x) ((x).s456789ABCDEF0123) 1684#define ROT16_13(x) ((x).s3456789ABCDEF012) 1685#define ROT16_14(x) ((x).s23456789ABCDEF01) 1686#define ROT16_15(x) ((x).s123456789ABCDEF0) 1687#define ROT16_16(x) ((x)) 1688 1689 1690 1691#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1692#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1693 1694 1695 1696#define V_OFFS1(dt) (dt##1)(0) 1697#define V_OFFS2(dt) (dt##2)(0, 1) 1698#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1699#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1700#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1701#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1702 1703 1704 1705#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1706#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1707 1708 1709#define VLOAD_STR(size) vload##size 1710#define VLOAD(size) VLOAD_STR(size) 1711 1712 1713#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1714#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1715 1716#define NO_LOAD(data, offs, ptr) \ 1717 { \ 1718 } 1719 1720 1721#define vload_partial_1_0 NO_LOAD 1722#define vload_partial_1_1 vload1 1723#define vload_partial_1_2 NO_LOAD 1724#define vload_partial_1_3 NO_LOAD 1725#define vload_partial_1_4 NO_LOAD 1726#define vload_partial_1_5 NO_LOAD 1727#define vload_partial_1_6 NO_LOAD 1728#define vload_partial_1_7 NO_LOAD 1729#define vload_partial_1_8 NO_LOAD 1730#define vload_partial_1_9 NO_LOAD 1731#define vload_partial_1_10 NO_LOAD 1732#define vload_partial_1_11 NO_LOAD 1733#define vload_partial_1_12 NO_LOAD 1734#define vload_partial_1_13 NO_LOAD 1735#define vload_partial_1_14 NO_LOAD 1736#define vload_partial_1_15 NO_LOAD 1737#define vload_partial_1_16 NO_LOAD 1738 1739#define vload_partial_2_0 NO_LOAD 1740#define vload_partial_2_1 vload_partial_1 1741#define vload_partial_2_2 vload_partial_2 1742#define vload_partial_2_3 NO_LOAD 1743#define vload_partial_2_4 NO_LOAD 1744#define vload_partial_2_5 NO_LOAD 1745#define vload_partial_2_6 NO_LOAD 1746#define vload_partial_2_7 NO_LOAD 1747#define vload_partial_2_8 NO_LOAD 1748#define vload_partial_2_9 NO_LOAD 1749#define vload_partial_2_10 NO_LOAD 1750#define vload_partial_2_11 NO_LOAD 1751#define vload_partial_2_12 NO_LOAD 1752#define vload_partial_2_13 NO_LOAD 1753#define vload_partial_2_14 NO_LOAD 1754#define vload_partial_2_15 NO_LOAD 1755#define vload_partial_2_16 NO_LOAD 1756 1757#define vload_partial_3_0 NO_LOAD 1758#define vload_partial_3_1 vload_partial_1 1759#define vload_partial_3_2 vload_partial_2 1760#define vload_partial_3_3 vload_partial_3 1761#define vload_partial_3_4 NO_LOAD 1762#define vload_partial_3_5 NO_LOAD 1763#define vload_partial_3_6 NO_LOAD 1764#define vload_partial_3_7 NO_LOAD 1765#define vload_partial_3_8 NO_LOAD 1766#define vload_partial_3_9 NO_LOAD 1767#define vload_partial_3_10 NO_LOAD 1768#define vload_partial_3_11 NO_LOAD 1769#define vload_partial_3_12 NO_LOAD 1770#define vload_partial_3_13 NO_LOAD 1771#define vload_partial_3_14 NO_LOAD 1772#define vload_partial_3_15 NO_LOAD 1773#define vload_partial_3_16 NO_LOAD 1774 1775#define vload_partial_4_0 NO_LOAD 1776#define vload_partial_4_1 vload_partial_1 1777#define vload_partial_4_2 vload_partial_2 1778#define vload_partial_4_3 vload_partial_3 1779#define vload_partial_4_4 vload_partial_4 1780#define vload_partial_4_5 NO_LOAD 1781#define vload_partial_4_6 NO_LOAD 1782#define vload_partial_4_7 NO_LOAD 1783#define vload_partial_4_8 NO_LOAD 1784#define vload_partial_4_9 NO_LOAD 1785#define vload_partial_4_10 NO_LOAD 1786#define vload_partial_4_11 NO_LOAD 1787#define vload_partial_4_12 NO_LOAD 1788#define vload_partial_4_13 NO_LOAD 1789#define vload_partial_4_14 NO_LOAD 1790#define vload_partial_4_15 NO_LOAD 1791#define vload_partial_4_16 NO_LOAD 1792 1793#define vload_partial_8_0 NO_LOAD 1794#define vload_partial_8_1 vload_partial_1 1795#define vload_partial_8_2 vload_partial_2 1796#define vload_partial_8_3 vload_partial_3 1797#define vload_partial_8_4 vload_partial_4 1798#define vload_partial_8_5 vload_partial_5 1799#define vload_partial_8_6 vload_partial_6 1800#define vload_partial_8_7 vload_partial_7 1801#define vload_partial_8_8 vload_partial_8 1802#define vload_partial_8_9 NO_LOAD 1803#define vload_partial_8_10 NO_LOAD 1804#define vload_partial_8_11 NO_LOAD 1805#define vload_partial_8_12 NO_LOAD 1806#define vload_partial_8_13 NO_LOAD 1807#define vload_partial_8_14 NO_LOAD 1808#define vload_partial_8_15 NO_LOAD 1809#define vload_partial_8_16 NO_LOAD 1810 1811#define vload_partial_16_0 NO_LOAD 1812#define vload_partial_16_1 vload_partial_1 1813#define vload_partial_16_2 vload_partial_2 1814#define vload_partial_16_3 vload_partial_3 1815#define vload_partial_16_4 vload_partial_4 1816#define vload_partial_16_5 vload_partial_5 1817#define vload_partial_16_6 vload_partial_6 1818#define vload_partial_16_7 vload_partial_7 1819#define vload_partial_16_8 vload_partial_8 1820#define vload_partial_16_9 vload_partial_9 1821#define vload_partial_16_10 vload_partial_10 1822#define vload_partial_16_11 vload_partial_11 1823#define vload_partial_16_12 vload_partial_12 1824#define vload_partial_16_13 vload_partial_13 1825#define vload_partial_16_14 vload_partial_14 1826#define vload_partial_16_15 vload_partial_15 1827#define vload_partial_16_16 vload_partial_16 1828 1829 1830#define vload_partial_1(DATA, OFFSET, PTR) \ 1831 DATA.s0 = vload1(OFFSET, PTR); 1832 1833#define vload_partial_2(DATA, OFFSET, PTR) \ 1834 DATA.s01 = vload2(OFFSET, PTR); 1835 1836#define vload_partial_3(DATA, OFFSET, PTR) \ 1837 DATA.s012 = vload3(OFFSET, PTR); 1838 1839#define vload_partial_4(DATA, OFFSET, PTR) \ 1840 DATA.s0123 = vload4(OFFSET, PTR); 1841 1842#define vload_partial_5(DATA, OFFSET, PTR) \ 1843 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1844 DATA.s4 = vload1(OFFSET, PTR + 4); 1845 1846#define vload_partial_6(DATA, OFFSET, PTR) \ 1847 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1848 vload_partial_2(DATA.s45, OFFSET, PTR + 4); 1849 1850#define vload_partial_7(DATA, OFFSET, PTR) \ 1851 vload_partial_4(DATA.s0123, OFFSET, PTR); \ 1852 vload_partial_3(DATA.s456, OFFSET, PTR + 4); 1853 1854#define vload_partial_8(DATA, OFFSET, PTR) \ 1855 DATA.s01234567 = vload8(OFFSET, PTR); 1856 1857#define vload_partial_9(DATA, OFFSET, PTR) \ 1858 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1859 DATA.s8 = vload1(OFFSET, PTR + 8); 1860 1861#define vload_partial_10(DATA, OFFSET, PTR) \ 1862 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1863 vload_partial_2(DATA.s89, OFFSET, PTR + 8); 1864 1865#define vload_partial_11(DATA, OFFSET, PTR) \ 1866 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1867 vload_partial_3(DATA.s89A, OFFSET, PTR + 8); 1868 1869#define vload_partial_12(DATA, OFFSET, PTR) \ 1870 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1871 vload_partial_4(DATA.s89AB, OFFSET, PTR + 8); 1872 1873#define vload_partial_13(DATA, OFFSET, PTR) \ 1874 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1875 vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8); 1876 1877#define vload_partial_14(DATA, OFFSET, PTR) \ 1878 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1879 vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8); 1880 1881#define vload_partial_15(DATA, OFFSET, PTR) \ 1882 vload_partial_8(DATA.s01234567, OFFSET, PTR); \ 1883 vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8); 1884 1885#define vload_partial_16(DATA, OFFSET, PTR) \ 1886 DATA = vload16(OFFSET, PTR); 1887 1888 1889 1890#define PIXEL_UNIT4 1 1891#define PIXEL_UNIT8 2 1892#define PIXEL_UNIT16 4 1893 1894 1895#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size 1896#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) 1897 1898 1899#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); 1900#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); 1901#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); 1902 1903#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1904#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); 1905#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); 1906#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); 1907#endif 1908 1909#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); 1910#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1911#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1912 1913#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1914#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); 1915#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); 1916#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); 1917#endif 1918 1919 1920#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) 1921#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) 1922 1923 1924#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) 1925#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) 1926 1927#define VSTORE_STR(size) vstore##size 1928#define VSTORE(size) VSTORE_STR(size) 1929 1930#define float1 float 1931#define half1 half 1932#define char1 char 1933#define uchar1 uchar 1934#define short1 short 1935#define ushort1 ushort 1936#define int1 int 1937#define uint1 uint 1938#define long1 long 1939#define ulong1 ulong 1940#define double1 double 1941 1942#define vload1(OFFSET, PTR) *(OFFSET + PTR) 1943#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA 1944 1945 1946#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size 1947#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size) 1948 1949#define NO_STORE(data, offs, ptr) \ 1950 { \ 1951 } 1952 1953 1954#define vstore_partial_1_0 NO_STORE 1955#define vstore_partial_1_1 vstore1 1956#define vstore_partial_1_2 NO_STORE 1957#define vstore_partial_1_3 NO_STORE 1958#define vstore_partial_1_4 NO_STORE 1959#define vstore_partial_1_5 NO_STORE 1960#define vstore_partial_1_6 NO_STORE 1961#define vstore_partial_1_7 NO_STORE 1962#define vstore_partial_1_8 NO_STORE 1963#define vstore_partial_1_9 NO_STORE 1964#define vstore_partial_1_10 NO_STORE 1965#define vstore_partial_1_11 NO_STORE 1966#define vstore_partial_1_12 NO_STORE 1967#define vstore_partial_1_13 NO_STORE 1968#define vstore_partial_1_14 NO_STORE 1969#define vstore_partial_1_15 NO_STORE 1970#define vstore_partial_1_16 NO_STORE 1971 1972#define vstore_partial_2_0 NO_STORE 1973#define vstore_partial_2_1 vstore_partial_1 1974#define vstore_partial_2_2 vstore_partial_2 1975#define vstore_partial_2_3 NO_STORE 1976#define vstore_partial_2_4 NO_STORE 1977#define vstore_partial_2_5 NO_STORE 1978#define vstore_partial_2_6 NO_STORE 1979#define vstore_partial_2_7 NO_STORE 1980#define vstore_partial_2_8 NO_STORE 1981#define vstore_partial_2_9 NO_STORE 1982#define vstore_partial_2_10 NO_STORE 1983#define vstore_partial_2_11 NO_STORE 1984#define vstore_partial_2_12 NO_STORE 1985#define vstore_partial_2_13 NO_STORE 1986#define vstore_partial_2_14 NO_STORE 1987#define vstore_partial_2_15 NO_STORE 1988#define vstore_partial_2_16 NO_STORE 1989 1990#define vstore_partial_3_0 NO_STORE 1991#define vstore_partial_3_1 vstore_partial_1 1992#define vstore_partial_3_2 vstore_partial_2 1993#define vstore_partial_3_3 vstore_partial_3 1994#define vstore_partial_3_4 NO_STORE 1995#define vstore_partial_3_5 NO_STORE 1996#define vstore_partial_3_6 NO_STORE 1997#define vstore_partial_3_7 NO_STORE 1998#define vstore_partial_3_8 NO_STORE 1999#define vstore_partial_3_9 NO_STORE 2000#define vstore_partial_3_10 NO_STORE 2001#define vstore_partial_3_11 NO_STORE 2002#define vstore_partial_3_12 NO_STORE 2003#define vstore_partial_3_13 NO_STORE 2004#define vstore_partial_3_14 NO_STORE 2005#define vstore_partial_3_15 NO_STORE 2006#define vstore_partial_3_16 NO_STORE 2007 2008#define vstore_partial_4_0 NO_STORE 2009#define vstore_partial_4_1 vstore_partial_1 2010#define vstore_partial_4_2 vstore_partial_2 2011#define vstore_partial_4_3 vstore_partial_3 2012#define vstore_partial_4_4 vstore_partial_4 2013#define vstore_partial_4_5 NO_STORE 2014#define vstore_partial_4_6 NO_STORE 2015#define vstore_partial_4_7 NO_STORE 2016#define vstore_partial_4_8 NO_STORE 2017#define vstore_partial_4_9 NO_STORE 2018#define vstore_partial_4_10 NO_STORE 2019#define vstore_partial_4_11 NO_STORE 2020#define vstore_partial_4_12 NO_STORE 2021#define vstore_partial_4_13 NO_STORE 2022#define vstore_partial_4_14 NO_STORE 2023#define vstore_partial_4_15 NO_STORE 2024#define vstore_partial_4_16 NO_STORE 2025 2026#define vstore_partial_8_0 NO_STORE 2027#define vstore_partial_8_1 vstore_partial_1 2028#define vstore_partial_8_2 vstore_partial_2 2029#define vstore_partial_8_3 vstore_partial_3 2030#define vstore_partial_8_4 vstore_partial_4 2031#define vstore_partial_8_5 vstore_partial_5 2032#define vstore_partial_8_6 vstore_partial_6 2033#define vstore_partial_8_7 vstore_partial_7 2034#define vstore_partial_8_8 vstore_partial_8 2035#define vstore_partial_8_9 NO_STORE 2036#define vstore_partial_8_10 NO_STORE 2037#define vstore_partial_8_11 NO_STORE 2038#define vstore_partial_8_12 NO_STORE 2039#define vstore_partial_8_13 NO_STORE 2040#define vstore_partial_8_14 NO_STORE 2041#define vstore_partial_8_15 NO_STORE 2042#define vstore_partial_8_16 NO_STORE 2043 2044#define vstore_partial_16_0 NO_STORE 2045#define vstore_partial_16_1 vstore_partial_1 2046#define vstore_partial_16_2 vstore_partial_2 2047#define vstore_partial_16_3 vstore_partial_3 2048#define vstore_partial_16_4 vstore_partial_4 2049#define vstore_partial_16_5 vstore_partial_5 2050#define vstore_partial_16_6 vstore_partial_6 2051#define vstore_partial_16_7 vstore_partial_7 2052#define vstore_partial_16_8 vstore_partial_8 2053#define vstore_partial_16_9 vstore_partial_9 2054#define vstore_partial_16_10 vstore_partial_10 2055#define vstore_partial_16_11 vstore_partial_11 2056#define vstore_partial_16_12 vstore_partial_12 2057#define vstore_partial_16_13 vstore_partial_13 2058#define vstore_partial_16_14 vstore_partial_14 2059#define vstore_partial_16_15 vstore_partial_15 2060#define vstore_partial_16_16 vstore_partial_16 2061 2062 2063#define vstore_partial_1(DATA, OFFSET, PTR) \ 2064 vstore1(DATA.s0, OFFSET, PTR); 2065 2066#define vstore_partial_2(DATA, OFFSET, PTR) \ 2067 vstore2(DATA.s01, OFFSET, PTR); 2068 2069#define vstore_partial_3(DATA, OFFSET, PTR) \ 2070 vstore3(DATA.s012, OFFSET, PTR); 2071 2072#define vstore_partial_4(DATA, OFFSET, PTR) \ 2073 vstore4(DATA.s0123, OFFSET, PTR); 2074 2075#define vstore_partial_5(DATA, OFFSET, PTR) \ 2076 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2077 vstore1(DATA.s4, OFFSET, PTR + 4); 2078 2079#define vstore_partial_6(DATA, OFFSET, PTR) \ 2080 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2081 vstore_partial_2(DATA.s45, OFFSET, PTR + 4); 2082 2083#define vstore_partial_7(DATA, OFFSET, PTR) \ 2084 vstore_partial_4(DATA.s0123, OFFSET, PTR); \ 2085 vstore_partial_3(DATA.s456, OFFSET, PTR + 4); 2086 2087#define vstore_partial_8(DATA, OFFSET, PTR) \ 2088 vstore8(DATA.s01234567, OFFSET, PTR); 2089 2090#define vstore_partial_9(DATA, OFFSET, PTR) \ 2091 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2092 vstore1(DATA.s8, OFFSET, PTR + 8); 2093 2094#define vstore_partial_10(DATA, OFFSET, PTR) \ 2095 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2096 vstore_partial_2(DATA.s89, OFFSET, PTR + 8); 2097 2098#define vstore_partial_11(DATA, OFFSET, PTR) \ 2099 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2100 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8); 2101 2102#define vstore_partial_12(DATA, OFFSET, PTR) \ 2103 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2104 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8); 2105 2106#define vstore_partial_13(DATA, OFFSET, PTR) \ 2107 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2108 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8); 2109 2110#define vstore_partial_14(DATA, OFFSET, PTR) \ 2111 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2112 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8); 2113 2114#define vstore_partial_15(DATA, OFFSET, PTR) \ 2115 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \ 2116 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8); 2117 2118#define vstore_partial_16(DATA, OFFSET, PTR) \ 2119 vstore16(DATA, OFFSET, PTR); 2120 2121 2122 2123 2124 2125#define convert_float_sat convert_float 2126#define convert_float1_sat convert_float 2127#define convert_float2_sat convert_float2 2128#define convert_float3_sat convert_float3 2129#define convert_float4_sat convert_float4 2130#define convert_float8_sat convert_float8 2131#define convert_float16_sat convert_float16 2132#define convert_half_sat convert_float 2133#define convert_half1_sat convert_half 2134#define convert_half2_sat convert_half2 2135#define convert_half3_sat convert_half3 2136#define convert_half4_sat convert_half4 2137#define convert_half8_sat convert_half8 2138#define convert_half16_sat convert_half16 2139 2140#define convert_float1 convert_float 2141#define convert_half1 convert_half 2142#define convert_char1 convert_char 2143#define convert_uchar1 convert_uchar 2144#define convert_short1 convert_short 2145#define convert_ushort1 convert_ushort 2146#define convert_int1 convert_int 2147#define convert_uint1 convert_uint 2148#define convert_long1 convert_long 2149#define convert_ulong1 convert_ulong 2150#define convert_double1 convert_double 2151 2152#define convert_char1_sat convert_char_sat 2153#define convert_uchar1_sat convert_uchar_sat 2154#define convert_uchar2_sat convert_uchar2_sat 2155#define convert_uchar3_sat convert_uchar3_sat 2156#define convert_uchar4_sat convert_uchar4_sat 2157#define convert_uchar8_sat convert_uchar8_sat 2158#define convert_uchar16_sat convert_uchar16_sat 2159#define convert_short1_sat convert_short_sat 2160#define convert_ushort1_sat convert_ushort_sat 2161#define convert_int1_sat convert_int_sat 2162#define convert_uint1_sat convert_uint_sat 2163#define convert_long1_sat convert_long_sat 2164#define convert_ulong1_sat convert_ulong_sat 2165#define convert_double1_sat convert_double_sat 2166 2167#define VEC_DATA_TYPE_STR(type, size) type##size 2168#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) 2169 2170#define CONVERT_STR(x, type) (convert_##type((x))) 2171#define CONVERT(x, type) CONVERT_STR(x, type) 2172 2173#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) 2174#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) 2175 2176#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) 2177#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) 2178 2179#define select_vec_dt_uchar(size) uchar##size 2180#define select_vec_dt_char(size) char##size 2181#define select_vec_dt_ushort(size) ushort##size 2182#define select_vec_dt_short(size) short##size 2183#define select_vec_dt_half(size) short##size 2184#define select_vec_dt_uint(size) uint##size 2185#define select_vec_dt_int(size) int##size 2186#define select_vec_dt_float(size) int##size 2187#define select_vec_dt_ulong(size) ulong##size 2188#define select_vec_dt_long(size) long##size 2189 2190#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size) 2191#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size) 2192#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1) 2193 2194#define signed_int_vec_dt_uchar(size) char##size 2195#define signed_int_vec_dt_char(size) char##size 2196#define signed_int_vec_dt_ushort(size) short##size 2197#define signed_int_vec_dt_short(size) short##size 2198#define signed_int_vec_dt_half(size) short##size 2199#define signed_int_vec_dt_uint(size) int##size 2200#define signed_int_vec_dt_int(size) int##size 2201#define signed_int_vec_dt_float(size) int##size 2202#define signed_int_vec_dt_ulong(size) long##size 2203#define signed_int_vec_dt_long(size) long##size 2204 2205#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size) 2206#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size) 2207#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1) 2208 2209#define sum_reduce_1(x) (x) 2210#define sum_reduce_2(x) ((x).s0) + ((x).s1) 2211#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2) 2212#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23) 2213#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567) 2214#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF) 2215 2216#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x) 2217#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size) 2218 2219#define prod_reduce_1(x) (x) 2220#define prod_reduce_2(x) ((x).s0) * ((x).s1) 2221#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2) 2222#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23) 2223#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567) 2224#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF) 2225 2226#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x) 2227#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size) 2228 2229#define max_reduce_1(x) (x) 2230#define max_reduce_2(x) max(((x).s0), ((x).s1)) 2231#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2)) 2232#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23)) 2233#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567)) 2234#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF)) 2235 2236#define MAX_REDUCE_STR(x, size) max_reduce_##size(x) 2237#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 2238 2239#define VECTOR_DECLARATION(name) \ 2240 __global uchar *name##_ptr, \ 2241 uint name##_stride_x, \ 2242 uint name##_step_x, \ 2243 uint name##_offset_first_element_in_bytes 2244 2245#define IMAGE_DECLARATION(name) \ 2246 __global uchar *name##_ptr, \ 2247 uint name##_stride_x, \ 2248 uint name##_step_x, \ 2249 uint name##_stride_y, \ 2250 uint name##_step_y, \ 2251 uint name##_offset_first_element_in_bytes 2252 2253#define TENSOR3D_DECLARATION(name) \ 2254 __global uchar *name##_ptr, \ 2255 uint name##_stride_x, \ 2256 uint name##_step_x, \ 2257 uint name##_stride_y, \ 2258 uint name##_step_y, \ 2259 uint name##_stride_z, \ 2260 uint name##_step_z, \ 2261 uint name##_offset_first_element_in_bytes 2262 2263#define TENSOR4D_DECLARATION(name) \ 2264 __global uchar *name##_ptr, \ 2265 uint name##_stride_x, \ 2266 uint name##_step_x, \ 2267 uint name##_stride_y, \ 2268 uint name##_step_y, \ 2269 uint name##_stride_z, \ 2270 uint name##_step_z, \ 2271 uint name##_stride_w, \ 2272 uint name##_step_w, \ 2273 uint name##_offset_first_element_in_bytes 2274 2275#define TENSOR5D_DECLARATION(name) \ 2276 __global uchar *name##_ptr, \ 2277 uint name##_stride_x, \ 2278 uint name##_step_x, \ 2279 uint name##_stride_y, \ 2280 uint name##_step_y, \ 2281 uint name##_stride_z, \ 2282 uint name##_step_z, \ 2283 uint name##_stride_w, \ 2284 uint name##_step_w, \ 2285 uint name##_stride_v, \ 2286 uint name##_step_v, \ 2287 uint name##_offset_first_element_in_bytes 2288 2289#define CONVERT_TO_VECTOR_STRUCT(name) \ 2290 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 2291 2292#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 2293 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 2294 2295#define CONVERT_TO_IMAGE_STRUCT(name) \ 2296 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 2297 2298#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 2299 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 2300 2301#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2302 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2303 2304#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 2305 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 2306 2307#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 2308 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 2309 2310#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 2311 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2312 name##_stride_z, name##_step_z) 2313 2314#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 2315 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 2316 2317#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 2318 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2319 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 2320 2321#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 2322 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 2323 2324#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 2325 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 2326 name##_stride_z, name##_step_z) 2327 2328 2329typedef struct Vector 2330{ 2331 __global uchar *ptr; 2332 int offset_first_element_in_bytes; 2333 int stride_x; 2334} Vector; 2335 2336 2337typedef struct Image 2338{ 2339 __global uchar *ptr; 2340 int offset_first_element_in_bytes; 2341 int stride_x; 2342 int stride_y; 2343} Image; 2344 2345 2346typedef struct Tensor3D 2347{ 2348 __global uchar *ptr; 2349 int offset_first_element_in_bytes; 2350 int stride_x; 2351 int stride_y; 2352 int stride_z; 2353} Tensor3D; 2354 2355 2356typedef struct Tensor4D 2357{ 2358 __global uchar *ptr; 2359 int offset_first_element_in_bytes; 2360 int stride_x; 2361 int stride_y; 2362 int stride_z; 2363 int stride_w; 2364} Tensor4D; 2365 2366 2367inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 2368{ 2369 Vector vector = 2370 { 2371 .ptr = ptr, 2372 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2373 .stride_x = stride_x, 2374 }; 2375 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 2376 return vector; 2377} 2378 2379 2380inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 2381{ 2382 Image img = 2383 { 2384 .ptr = ptr, 2385 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2386 .stride_x = stride_x, 2387 .stride_y = stride_y 2388 }; 2389 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 2390 return img; 2391} 2392 2393 2394inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2395{ 2396 Image img = 2397 { 2398 .ptr = ptr, 2399 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2400 .stride_x = stride_x, 2401 .stride_y = stride_y 2402 }; 2403 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2404 return img; 2405} 2406 2407 2408inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2409{ 2410 Tensor3D tensor = 2411 { 2412 .ptr = ptr, 2413 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2414 .stride_x = stride_x, 2415 .stride_y = stride_y, 2416 .stride_z = stride_z 2417 }; 2418 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 2419 return tensor; 2420} 2421 2422 2423inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 2424{ 2425 Tensor3D tensor = 2426 { 2427 .ptr = ptr, 2428 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2429 .stride_x = stride_x, 2430 .stride_y = stride_y, 2431 .stride_z = stride_z 2432 }; 2433 return tensor; 2434} 2435 2436inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 2437 uint step_w, 2438 uint mod_size) 2439{ 2440 Tensor4D tensor = 2441 { 2442 .ptr = ptr, 2443 .offset_first_element_in_bytes = offset_first_element_in_bytes, 2444 .stride_x = stride_x, 2445 .stride_y = stride_y, 2446 .stride_z = stride_z, 2447 .stride_w = stride_w 2448 }; 2449 2450 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 2451 return tensor; 2452} 2453 2454 2455inline __global const uchar *vector_offset(const Vector *vec, int x) 2456{ 2457 return vec->ptr + x * vec->stride_x; 2458} 2459 2460 2461inline __global uchar *offset(const Image *img, int x, int y) 2462{ 2463 return img->ptr + x * img->stride_x + y * img->stride_y; 2464} 2465 2466 2467inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 2468{ 2469 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 2470} 2471 2472 2473inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 2474{ 2475 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 2476} 2477 2478 2479inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 2480{ 2481 uint num_elements = width * height; 2482 2483 const uint z = index / num_elements; 2484 2485 index %= num_elements; 2486 2487 const uint y = index / width; 2488 2489 index %= width; 2490 2491 const uint x = index; 2492 2493 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 2494} 2495 2496#endif 2497 2498 2499 2500#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) 2501#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ 2502 P_X##_DEF(1, P_A, P_B, P_C); \ 2503 REPEAT_3_1(P_X, P_A, P_B, P_C) 2504#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ 2505 P_X##_DEF(2, P_A, P_B, P_C); \ 2506 REPEAT_3_2(P_X, P_A, P_B, P_C) 2507#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ 2508 P_X##_DEF(3, P_A, P_B, P_C); \ 2509 REPEAT_3_3(P_X, P_A, P_B, P_C) 2510#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ 2511 P_X##_DEF(4, P_A, P_B, P_C); \ 2512 REPEAT_3_4(P_X, P_A, P_B, P_C) 2513#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ 2514 P_X##_DEF(5, P_A, P_B, P_C); \ 2515 REPEAT_3_5(P_X, P_A, P_B, P_C) 2516#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ 2517 P_X##_DEF(6, P_A, P_B, P_C); \ 2518 REPEAT_3_6(P_X, P_A, P_B, P_C) 2519#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ 2520 P_X##_DEF(7, P_A, P_B, P_C); \ 2521 REPEAT_3_7(P_X, P_A, P_B, P_C) 2522#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ 2523 P_X##_DEF(8, P_A, P_B, P_C); \ 2524 REPEAT_3_8(P_X, P_A, P_B, P_C) 2525#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 2526 P_X##_DEF(9, P_A, P_B, P_C); \ 2527 REPEAT_3_9(P_X, P_A, P_B, P_C) 2528#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 2529 P_X##_DEF(A, P_A, P_B, P_C); \ 2530 REPEAT_3_10(P_X, P_A, P_B, P_C) 2531#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 2532 P_X##_DEF(B, P_A, P_B, P_C); \ 2533 REPEAT_3_11(P_X, P_A, P_B, P_C) 2534#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 2535 P_X##_DEF(C, P_A, P_B, P_C); \ 2536 REPEAT_3_12(P_X, P_A, P_B, P_C) 2537#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 2538 P_X##_DEF(D, P_A, P_B, P_C); \ 2539 REPEAT_3_13(P_X, P_A, P_B, P_C) 2540#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 2541 P_X##_DEF(E, P_A, P_B, P_C); \ 2542 REPEAT_3_14(P_X, P_A, P_B, P_C) 2543#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 2544 P_X##_DEF(F, P_A, P_B, P_C); \ 2545 REPEAT_3_15(P_X, P_A, P_B, P_C) 2546 2547#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 2548#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 2549 2550 2551#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 2552#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 2553 P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 2554 REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 2555#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 2556 P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 2557 REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) 2558#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 2559 P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 2560 REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 2561#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ 2562 P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 2563 REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 2564#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \ 2565 P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 2566 REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 2567#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 2568 P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 2569 REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 2570#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 2571 P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 2572 REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 2573#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 2574 P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 2575 REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 2576#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 2577 P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 2578 REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 2579#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 2580 P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 2581 REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 2582#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 2583 P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 2584 REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 2585#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 2586 P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 2587 REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 2588#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 2589 P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 2590 REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 2591#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 2592 P_X##_DEF(E, P_A, P_B, P_C, P_D); \ 2593 REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 2594#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 2595 P_X##_DEF(F, P_A, P_B, P_C, P_D); \ 2596 REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 2597 2598#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 2599#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 2600 2601 2602#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 2603#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 2604 2605 2606#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 2607#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 2608 2609 2610#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 2611#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 2612 2613 2614#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 2615#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 2616 2617 2618#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 2619#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 2620 2621 2622#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC 2623#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 2624 2625 2626#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 2627#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 2628 2629 2630#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 2631#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 2632 2633 2634#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 2635#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 2636 2637 2638#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 2639#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 2640 2641 2642#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 2643#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 2644 2645 2646#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \ 2647 ({ \ 2648 VEC_DATA_TYPE(int, N0) \ 2649 VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 2650 VEC_DATA_TYPE(int, N0) \ 2651 VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 2652 VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 2653 }) 2654#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 2655 2656#endif 2657 2658#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) 2659 2660#if VEC_SIZE_X == 1 2661#if VEC_SIZE_Y == 1 2662#define TRANSPOSED_U(val) \ 2663 { \ 2664 u0 \ 2665 } 2666#elif VEC_SIZE_Y == 2 2667#define TRANSPOSED_U(val) \ 2668 { \ 2669 u0, u1 \ 2670 } 2671#elif VEC_SIZE_Y == 3 2672#define TRANSPOSED_U(val) \ 2673 { \ 2674 u0, u1, u2 \ 2675 } 2676#elif VEC_SIZE_Y == 4 2677#define TRANSPOSED_U(val) \ 2678 { \ 2679 u0, u1, u2, u3 \ 2680 } 2681#elif VEC_SIZE_Y == 8 2682#define TRANSPOSED_U(val) \ 2683 { \ 2684 u0, u1, u2, u3, u4, u5, u6, u7 \ 2685 } 2686#elif VEC_SIZE_Y == 16 2687#define TRANSPOSED_U(val) \ 2688 { \ 2689 u0, u1, u2, u3, u4, u5, u6, u7, \ 2690 u8, u9, u10, u11, u12, u13, u14, u15 \ 2691 } 2692#endif 2693#else 2694#if VEC_SIZE_Y == 1 2695#define TRANSPOSED_U(val) \ 2696 { \ 2697 u0.val \ 2698 } 2699#elif VEC_SIZE_Y == 2 2700#define TRANSPOSED_U(val) \ 2701 { \ 2702 u0.val, u1.val \ 2703 } 2704#elif VEC_SIZE_Y == 3 2705#define TRANSPOSED_U(val) \ 2706 { \ 2707 u0.val, u1.val, u2.val \ 2708 } 2709#elif VEC_SIZE_Y == 4 2710#define TRANSPOSED_U(val) \ 2711 { \ 2712 u0.val, u1.val, u2.val, u3.val \ 2713 } 2714#elif VEC_SIZE_Y == 8 2715#define TRANSPOSED_U(val) \ 2716 { \ 2717 u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \ 2718 } 2719#elif VEC_SIZE_Y == 16 2720#define TRANSPOSED_U(val) \ 2721 { \ 2722 u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \ 2723 u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \ 2724 } 2725#endif 2726#endif 2727 2728#if DATA_TYPE_IN_BYTES == 4 2729#define DATA_TYPE uint 2730#elif DATA_TYPE_IN_BYTES == 2 2731#define DATA_TYPE ushort 2732#elif DATA_TYPE_IN_BYTES == 1 2733#define DATA_TYPE uchar 2734#else 2735#error DATA_TYPE_IN_BYTES not supported for transpose 2736#endif 2737 2738 2739__kernel void transpose(IMAGE_DECLARATION(src), 2740 IMAGE_DECLARATION(dst)) 2741{ 2742 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); 2743 uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0); 2744 2745 2746 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y; 2747 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y; 2748 2749 2750 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2751 u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr); 2752#if VEC_SIZE_Y > 1 2753 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2754 u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y)); 2755#endif 2756#if VEC_SIZE_Y > 2 2757 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2758 u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); 2759#endif 2760#if VEC_SIZE_Y > 3 2761 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2762 u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); 2763#endif 2764#if VEC_SIZE_Y > 4 2765 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2766 u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); 2767 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2768 u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); 2769 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2770 u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y)); 2771 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2772 u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y)); 2773#endif 2774#if VEC_SIZE_Y > 8 2775 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2776 u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y)); 2777 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2778 u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y)); 2779 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2780 u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y)); 2781 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2782 u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y)); 2783 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2784 u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y)); 2785 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2786 u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y)); 2787 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2788 u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y)); 2789 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) 2790 u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y)); 2791#endif 2792 2793 2794 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2795 t0 = TRANSPOSED_U(s0); 2796#if VEC_SIZE_X > 1 2797 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2798 t1 = TRANSPOSED_U(s1); 2799#endif 2800#if VEC_SIZE_X > 2 2801 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2802 t2 = TRANSPOSED_U(s2); 2803#endif 2804#if VEC_SIZE_X > 3 2805 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2806 t3 = TRANSPOSED_U(s3); 2807#endif 2808#if VEC_SIZE_X > 4 2809 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2810 t4 = TRANSPOSED_U(s4); 2811 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2812 t5 = TRANSPOSED_U(s5); 2813 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2814 t6 = TRANSPOSED_U(s6); 2815 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2816 t7 = TRANSPOSED_U(s7); 2817#endif 2818#if VEC_SIZE_X > 8 2819 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2820 t8 = TRANSPOSED_U(s8); 2821 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2822 t9 = TRANSPOSED_U(s9); 2823 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2824 tA = TRANSPOSED_U(sA); 2825 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2826 tB = TRANSPOSED_U(sB); 2827 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2828 tC = TRANSPOSED_U(sC); 2829 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2830 tD = TRANSPOSED_U(sD); 2831 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2832 tE = TRANSPOSED_U(sE); 2833 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) 2834 tF = TRANSPOSED_U(sF); 2835#endif 2836 2837 2838 REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); 2839 STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0 2840 && get_global_id(0) == 0, 2841 VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); 2842} 2843 2844#endif )"