R"(
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

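// Illustrative expansion (a reading aid, not part of the upstream helper set): with N0 = 4 floats
// per row, STORE_ROW_2(4, float, c, dst_addr, dst_stride_y, zout) expands to roughly
//     vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout0));
//     vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout1));
// where c0/c1, dst_addr, dst_stride_y and zout0/zout1 are hypothetical kernel-side names used
// only for this sketch.
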
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

// Note: the second parameter was originally named DATA while the body uses DATA_TYPE; it is
// renamed here so that CONVERT_STORE_ROW_10 expands correctly like its siblings.
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

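// Note on the partial-store helpers (a reading aid, not upstream documentation): the
// STORE_BLOCK_PARTIAL_IN_* macros above and below select at run time how many rows/columns of the
// register block are actually written. For example, assuming a 4x4 float block with one leftover
// row (PARTIAL_STORE_M0 == 1), STORE_BLOCK_PARTIAL_IN_Y(4, 4, float, c, dst_addr, dst_stride_y,
// zout, 1, cond_y) stores four full rows when cond_y is false and a single row at the bottom edge
// when it is true. c, dst_addr, dst_stride_y, zout and cond_y are hypothetical kernel-side names.
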
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif

#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

#define CONCAT(a, b) a##b

#define EXPAND(x) x

#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

#define NO_LOAD(data, offs, ptr) \
    { \
    }

#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR) \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR) \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

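// Illustrative usage (a hedged sketch, not upstream documentation): the read_image2d_* helpers
// above and the write_image2d_* helpers below move 1, 2 or 4 RGBA pixel units per call, so a
// 16-wide float row can be fetched through the READ_IMAGE2D dispatcher defined further down as
//     float16 row = READ_IMAGE2D(float, 4, lhs_img, x_px, y_px);
// where row, lhs_img, x_px and y_px are hypothetical kernel-side names.
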
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

#define vload1(OFFSET, PTR) *(OFFSET + PTR)
#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA

#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

#define NO_STORE(data, offs, ptr) \
    { \
    }

#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR) \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR) \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_float
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

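// Illustrative usage (a hedged sketch): VEC_DATA_TYPE and CONVERT_SAT are typically combined to
// build a typed vector and narrow it with saturation, e.g.
//     VEC_DATA_TYPE(int, 4) acc = (int4)(0);
//     uchar4 out = CONVERT_SAT(acc, VEC_DATA_TYPE(uchar, 4));
// which expand to int4 and convert_uchar4_sat respectively; acc and out are hypothetical names
// used only for this example.
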
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

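// Illustrative usage (a hedged sketch): the *_REDUCE helpers fold a vector into a scalar, e.g.
//     float4 v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
//     float s = SUM_REDUCE(v, 4); // effectively v.s0 + v.s1 + v.s2 + v.s3 == 10.0f
// v and s are hypothetical names used only for this example.
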
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

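// Hedged usage sketch (not part of the upstream header): a kernel typically receives its tensors
// via the *_DECLARATION macros and converts them into the structs defined below, e.g.
//     __kernel void copy_elem(IMAGE_DECLARATION(src), IMAGE_DECLARATION(dst))
//     {
//         Image src_img = CONVERT_TO_IMAGE_STRUCT(src);
//         Image dst_img = CONVERT_TO_IMAGE_STRUCT(dst);
//         *(__global float *)dst_img.ptr = *(__global float *)src_img.ptr;
//     }
// copy_elem, src and dst are hypothetical names; real kernels live in the .cl files that include
// this helper header.
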
typedef struct Vector
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
} Vector;

typedef struct Image
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
} Image;

typedef struct Tensor3D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
} Tensor3D;

typedef struct Tensor4D
{
    __global uchar *ptr;
    int offset_first_element_in_bytes;
    int stride_x;
    int stride_y;
    int stride_z;
    int stride_w;
} Tensor4D;

inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z
    };
    return tensor;
}

inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor =
    {
        .ptr = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x = stride_x,
        .stride_y = stride_y,
        .stride_z = stride_z,
        .stride_w = stride_w
    };

    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
    return tensor;
}

inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    return vec->ptr + x * vec->stride_x;
}

inline __global uchar *offset(const Image *img, int x, int y)
{
    return img->ptr + x * img->stride_x + y * img->stride_y;
}

inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
}

inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
}

inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    uint num_elements = width * height;

    const uint z = index / num_elements;

    index %= num_elements;

    const uint y = index / width;

    index %= width;

    const uint x = index;

    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
}

#endif
)"