1/* 2 * Copyright (c) 2018-2022 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "activation_float_helpers.h" 25#include "helpers.h" 26#include "tile_helpers.h" 27 28#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) 29#if defined(VEC_SIZE) && VEC_SIZE == 2 30#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC) 31/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC 32 * 33 * @note must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 34 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 35 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 36 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 37 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 38 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 39 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 40 * 41 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 42 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 43 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 44 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 45 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 46 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 47 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 48 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 49 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 50 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 51 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 52 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 53 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 54 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 55 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 56 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 57 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 58 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 59 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 60 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 61 * @param[in] _ISRC_HEIGHT The source tensor's height 62 * @param[in] _IDST_WIDTH The destination tensor's width 63 * @param[in] _IDST_HEIGHT The destination tensor's height 64 */ 65__kernel void winograd_output_transform_2x2_7x7_nhwc( 66 TENSOR4D(src, BUFFER), 67 TENSOR4D(dst, BUFFER), 68#if defined(HAS_BIAS) 69 VECTOR_DECLARATION(bias), 70#endif // defined(HAS_BIAS) 71 int dst_size, 72 const int _ISRC_HEIGHT, 73 const int _IDST_WIDTH, 74 const int _IDST_HEIGHT) 75{ 76 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM 77 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES 78 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 79 80 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; 81 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; 82 83#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 84 TILE(DATA_TYPE, 8, N0, in); 85 TILE(DATA_TYPE, 2, N0, out); 86 TILE(uint, 8, 1, src_indirect_y); 87 88 // Calculate the indirect Y for the source tensor 89 LOOP_UNROLLING(int, i, 0, 1, 8, 90 { 91 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; 92 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); 93 }) 94 95 // Initialize the input tile 96 LOOP_UNROLLING(int, i, 0, 1, 8, 97 { 98 in[i].v = 0; 99 }) 100 101 // Load the values across the 8 channels to compose the 8x1 tile 102 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 103 104 // Compute out0 and out01 105 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v; 106 out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v; 107 108#if defined(HAS_BIAS) 109 // Add bias 110 TILE(DATA_TYPE, 1, N0, b); 111 112 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 113 114 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out); 115#endif // defined(HAS_BIAS) 116 117 T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 118 119 TILE(uint, 2, 1, dst_indirect_y); 120 121#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 122 LOOP_UNROLLING(int, yk, 0, 1, 2, 123 { 124 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); 125 dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH); 126 }) 127#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 128 LOOP_UNROLLING(int, xk, 0, 1, 2, 129 { 130 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); 131 dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH); 132 }) 133#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 134 135 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 136 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 137 138#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 139 140 TILE(DATA_TYPE, 64, N0, in); 141 TILE(DATA_TYPE, 4, N0, out); 142 TILE(DATA_TYPE, 16, N0, tmp); 143 TILE(uint, 64, 1, src_indirect_y); 144 145 // Calculate the indirect Y for the source tensor 146 LOOP_UNROLLING(int, i, 0, 1, 64, 147 { 148 src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; 149 src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); 150 }) 151 152 // Initialize the input tile 153 LOOP_UNROLLING(int, i, 0, 1, 64, 154 { 155 in[i].v = 0; 156 }) 157 158 // Load the values across the 64 channels to compose the 8x8 tile 159 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 160 161 LOOP_UNROLLING(int, i, 0, 1, 8, 162 { 163 tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v; 164 tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v; 165 }) 166 167 // Compute the 2x2 output tile 168 LOOP_UNROLLING(int, i, 0, 1, 2, 169 { 170 out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v; 171 out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v; 172 }) 173 174#if defined(HAS_BIAS) 175 // Add bias 176 TILE(DATA_TYPE, 1, N0, b); 177 178 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 179 180 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); 181#endif // defined(HAS_BIAS) 182 183 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 184 185 TILE(uint, 4, 1, dst_indirect_y); 186 187 // Calculate the destination indirect Y 188 LOOP_UNROLLING(int, yk, 0, 1, 2, 189 { 190 LOOP_UNROLLING(int, xk, 0, 1, 2, 191 { 192 int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); 193 int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); 194 dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH; 195 dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); 196 }) 197 }) 198 199 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 200 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 201#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 202} 203#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC) 204#endif // defined(VEC_SIZE) && VEC_SIZE == 2 205 206#if defined(VEC_SIZE) && VEC_SIZE == 4 207#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC) 208/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC 209 * 210 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 211 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 212 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 213 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 214 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 215 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 216 * 217 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 218 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 219 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 220 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 221 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 222 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 223 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 224 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 225 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 226 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 227 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 228 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 229 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 230 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 231 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 232 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 233 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 234 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 235 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 236 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 237 * @param[in] dst_size Size of the destination tensor, minus the last padding 238 * @param[in] SRC_HEIGHT The source tensor's height 239 * @param[in] DST_WIDTH The destination tensor's width 240 * @param[in] DST_HEIGHT The destination tensor's height 241 */ 242__kernel void winograd_output_transform_4x4_3x3_nhwc( 243 TENSOR4D(src, BUFFER), 244 TENSOR4D(dst, BUFFER), 245#if defined(HAS_BIAS) 246 VECTOR_DECLARATION(bias), 247#endif // defined(HAS_BIAS) 248 int dst_size, 249 const int SRC_HEIGHT, 250 const int DST_WIDTH, 251 const int DST_HEIGHT) 252{ 253 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM 254 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES 255 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 256 257#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 258 259 TILE(DATA_TYPE, 6, N0, in); 260 TILE(DATA_TYPE, 4, N0, out); 261 TILE(uint, 6, 1, src_indirect_y); 262 263 LOOP_UNROLLING(int, i, 0, 1, 6, 264 { 265 src_indirect_y[i].v = mout + i *SRC_HEIGHT; 266 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6); 267 }) 268 269 // Initialize the input tile 270 LOOP_UNROLLING(int, i, 0, 1, 6, 271 { 272 in[i].v = 0; 273 }) 274 275 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile 276 T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 277 278 // Compute out00, out01, out02 and out03 279 out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v; 280 out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v; 281 out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v; 282 out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v; 283 284#if defined(HAS_BIAS) 285 TILE(DATA_TYPE, 1, N0, b); 286 287 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 288 289 // c = c + bias[broadcasted] 290 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); 291#endif // HAS_BIAS 292 293 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; 294 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; 295 296 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 297 298 TILE(uint, 4, 1, dst_indirect_y); 299 300 // Calculate the destination indirect Y 301#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 302 LOOP_UNROLLING(int, yk, 0, 1, 4, 303 { 304 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); 305 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; 306 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 307 }) 308#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 309 LOOP_UNROLLING(int, xk, 0, 1, 4, 310 { 311 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); 312 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; 313 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 314 }) 315#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 316 317 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 318 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 319 320#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 321 322 // Calculate the indirect Y for the source tensor 323 TILE(DATA_TYPE, 36, N0, in); 324 TILE(DATA_TYPE, 4, N0, tmp); 325 TILE(uint, 36, 1, src_indirect_y); 326 327 LOOP_UNROLLING(int, i, 0, 1, 36, 328 { 329 src_indirect_y[i].v = mout + i *SRC_HEIGHT; 330 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); 331 }) 332 333 // Initialize the input tile 334 LOOP_UNROLLING(int, i, 0, 1, 36, 335 { 336 in[i].v = 0; 337 }) 338 339 // Load the values across the 36 channels to compose the 6x6 or 6x1 tile 340 T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 341 342 LOOP_UNROLLING(int, i, 0, 1, 6, 343 { 344 tmp[0].v = in[6 + i].v + in[12 + i].v; 345 tmp[1].v = in[6 + i].v - in[12 + i].v; 346 tmp[2].v = in[18 + i].v + in[24 + i].v; 347 tmp[3].v = in[18 + i].v - in[24 + i].v; 348 tmp[3].v = tmp[3].v + tmp[3].v; 349 in[i].v = in[i].v + tmp[0].v + tmp[2].v; 350 in[6 + i].v = tmp[3].v + tmp[1].v; 351 in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); 352 in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v; 353 }) 354 355 // Compute the output tile 356 TILE(DATA_TYPE, 16, N0, out); 357 358 LOOP_UNROLLING(int, i, 0, 1, 4, 359 { 360 tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v; 361 tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v; 362 tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v; 363 tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v; 364 tmp[3].v = tmp[3].v + tmp[3].v; 365 out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v; 366 out[4 * i + 1].v = tmp[3].v + tmp[1].v; 367 out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); 368 out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v; 369 }) 370 371#if defined(HAS_BIAS) 372 TILE(DATA_TYPE, 1, N0, b); 373 374 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 375 376 // c = c + bias[broadcasted] 377 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out); 378#endif // HAS_BIAS 379 380 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; 381 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; 382 383 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 384 385 TILE(uint, 16, 1, dst_indirect_y); 386 387 // Calculate the destination indirect Y 388 LOOP_UNROLLING(int, yk, 0, 1, 4, 389 { 390 LOOP_UNROLLING(int, xk, 0, 1, 4, 391 { 392 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); 393 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); 394 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; 395 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 396 }) 397 }) 398 399 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 400 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 401#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 402} 403#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC) 404 405#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC) 406/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC 407 * 408 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 409 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 410 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 411 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 412 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 413 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 414 * 415 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 416 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 417 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 418 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 419 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 420 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 421 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 422 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 423 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 424 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 425 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 426 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 427 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 428 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 429 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 430 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 431 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 432 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 433 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 434 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 435 * @param[in] SRC_HEIGHT The source tensor's height 436 * @param[in] DST_WIDTH The destination tensor's width 437 * @param[in] DST_HEIGHT The destination tensor's height 438 */ 439__kernel void winograd_output_transform_4x4_5x5_nhwc( 440 TENSOR4D(src, BUFFER), 441 TENSOR4D(dst, BUFFER), 442#if defined(HAS_BIAS) 443 VECTOR_DECLARATION(bias), 444#endif // defined(HAS_BIAS) 445 int dst_size, 446 const int SRC_HEIGHT, 447 const int DST_WIDTH, 448 const int DST_HEIGHT) 449{ 450 const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM 451 const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES 452 const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX 453 454#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 455 TILE(DATA_TYPE, 8, N0, in); 456 TILE(DATA_TYPE, 4, N0, out); 457 TILE(DATA_TYPE, 4, N0, tmp); 458 TILE(uint, 8, 1, src_indirect_y); 459 460 LOOP_UNROLLING(int, i, 0, 1, 8, 461 { 462 src_indirect_y[i].v = mout + i *SRC_HEIGHT; 463 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); 464 }) 465 466 // Initialize the input tile 467 LOOP_UNROLLING(int, i, 0, 1, 8, 468 { 469 in[i].v = 0; 470 }) 471 472 // "in" contains 1x8 or 8x1 tile here 473 T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 474 475 // A^T * in, and in this degenerate case out consists of 1 column/row 476 tmp[0].v = in[1].v - in[2].v; 477 tmp[1].v = 2.0f * (in[3].v - in[4].v); 478 tmp[2].v = 2.0f * (in[5].v + in[6].v); 479 tmp[3].v = in[3].v + in[4].v; 480 out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v; 481 out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v); 482 out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v; 483 out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v; 484 485#if defined(HAS_BIAS) 486 TILE(DATA_TYPE, 1, N0, b); 487 488 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 489 490 // c = c + bias[broadcasted] 491 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out); 492#endif // HAS_BIAS 493 494 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; 495 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; 496 497 T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 498 499 TILE(uint, 4, 1, dst_indirect_y); 500 501 // Calculate the destination indirect Y 502#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 503 LOOP_UNROLLING(int, yk, 0, 1, 4, 504 { 505 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); 506 dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; 507 dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 508 }) 509#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 510 LOOP_UNROLLING(int, xk, 0, 1, 4, 511 { 512 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); 513 dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; 514 dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 515 }) 516#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 517 518 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 519 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 520 521#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 522 // Calculate the indirect Y for the source tensor 523 TILE(DATA_TYPE, 64, N0, in); 524 TILE(DATA_TYPE, 6, N0, tmp); 525 TILE(uint, 64, 1, src_indirect_y); 526 527 LOOP_UNROLLING(int, i, 0, 1, 64, 528 { 529 src_indirect_y[i].v = mout + i *SRC_HEIGHT; 530 src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); 531 }) 532 533 // Initialize the input tile 534 LOOP_UNROLLING(int, i, 0, 1, 64, 535 { 536 in[i].v = 0; 537 }) 538 539 // "in" here is 8x8 tile 540 T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); 541 542 // A^T * in 543 LOOP_UNROLLING(int, i, 0, 1, 8, 544 { 545 tmp[0].v = in[8 + i].v + in[16 + i].v; 546 tmp[1].v = in[8 + i].v - in[16 + i].v; 547 tmp[2].v = in[24 + i].v + in[32 + i].v; 548 tmp[3].v = in[24 + i].v - in[32 + i].v; 549 tmp[3].v = tmp[3].v + tmp[3].v; 550 tmp[4].v = in[40 + i].v + in[48 + i].v; 551 tmp[4].v = tmp[4].v + tmp[4].v; 552 tmp[5].v = in[40 + i].v - in[48 + i].v; 553 554 // 4x8 matrix as a result 555 in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); 556 in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); 557 in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v); 558 in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v; 559 }) 560 561 // Compute the output tile 562 TILE(DATA_TYPE, 16, N0, out); 563 564 // in * A, with in = A^T * in as above 565 LOOP_UNROLLING(int, i, 0, 1, 4, 566 { 567 tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v; 568 tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v; 569 tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v; 570 tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v; 571 tmp[3].v = tmp[3].v + tmp[3].v; 572 tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v; 573 tmp[4].v = tmp[4].v + tmp[4].v; 574 tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v; 575 576 // 4x4 tile 577 out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); 578 out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); 579 out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v; 580 out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v; 581 }) 582 583#if defined(HAS_BIAS) 584 TILE(DATA_TYPE, 1, N0, b); 585 586 T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); 587 588 // c = c + bias[broadcasted] 589 T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out); 590#endif // HAS_BIAS 591 592 int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; 593 int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; 594 595 T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); 596 597 TILE(uint, 16, 1, dst_indirect_y); 598 599 // Calculate the destination indirect Y 600 LOOP_UNROLLING(int, yk, 0, 1, 4, 601 { 602 LOOP_UNROLLING(int, xk, 0, 1, 4, 603 { 604 int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); 605 int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); 606 dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; 607 dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); 608 }) 609 }) 610 611 // Store the tile in reverse order so the invalid values are overwritten with the valid ones 612 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); 613#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 614} 615#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC) 616#endif // defined(VEC_SIZE) && VEC_SIZE == 4 617 618#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) 619#if defined(VEC_SIZE) && VEC_SIZE == 2 620#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) 621/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC 622 * 623 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 624 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 625 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 626 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 627 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 628 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 629 * 630 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 631 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 632 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 633 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 634 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 635 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 636 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 637 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 638 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 639 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 640 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 641 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 642 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 643 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 644 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 645 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 646 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 647 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 648 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 649 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 650 * @param[in] SRC_HEIGHT The source tensor's height 651 * @param[in] DST_WIDTH The destination tensor's width 652 * @param[in] DST_HEIGHT The destination tensor's height 653 */ 654__kernel void winograd_output_transform_2x1_7x1_nhwc( 655 TENSOR4D_DECLARATION(src), 656 TENSOR4D_DECLARATION(dst), 657#if defined(HAS_BIAS) 658 VECTOR_DECLARATION(bias), 659#endif // defined(HAS_BIAS) 660 int dst_size, 661 const int SRC_HEIGHT, 662 const int DST_WIDTH, 663 const int DST_HEIGHT) 664{ 665 winograd_output_transform_2x2_7x7_nhwc(src_ptr, 666 src_stride_x, 667 src_step_x, 668 src_stride_y, 669 src_step_y, 670 src_stride_z, 671 src_step_z, 672 src_stride_w, 673 src_step_w, 674 src_offset_first_element_in_bytes, 675 dst_ptr, 676 dst_stride_x, 677 dst_step_x, 678 dst_stride_y, 679 dst_step_y, 680 dst_stride_z, 681 dst_step_z, 682 dst_stride_w, 683 dst_step_w, 684 dst_offset_first_element_in_bytes, 685#if defined(HAS_BIAS) 686 bias_ptr, 687 bias_stride_x, 688 bias_step_x, 689 bias_offset_first_element_in_bytes, 690#endif // defined(HAS_BIAS) 691 dst_size, 692 SRC_HEIGHT, 693 DST_WIDTH, 694 DST_HEIGHT); 695} 696#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) 697#endif // defined(VEC_SIZE) && VEC_SIZE == 2 698 699#if defined(VEC_SIZE) && VEC_SIZE == 4 700#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) 701/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC 702 * 703 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 704 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 705 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 706 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 707 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 708 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 709 * 710 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 711 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 712 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 713 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 714 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 715 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 716 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 717 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 718 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 719 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 720 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 721 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 722 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 723 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 724 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 725 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 726 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 727 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 728 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 729 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 730 * @param[in] SRC_HEIGHT The source tensor's height 731 * @param[in] DST_WIDTH The destination tensor's width 732 * @param[in] DST_HEIGHT The destination tensor's height 733 */ 734__kernel void winograd_output_transform_4x1_3x1_nhwc( 735 TENSOR4D_DECLARATION(src), 736 TENSOR4D_DECLARATION(dst), 737#if defined(HAS_BIAS) 738 VECTOR_DECLARATION(bias), 739#endif // defined(HAS_BIAS) 740 int dst_size, 741 const int SRC_HEIGHT, 742 const int DST_WIDTH, 743 const int DST_HEIGHT) 744{ 745 winograd_output_transform_4x4_3x3_nhwc(src_ptr, 746 src_stride_x, 747 src_step_x, 748 src_stride_y, 749 src_step_y, 750 src_stride_z, 751 src_step_z, 752 src_stride_w, 753 src_step_w, 754 src_offset_first_element_in_bytes, 755 dst_ptr, 756 dst_stride_x, 757 dst_step_x, 758 dst_stride_y, 759 dst_step_y, 760 dst_stride_z, 761 dst_step_z, 762 dst_stride_w, 763 dst_step_w, 764 dst_offset_first_element_in_bytes, 765#if defined(HAS_BIAS) 766 bias_ptr, 767 bias_stride_x, 768 bias_step_x, 769 bias_offset_first_element_in_bytes, 770#endif // defined(HAS_BIAS) 771 dst_size, 772 SRC_HEIGHT, 773 DST_WIDTH, 774 DST_HEIGHT); 775} 776#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) 777 778#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) 779/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC 780 * 781 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 782 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 783 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 784 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 785 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time 786 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 787 * 788 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 789 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 790 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 791 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 792 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 793 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 794 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 795 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 796 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 797 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 798 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 799 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 800 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 801 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 802 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 803 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 804 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 805 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 806 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 807 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 808 * @param[in] SRC_HEIGHT The source tensor's height 809 * @param[in] DST_WIDTH The destination tensor's width 810 * @param[in] DST_HEIGHT The destination tensor's height 811 */ 812__kernel void winograd_output_transform_4x1_5x1_nhwc( 813 TENSOR4D_DECLARATION(src), 814 TENSOR4D_DECLARATION(dst), 815#if defined(HAS_BIAS) 816 VECTOR_DECLARATION(bias), 817#endif // defined(HAS_BIAS) 818 int dst_size, 819 const int SRC_HEIGHT, 820 const int DST_WIDTH, 821 const int DST_HEIGHT) 822{ 823 winograd_output_transform_4x4_5x5_nhwc(src_ptr, 824 src_stride_x, 825 src_step_x, 826 src_stride_y, 827 src_step_y, 828 src_stride_z, 829 src_step_z, 830 src_stride_w, 831 src_step_w, 832 src_offset_first_element_in_bytes, 833 dst_ptr, 834 dst_stride_x, 835 dst_step_x, 836 dst_stride_y, 837 dst_step_y, 838 dst_stride_z, 839 dst_step_z, 840 dst_stride_w, 841 dst_step_w, 842 dst_offset_first_element_in_bytes, 843#if defined(HAS_BIAS) 844 bias_ptr, 845 bias_stride_x, 846 bias_step_x, 847 bias_offset_first_element_in_bytes, 848#endif // defined(HAS_BIAS) 849 dst_size, 850 SRC_HEIGHT, 851 DST_WIDTH, 852 DST_HEIGHT); 853} 854#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) 855#endif // defined(VEC_SIZE) && VEC_SIZE == 4 856#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) 857 858#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 859#if defined(VEC_SIZE) && VEC_SIZE == 2 860#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC) 861/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC 862 * 863 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 864 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 865 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 866 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 867 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 868 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 869 * 870 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 871 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 872 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 873 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 874 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 875 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 876 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 877 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 878 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 879 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 880 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 881 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 882 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 883 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 884 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 885 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 886 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 887 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 888 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 889 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 890 * @param[in] SRC_HEIGHT The source tensor's height 891 * @param[in] DST_WIDTH The destination tensor's width 892 * @param[in] DST_HEIGHT The destination tensor's height 893 */ 894__kernel void winograd_output_transform_1x2_1x7_nhwc( 895 TENSOR4D_DECLARATION(src), 896 TENSOR4D_DECLARATION(dst), 897#if defined(HAS_BIAS) 898 VECTOR_DECLARATION(bias), 899#endif // defined(HAS_BIAS) 900 int dst_size, 901 const int SRC_HEIGHT, 902 const int DST_WIDTH, 903 const int DST_HEIGHT) 904{ 905 winograd_output_transform_2x2_7x7_nhwc(src_ptr, 906 src_stride_x, 907 src_step_x, 908 src_stride_y, 909 src_step_y, 910 src_stride_z, 911 src_step_z, 912 src_stride_w, 913 src_step_w, 914 src_offset_first_element_in_bytes, 915 dst_ptr, 916 dst_stride_x, 917 dst_step_x, 918 dst_stride_y, 919 dst_step_y, 920 dst_stride_z, 921 dst_step_z, 922 dst_stride_w, 923 dst_step_w, 924 dst_offset_first_element_in_bytes, 925#if defined(HAS_BIAS) 926 bias_ptr, 927 bias_stride_x, 928 bias_step_x, 929 bias_offset_first_element_in_bytes, 930#endif // defined(HAS_BIAS) 931 dst_size, 932 SRC_HEIGHT, 933 DST_WIDTH, 934 DST_HEIGHT); 935} 936#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC) 937#endif // defined(VEC_SIZE) && VEC_SIZE == 2 938 939#if defined(VEC_SIZE) && VEC_SIZE == 4 940#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC) 941/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC 942 * 943 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 944 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 945 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 946 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 947 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 948 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 949 * 950 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 951 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 952 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 953 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 954 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 955 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 956 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 957 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 958 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 959 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 960 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 961 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 962 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 963 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 964 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 965 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 966 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 967 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 968 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 969 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 970 * @param[in] SRC_HEIGHT The source tensor's height 971 * @param[in] DST_WIDTH The destination tensor's width 972 * @param[in] DST_HEIGHT The destination tensor's height 973 */ 974__kernel void winograd_output_transform_1x4_1x3_nhwc( 975 TENSOR4D_DECLARATION(src), 976 TENSOR4D_DECLARATION(dst), 977#if defined(HAS_BIAS) 978 VECTOR_DECLARATION(bias), 979#endif // defined(HAS_BIAS) 980 int dst_size, 981 const int SRC_HEIGHT, 982 const int DST_WIDTH, 983 const int DST_HEIGHT) 984{ 985 winograd_output_transform_4x4_3x3_nhwc(src_ptr, 986 src_stride_x, 987 src_step_x, 988 src_stride_y, 989 src_step_y, 990 src_stride_z, 991 src_step_z, 992 src_stride_w, 993 src_step_w, 994 src_offset_first_element_in_bytes, 995 dst_ptr, 996 dst_stride_x, 997 dst_step_x, 998 dst_stride_y, 999 dst_step_y, 1000 dst_stride_z, 1001 dst_step_z, 1002 dst_stride_w, 1003 dst_step_w, 1004 dst_offset_first_element_in_bytes, 1005#if defined(HAS_BIAS) 1006 bias_ptr, 1007 bias_stride_x, 1008 bias_step_x, 1009 bias_offset_first_element_in_bytes, 1010#endif // defined(HAS_BIAS) 1011 dst_size, 1012 SRC_HEIGHT, 1013 DST_WIDTH, 1014 DST_HEIGHT); 1015} 1016#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC) 1017 1018#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC) 1019/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC 1020 * 1021 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 1022 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 1023 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 1024 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 1025 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time 1026 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. 1027 * 1028 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 1029 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) 1030 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) 1031 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 1032 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) 1033 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 1034 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) 1035 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 1036 * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) 1037 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 1038 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr 1039 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) 1040 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) 1041 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 1042 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) 1043 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) 1044 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) 1045 * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) 1046 * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) 1047 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 1048 * @param[in] SRC_HEIGHT The source tensor's height 1049 * @param[in] DST_WIDTH The destination tensor's width 1050 * @param[in] DST_HEIGHT The destination tensor's height 1051 */ 1052__kernel void winograd_output_transform_1x4_1x5_nhwc( 1053 TENSOR4D_DECLARATION(src), 1054 TENSOR4D_DECLARATION(dst), 1055#if defined(HAS_BIAS) 1056 VECTOR_DECLARATION(bias), 1057#endif // defined(HAS_BIAS) 1058 int dst_size, 1059 const int SRC_HEIGHT, 1060 const int DST_WIDTH, 1061 const int DST_HEIGHT) 1062{ 1063 winograd_output_transform_4x4_5x5_nhwc(src_ptr, 1064 src_stride_x, 1065 src_step_x, 1066 src_stride_y, 1067 src_step_y, 1068 src_stride_z, 1069 src_step_z, 1070 src_stride_w, 1071 src_step_w, 1072 src_offset_first_element_in_bytes, 1073 dst_ptr, 1074 dst_stride_x, 1075 dst_step_x, 1076 dst_stride_y, 1077 dst_step_y, 1078 dst_stride_z, 1079 dst_step_z, 1080 dst_stride_w, 1081 dst_step_w, 1082 dst_offset_first_element_in_bytes, 1083#if defined(HAS_BIAS) 1084 bias_ptr, 1085 bias_stride_x, 1086 bias_step_x, 1087 bias_offset_first_element_in_bytes, 1088#endif // defined(HAS_BIAS) 1089 dst_size, 1090 SRC_HEIGHT, 1091 DST_WIDTH, 1092 DST_HEIGHT); 1093} 1094#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC) 1095#endif // defined(VEC_SIZE) && VEC_SIZE == 4 1096#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) 1097#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)