xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2018-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "activation_float_helpers.h"
25#include "helpers.h"
26#include "tile_helpers.h"
27
28#if defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
29#if defined(VEC_SIZE) && VEC_SIZE == 2
30#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
31/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
32 *
33 * @note  must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
34 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
35 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
36 * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
37 * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
38 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
39 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
40 *
41 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
42 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
43 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
44 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
45 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
46 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
47 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
48 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
49 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
50 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
51 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
52 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
53 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
54 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
55 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
56 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
57 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
58 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
59 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
60 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
61 * @param[in]  _ISRC_HEIGHT                      The source tensor's height
62 * @param[in]  _IDST_WIDTH                       The destination tensor's width
63 * @param[in]  _IDST_HEIGHT                      The destination tensor's height
64 */
65__kernel void winograd_output_transform_2x2_7x7_nhwc(
66    TENSOR4D(src, BUFFER),
67    TENSOR4D(dst, BUFFER),
68#if defined(HAS_BIAS)
69    VECTOR_DECLARATION(bias),
70#endif // defined(HAS_BIAS)
71    int       dst_size,
72    const int _ISRC_HEIGHT,
73    const int _IDST_WIDTH,
74    const int _IDST_HEIGHT)
75{
76    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
77    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
78    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
79
80    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
81    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
82
83#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
84    TILE(DATA_TYPE, 8, N0, in);
85    TILE(DATA_TYPE, 2, N0, out);
86    TILE(uint, 8, 1, src_indirect_y);
87
88    // Calculate the indirect Y for the source tensor
89    LOOP_UNROLLING(int, i, 0, 1, 8,
90    {
91        src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
92        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
93    })
94
95    // Initialize the input tile
96    LOOP_UNROLLING(int, i, 0, 1, 8,
97    {
98        in[i].v = 0;
99    })
100
101    // Load the values across the 8 channels to compose the 8x1 tile
102    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
103
104    // Compute out0 and out01
105    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
106    out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
107
108#if defined(HAS_BIAS)
109    // Add bias
110    TILE(DATA_TYPE, 1, N0, b);
111
112    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
113
114    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 2, N0, out, b, out);
115#endif // defined(HAS_BIAS)
116
117    T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
118
119    TILE(uint, 2, 1, dst_indirect_y);
120
121#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
122    LOOP_UNROLLING(int, yk, 0, 1, 2,
123    {
124        int y_c              = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
125        dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
126    })
127#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
128    LOOP_UNROLLING(int, xk, 0, 1, 2,
129    {
130        int x_c              = min(x_out + xk, ((int)_IDST_WIDTH - 1));
131        dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
132    })
133#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
134
135    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
136    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
137
138#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
139
140    TILE(DATA_TYPE, 64, N0, in);
141    TILE(DATA_TYPE, 4, N0, out);
142    TILE(DATA_TYPE, 16, N0, tmp);
143    TILE(uint, 64, 1, src_indirect_y);
144
145    // Calculate the indirect Y for the source tensor
146    LOOP_UNROLLING(int, i, 0, 1, 64,
147    {
148        src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
149        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
150    })
151
152    // Initialize the input tile
153    LOOP_UNROLLING(int, i, 0, 1, 64,
154    {
155        in[i].v = 0;
156    })
157
158    // Load the values across the 64 channels to compose the 8x8 tile
159    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
160
161    LOOP_UNROLLING(int, i, 0, 1, 8,
162    {
163        tmp[i * 2].v     = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
164        tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
165    })
166
167    // Compute the 2x2 output tile
168    LOOP_UNROLLING(int, i, 0, 1, 2,
169    {
170        out[i * 2].v     = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
171        out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
172    })
173
174#if defined(HAS_BIAS)
175    // Add bias
176    TILE(DATA_TYPE, 1, N0, b);
177
178    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
179
180    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
181#endif // defined(HAS_BIAS)
182
183    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
184
185    TILE(uint, 4, 1, dst_indirect_y);
186
187    // Calculate the destination indirect Y
188    LOOP_UNROLLING(int, yk, 0, 1, 2,
189    {
190        LOOP_UNROLLING(int, xk, 0, 1, 2,
191        {
192            int x_c                       = min(x_out + xk, ((int)_IDST_WIDTH - 1));
193            int y_c                       = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
194            dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
195            dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
196        })
197    })
198
199    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
200    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
201#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
202}
203#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X2_7X7_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
204#endif // defined(VEC_SIZE) && VEC_SIZE == 2
205
206#if defined(VEC_SIZE) && VEC_SIZE == 4
207#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
208/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
209 *
210 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
211 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
212 * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
213 * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
214 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
215 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
216 *
217 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
218 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
219 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
220 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
221 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
222 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
223 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
224 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
225 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
226 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
227 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
228 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
229 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
230 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
231 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
232 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
233 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
234 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
235 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
236 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
237 * @param[in]  dst_size                          Size of the destination tensor, minus the last padding
238 * @param[in]  SRC_HEIGHT                        The source tensor's height
239 * @param[in]  DST_WIDTH                         The destination tensor's width
240 * @param[in]  DST_HEIGHT                        The destination tensor's height
241 */
242__kernel void winograd_output_transform_4x4_3x3_nhwc(
243    TENSOR4D(src, BUFFER),
244    TENSOR4D(dst, BUFFER),
245#if defined(HAS_BIAS)
246    VECTOR_DECLARATION(bias),
247#endif // defined(HAS_BIAS)
248    int       dst_size,
249    const int SRC_HEIGHT,
250    const int DST_WIDTH,
251    const int DST_HEIGHT)
252{
253    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
254    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
255    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
256
257#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
258
259    TILE(DATA_TYPE, 6, N0, in);
260    TILE(DATA_TYPE, 4, N0, out);
261    TILE(uint, 6, 1, src_indirect_y);
262
263    LOOP_UNROLLING(int, i, 0, 1, 6,
264    {
265        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
266        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
267    })
268
269    // Initialize the input tile
270    LOOP_UNROLLING(int, i, 0, 1, 6,
271    {
272        in[i].v = 0;
273    })
274
275    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
276    T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
277
278    // Compute out00, out01, out02 and out03
279    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
280    out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
281    out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
282    out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
283
284#if defined(HAS_BIAS)
285    TILE(DATA_TYPE, 1, N0, b);
286
287    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
288
289    // c = c + bias[broadcasted]
290    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
291#endif // HAS_BIAS
292
293    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
294    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
295
296    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
297
298    TILE(uint, 4, 1, dst_indirect_y);
299
300    // Calculate the destination indirect Y
301#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
302    LOOP_UNROLLING(int, yk, 0, 1, 4,
303    {
304        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
305        dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
306        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
307    })
308#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
309    LOOP_UNROLLING(int, xk, 0, 1, 4,
310    {
311        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
312        dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
313        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
314    })
315#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
316
317    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
318    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
319
320#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
321
322    // Calculate the indirect Y for the source tensor
323    TILE(DATA_TYPE, 36, N0, in);
324    TILE(DATA_TYPE, 4, N0, tmp);
325    TILE(uint, 36, 1, src_indirect_y);
326
327    LOOP_UNROLLING(int, i, 0, 1, 36,
328    {
329        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
330        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
331    })
332
333    // Initialize the input tile
334    LOOP_UNROLLING(int, i, 0, 1, 36,
335    {
336        in[i].v = 0;
337    })
338
339    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
340    T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
341
342    LOOP_UNROLLING(int, i, 0, 1, 6,
343    {
344        tmp[0].v     = in[6 + i].v + in[12 + i].v;
345        tmp[1].v     = in[6 + i].v - in[12 + i].v;
346        tmp[2].v     = in[18 + i].v + in[24 + i].v;
347        tmp[3].v     = in[18 + i].v - in[24 + i].v;
348        tmp[3].v     = tmp[3].v + tmp[3].v;
349        in[i].v      = in[i].v + tmp[0].v + tmp[2].v;
350        in[6 + i].v  = tmp[3].v + tmp[1].v;
351        in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
352        in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
353    })
354
355    // Compute the output tile
356    TILE(DATA_TYPE, 16, N0, out);
357
358    LOOP_UNROLLING(int, i, 0, 1, 4,
359    {
360        tmp[0].v         = in[6 * i + 1].v + in[6 * i + 2].v;
361        tmp[1].v         = in[6 * i + 1].v - in[6 * i + 2].v;
362        tmp[2].v         = in[6 * i + 3].v + in[6 * i + 4].v;
363        tmp[3].v         = in[6 * i + 3].v - in[6 * i + 4].v;
364        tmp[3].v         = tmp[3].v + tmp[3].v;
365        out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
366        out[4 * i + 1].v = tmp[3].v + tmp[1].v;
367        out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
368        out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
369    })
370
371#if defined(HAS_BIAS)
372    TILE(DATA_TYPE, 1, N0, b);
373
374    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
375
376    // c = c + bias[broadcasted]
377    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
378#endif // HAS_BIAS
379
380    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
381    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
382
383    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
384
385    TILE(uint, 16, 1, dst_indirect_y);
386
387    // Calculate the destination indirect Y
388    LOOP_UNROLLING(int, yk, 0, 1, 4,
389    {
390        LOOP_UNROLLING(int, xk, 0, 1, 4,
391        {
392            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
393            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
394            dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
395            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
396        })
397    })
398
399    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
400    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
401#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
402}
403#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_3X3_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
404
405#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
406/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
407 *
408 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
409 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
410 * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
411 * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
412 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
413 * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
414 *
415 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
416 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
417 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
418 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
419 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
420 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
421 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
422 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
423 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
424 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
425 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
426 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
427 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
428 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
429 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
430 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
431 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
432 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
433 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
434 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
435 * @param[in]  SRC_HEIGHT                        The source tensor's height
436 * @param[in]  DST_WIDTH                         The destination tensor's width
437 * @param[in]  DST_HEIGHT                        The destination tensor's height
438 */
439__kernel void winograd_output_transform_4x4_5x5_nhwc(
440    TENSOR4D(src, BUFFER),
441    TENSOR4D(dst, BUFFER),
442#if defined(HAS_BIAS)
443    VECTOR_DECLARATION(bias),
444#endif // defined(HAS_BIAS)
445    int       dst_size,
446    const int SRC_HEIGHT,
447    const int DST_WIDTH,
448    const int DST_HEIGHT)
449{
450    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
451    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
452    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
453
454#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
455    TILE(DATA_TYPE, 8, N0, in);
456    TILE(DATA_TYPE, 4, N0, out);
457    TILE(DATA_TYPE, 4, N0, tmp);
458    TILE(uint, 8, 1, src_indirect_y);
459
460    LOOP_UNROLLING(int, i, 0, 1, 8,
461    {
462        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
463        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
464    })
465
466    // Initialize the input tile
467    LOOP_UNROLLING(int, i, 0, 1, 8,
468    {
469        in[i].v = 0;
470    })
471
472    // "in" contains 1x8 or 8x1 tile here
473    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
474
475    // A^T * in, and in this degenerate case out consists of 1 column/row
476    tmp[0].v = in[1].v - in[2].v;
477    tmp[1].v = 2.0f * (in[3].v - in[4].v);
478    tmp[2].v = 2.0f * (in[5].v + in[6].v);
479    tmp[3].v = in[3].v + in[4].v;
480    out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
481    out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
482    out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
483    out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
484
485#if defined(HAS_BIAS)
486    TILE(DATA_TYPE, 1, N0, b);
487
488    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
489
490    // c = c + bias[broadcasted]
491    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 4, N0, out, b, out);
492#endif // HAS_BIAS
493
494    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
495    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
496
497    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
498
499    TILE(uint, 4, 1, dst_indirect_y);
500
501    // Calculate the destination indirect Y
502#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
503    LOOP_UNROLLING(int, yk, 0, 1, 4,
504    {
505        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
506        dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
507        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
508    })
509#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
510    LOOP_UNROLLING(int, xk, 0, 1, 4,
511    {
512        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
513        dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
514        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
515    })
516#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
517
518    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
519    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
520
521#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
522    // Calculate the indirect Y for the source tensor
523    TILE(DATA_TYPE, 64, N0, in);
524    TILE(DATA_TYPE, 6, N0, tmp);
525    TILE(uint, 64, 1, src_indirect_y);
526
527    LOOP_UNROLLING(int, i, 0, 1, 64,
528    {
529        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
530        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
531    })
532
533    // Initialize the input tile
534    LOOP_UNROLLING(int, i, 0, 1, 64,
535    {
536        in[i].v = 0;
537    })
538
539    // "in" here is 8x8 tile
540    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
541
542    // A^T * in
543    LOOP_UNROLLING(int, i, 0, 1, 8,
544    {
545        tmp[0].v = in[8 + i].v + in[16 + i].v;
546        tmp[1].v = in[8 + i].v - in[16 + i].v;
547        tmp[2].v = in[24 + i].v + in[32 + i].v;
548        tmp[3].v = in[24 + i].v - in[32 + i].v;
549        tmp[3].v = tmp[3].v + tmp[3].v;
550        tmp[4].v = in[40 + i].v + in[48 + i].v;
551        tmp[4].v = tmp[4].v + tmp[4].v;
552        tmp[5].v = in[40 + i].v - in[48 + i].v;
553
554        // 4x8 matrix as a result
555        in[i].v      = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
556        in[8 + i].v  = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
557        in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
558        in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
559    })
560
561    // Compute the output tile
562    TILE(DATA_TYPE, 16, N0, out);
563
564    // in * A, with in = A^T * in as above
565    LOOP_UNROLLING(int, i, 0, 1, 4,
566    {
567        tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
568        tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
569        tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
570        tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
571        tmp[3].v = tmp[3].v + tmp[3].v;
572        tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
573        tmp[4].v = tmp[4].v + tmp[4].v;
574        tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
575
576        // 4x4 tile
577        out[4 * i].v     = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
578        out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
579        out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
580        out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
581    })
582
583#if defined(HAS_BIAS)
584    TILE(DATA_TYPE, 1, N0, b);
585
586    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
587
588    // c = c + bias[broadcasted]
589    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, 16, N0, out, b, out);
590#endif // HAS_BIAS
591
592    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
593    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
594
595    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
596
597    TILE(uint, 16, 1, dst_indirect_y);
598
599    // Calculate the destination indirect Y
600    LOOP_UNROLLING(int, yk, 0, 1, 4,
601    {
602        LOOP_UNROLLING(int, xk, 0, 1, 4,
603        {
604            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
605            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
606            dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
607            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
608        })
609    })
610
611    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
612    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
613#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
614}
615#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X4_5X5_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC) || defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
616#endif // defined(VEC_SIZE) && VEC_SIZE == 4
617
618#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
619#if defined(VEC_SIZE) && VEC_SIZE == 2
620#if defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
621/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
622 *
623 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
624 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
625 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
626 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
627 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
628 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
629 *
630 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
631 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
632 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
633 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
634 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
635 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
636 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
637 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
638 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
639 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
640 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
641 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
642 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
643 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
644 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
645 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
646 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
647 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
648 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
649 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
650 * @param[in]  SRC_HEIGHT                        The source tensor's height
651 * @param[in]  DST_WIDTH                         The destination tensor's width
652 * @param[in]  DST_HEIGHT                        The destination tensor's height
653 */
654__kernel void winograd_output_transform_2x1_7x1_nhwc(
655    TENSOR4D_DECLARATION(src),
656    TENSOR4D_DECLARATION(dst),
657#if defined(HAS_BIAS)
658    VECTOR_DECLARATION(bias),
659#endif // defined(HAS_BIAS)
660    int       dst_size,
661    const int SRC_HEIGHT,
662    const int DST_WIDTH,
663    const int DST_HEIGHT)
664{
665    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
666                                           src_stride_x,
667                                           src_step_x,
668                                           src_stride_y,
669                                           src_step_y,
670                                           src_stride_z,
671                                           src_step_z,
672                                           src_stride_w,
673                                           src_step_w,
674                                           src_offset_first_element_in_bytes,
675                                           dst_ptr,
676                                           dst_stride_x,
677                                           dst_step_x,
678                                           dst_stride_y,
679                                           dst_step_y,
680                                           dst_stride_z,
681                                           dst_step_z,
682                                           dst_stride_w,
683                                           dst_step_w,
684                                           dst_offset_first_element_in_bytes,
685#if defined(HAS_BIAS)
686                                           bias_ptr,
687                                           bias_stride_x,
688                                           bias_step_x,
689                                           bias_offset_first_element_in_bytes,
690#endif // defined(HAS_BIAS)
691                                           dst_size,
692                                           SRC_HEIGHT,
693                                           DST_WIDTH,
694                                           DST_HEIGHT);
695}
696#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_2X1_7X1_NHWC)
697#endif // defined(VEC_SIZE) && VEC_SIZE == 2
698
699#if defined(VEC_SIZE) && VEC_SIZE == 4
700#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
701/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
702 *
703 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
704 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
705 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
706 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
707 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
708 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
709 *
710 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
711 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
712 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
713 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
714 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
715 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
716 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
717 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
718 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
719 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
720 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
721 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
722 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
723 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
724 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
725 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
726 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
727 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
728 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
729 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
730 * @param[in]  SRC_HEIGHT                        The source tensor's height
731 * @param[in]  DST_WIDTH                         The destination tensor's width
732 * @param[in]  DST_HEIGHT                        The destination tensor's height
733 */
734__kernel void winograd_output_transform_4x1_3x1_nhwc(
735    TENSOR4D_DECLARATION(src),
736    TENSOR4D_DECLARATION(dst),
737#if defined(HAS_BIAS)
738    VECTOR_DECLARATION(bias),
739#endif // defined(HAS_BIAS)
740    int       dst_size,
741    const int SRC_HEIGHT,
742    const int DST_WIDTH,
743    const int DST_HEIGHT)
744{
745    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
746                                           src_stride_x,
747                                           src_step_x,
748                                           src_stride_y,
749                                           src_step_y,
750                                           src_stride_z,
751                                           src_step_z,
752                                           src_stride_w,
753                                           src_step_w,
754                                           src_offset_first_element_in_bytes,
755                                           dst_ptr,
756                                           dst_stride_x,
757                                           dst_step_x,
758                                           dst_stride_y,
759                                           dst_step_y,
760                                           dst_stride_z,
761                                           dst_step_z,
762                                           dst_stride_w,
763                                           dst_step_w,
764                                           dst_offset_first_element_in_bytes,
765#if defined(HAS_BIAS)
766                                           bias_ptr,
767                                           bias_stride_x,
768                                           bias_step_x,
769                                           bias_offset_first_element_in_bytes,
770#endif // defined(HAS_BIAS)
771                                           dst_size,
772                                           SRC_HEIGHT,
773                                           DST_WIDTH,
774                                           DST_HEIGHT);
775}
776#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_3X1_NHWC)
777
778#if defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
779/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
780 *
781 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
782 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
783 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
784 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
785 * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
786 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
787 *
788 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
789 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
790 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
791 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
792 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
793 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
794 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
795 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
796 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
797 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
798 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
799 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
800 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
801 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
802 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
803 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
804 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
805 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
806 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
807 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
808 * @param[in]  SRC_HEIGHT                        The source tensor's height
809 * @param[in]  DST_WIDTH                         The destination tensor's width
810 * @param[in]  DST_HEIGHT                        The destination tensor's height
811 */
812__kernel void winograd_output_transform_4x1_5x1_nhwc(
813    TENSOR4D_DECLARATION(src),
814    TENSOR4D_DECLARATION(dst),
815#if defined(HAS_BIAS)
816    VECTOR_DECLARATION(bias),
817#endif // defined(HAS_BIAS)
818    int       dst_size,
819    const int SRC_HEIGHT,
820    const int DST_WIDTH,
821    const int DST_HEIGHT)
822{
823    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
824                                           src_stride_x,
825                                           src_step_x,
826                                           src_stride_y,
827                                           src_step_y,
828                                           src_stride_z,
829                                           src_step_z,
830                                           src_stride_w,
831                                           src_step_w,
832                                           src_offset_first_element_in_bytes,
833                                           dst_ptr,
834                                           dst_stride_x,
835                                           dst_step_x,
836                                           dst_stride_y,
837                                           dst_step_y,
838                                           dst_stride_z,
839                                           dst_step_z,
840                                           dst_stride_w,
841                                           dst_step_w,
842                                           dst_offset_first_element_in_bytes,
843#if defined(HAS_BIAS)
844                                           bias_ptr,
845                                           bias_stride_x,
846                                           bias_step_x,
847                                           bias_offset_first_element_in_bytes,
848#endif // defined(HAS_BIAS)
849                                           dst_size,
850                                           SRC_HEIGHT,
851                                           DST_WIDTH,
852                                           DST_HEIGHT);
853}
854#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_4X1_5X1_NHWC)
855#endif // defined(VEC_SIZE) && VEC_SIZE == 4
856#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
857
858#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
859#if defined(VEC_SIZE) && VEC_SIZE == 2
860#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
861/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
862 *
863 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
864 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
865 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
866 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
867 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
868 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
869 *
870 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
871 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
872 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
873 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
874 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
875 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
876 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
877 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
878 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
879 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
880 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
881 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
882 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
883 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
884 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
885 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
886 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
887 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
888 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
889 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
890 * @param[in]  SRC_HEIGHT                        The source tensor's height
891 * @param[in]  DST_WIDTH                         The destination tensor's width
892 * @param[in]  DST_HEIGHT                        The destination tensor's height
893 */
894__kernel void winograd_output_transform_1x2_1x7_nhwc(
895    TENSOR4D_DECLARATION(src),
896    TENSOR4D_DECLARATION(dst),
897#if defined(HAS_BIAS)
898    VECTOR_DECLARATION(bias),
899#endif // defined(HAS_BIAS)
900    int       dst_size,
901    const int SRC_HEIGHT,
902    const int DST_WIDTH,
903    const int DST_HEIGHT)
904{
905    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
906                                           src_stride_x,
907                                           src_step_x,
908                                           src_stride_y,
909                                           src_step_y,
910                                           src_stride_z,
911                                           src_step_z,
912                                           src_stride_w,
913                                           src_step_w,
914                                           src_offset_first_element_in_bytes,
915                                           dst_ptr,
916                                           dst_stride_x,
917                                           dst_step_x,
918                                           dst_stride_y,
919                                           dst_step_y,
920                                           dst_stride_z,
921                                           dst_step_z,
922                                           dst_stride_w,
923                                           dst_step_w,
924                                           dst_offset_first_element_in_bytes,
925#if defined(HAS_BIAS)
926                                           bias_ptr,
927                                           bias_stride_x,
928                                           bias_step_x,
929                                           bias_offset_first_element_in_bytes,
930#endif // defined(HAS_BIAS)
931                                           dst_size,
932                                           SRC_HEIGHT,
933                                           DST_WIDTH,
934                                           DST_HEIGHT);
935}
936#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X2_1X7_NHWC)
937#endif // defined(VEC_SIZE) && VEC_SIZE == 2
938
939#if defined(VEC_SIZE) && VEC_SIZE == 4
940#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
941/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
942 *
943 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
944 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
945 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
946 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
947 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
948 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
949 *
950 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
951 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
952 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
953 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
954 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
955 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
956 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
957 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
958 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
959 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
960 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
961 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
962 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
963 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
964 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
965 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
966 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
967 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
968 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
969 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
970 * @param[in]  SRC_HEIGHT                        The source tensor's height
971 * @param[in]  DST_WIDTH                         The destination tensor's width
972 * @param[in]  DST_HEIGHT                        The destination tensor's height
973 */
974__kernel void winograd_output_transform_1x4_1x3_nhwc(
975    TENSOR4D_DECLARATION(src),
976    TENSOR4D_DECLARATION(dst),
977#if defined(HAS_BIAS)
978    VECTOR_DECLARATION(bias),
979#endif // defined(HAS_BIAS)
980    int       dst_size,
981    const int SRC_HEIGHT,
982    const int DST_WIDTH,
983    const int DST_HEIGHT)
984{
985    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
986                                           src_stride_x,
987                                           src_step_x,
988                                           src_stride_y,
989                                           src_step_y,
990                                           src_stride_z,
991                                           src_step_z,
992                                           src_stride_w,
993                                           src_step_w,
994                                           src_offset_first_element_in_bytes,
995                                           dst_ptr,
996                                           dst_stride_x,
997                                           dst_step_x,
998                                           dst_stride_y,
999                                           dst_step_y,
1000                                           dst_stride_z,
1001                                           dst_step_z,
1002                                           dst_stride_w,
1003                                           dst_step_w,
1004                                           dst_offset_first_element_in_bytes,
1005#if defined(HAS_BIAS)
1006                                           bias_ptr,
1007                                           bias_stride_x,
1008                                           bias_step_x,
1009                                           bias_offset_first_element_in_bytes,
1010#endif // defined(HAS_BIAS)
1011                                           dst_size,
1012                                           SRC_HEIGHT,
1013                                           DST_WIDTH,
1014                                           DST_HEIGHT);
1015}
1016#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X3_NHWC)
1017
1018#if defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
1019/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
1020 *
1021 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
1022 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
1023 * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
1024 * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
1025 * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
1026 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
1027 *
1028 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
1029 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
1030 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
1031 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
1032 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
1033 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
1034 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
1035 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
1036 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
1037 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
1038 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
1039 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
1040 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
1041 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
1042 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
1043 * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
1044 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
1045 * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
1046 * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
1047 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1048 * @param[in]  SRC_HEIGHT                        The source tensor's height
1049 * @param[in]  DST_WIDTH                         The destination tensor's width
1050 * @param[in]  DST_HEIGHT                        The destination tensor's height
1051 */
1052__kernel void winograd_output_transform_1x4_1x5_nhwc(
1053    TENSOR4D_DECLARATION(src),
1054    TENSOR4D_DECLARATION(dst),
1055#if defined(HAS_BIAS)
1056    VECTOR_DECLARATION(bias),
1057#endif // defined(HAS_BIAS)
1058    int       dst_size,
1059    const int SRC_HEIGHT,
1060    const int DST_WIDTH,
1061    const int DST_HEIGHT)
1062{
1063    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
1064                                           src_stride_x,
1065                                           src_step_x,
1066                                           src_stride_y,
1067                                           src_step_y,
1068                                           src_stride_z,
1069                                           src_step_z,
1070                                           src_stride_w,
1071                                           src_step_w,
1072                                           src_offset_first_element_in_bytes,
1073                                           dst_ptr,
1074                                           dst_stride_x,
1075                                           dst_step_x,
1076                                           dst_stride_y,
1077                                           dst_step_y,
1078                                           dst_stride_z,
1079                                           dst_step_z,
1080                                           dst_stride_w,
1081                                           dst_step_w,
1082                                           dst_offset_first_element_in_bytes,
1083#if defined(HAS_BIAS)
1084                                           bias_ptr,
1085                                           bias_stride_x,
1086                                           bias_step_x,
1087                                           bias_offset_first_element_in_bytes,
1088#endif // defined(HAS_BIAS)
1089                                           dst_size,
1090                                           SRC_HEIGHT,
1091                                           DST_WIDTH,
1092                                           DST_HEIGHT);
1093}
1094#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_1X4_1X5_NHWC)
1095#endif // defined(VEC_SIZE) && VEC_SIZE == 4
1096#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
1097#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)