xref: /aosp_15_r20/external/ComputeLibrary/src/core/CL/cl_kernels/nhwc/scale.cl (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1/*
2 * Copyright (c) 2016-2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25#include "tile_helpers.h"
26
27#if defined(SCALE_NEAREST_NEIGHBOUR)
28//! @cond Doxygen_Suppress
29/** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC)
30 *
31 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
32 * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
33 * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
34 * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
35 * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
36 * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
37 * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
38 * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
39 * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
40 *
41 * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
42 * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
43 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
44 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
45 * @param[in] src_c                             The size of the channels dimension of the source tensor
46 * @param[in] src_w                             The size of the width dimension of the source tensor
47 * @param[in] src_h                             The size of the height dimension of the source tensor
48 * @param[in] src_n                             The size of the batches dimension of the source tensor
49 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
50 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
51 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
52 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
53 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
54 * @param[in] dst_c                             The size of the channels dimension of the destination tensor
55 * @param[in] dst_w                             The size of the width dimension of the destination tensor
56 * @param[in] dst_h                             The size of the height dimension of the destination tensor
57 * @param[in] dst_n                             The size of the batches dimension of the destination tensor
58 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
59 * @param[in] scale_x                           The scale value to apply on the source width
60 * @param[in] scale_y                           The scale value to apply on the source height
61 */
62//! @endcond
63__kernel void scale_nearest_neighbour_nhwc(
64    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
65    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
66    const float scale_x,
67    const float scale_y)
68{
69    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
70    const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
71#if defined(BATCHED_EXECUTION)
72    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
73    const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
74#else                                                  // defined(BATCHED_EXECUTION)
75    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
76    const int bout = 0;                        // BATCH SIZE IDX
77#endif                                                 // defined(BATCHED_EXECUTION)
78
79#ifdef SAMPLING_POLICY_TOP_LEFT
80    float xi_f = (xo * scale_x);
81    float yi_f = (yo * scale_y);
82#elif SAMPLING_POLICY_CENTER
83    float     xi_f = ((xo + 0.5f) * scale_x);
84    float     yi_f = ((yo + 0.5f) * scale_y);
85#else // SAMPLING_POLICY
86#error("Unsupported sampling policy");
87#endif // SAMPLING_POLICY
88
89#ifdef ALIGN_CORNERS
90    xi_f = round(xi_f);
91    yi_f = round(yi_f);
92#endif // ALIGN_CORNERS
93
94    const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
95    const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
96
97    TILE(SRC_DATA_TYPE, 1, N0, in00);
98
99    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
100
101    TILE(uint, 1, 1, dst_indirect_y);
102
103    // Calculate the destination indirect Y
104    dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
105
106    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
107
108    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
109}
110#endif /* SCALE_NEAREST_NEIGHBOUR */
111
112#if defined(SCALE_BILINEAR)
113//! @cond Doxygen_Suppress
114/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
115 *
116 * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
117 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
118 * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
119 * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
120 * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
121 * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
122 * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
123 * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
124 * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
125 * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
126 *
127 * @note In case of QASYMM8, the following extra information must be passed at compile time:
128 * - The source offset e.g. -DOFFSET=4
129 * - The source scale e.g. -DSCALE=4
130 *
131 * @param[in]  src_img                           (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE
132 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
133 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
134 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
135 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
136 * @param[in]  src_c                             The size of the channels dimension of the source tensor
137 * @param[in]  src_w                             The size of the width dimension of the source tensor
138 * @param[in]  src_h                             The size of the height dimension of the source tensor
139 * @param[in]  src_n                             The size of the batches dimension of the source tensor
140 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
141 * @param[out] dst_img                           (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE
142 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
143 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
144 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
145 * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
146 * @param[in]  dst_c                             The size of the channels dimension of the destination tensor
147 * @param[in]  dst_w                             The size of the width dimension of the destination tensor
148 * @param[in]  dst_h                             The size of the height dimension of the destination tensor
149 * @param[in]  dst_n                             The size of the batches dimension of the destination tensor
150 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
151 * @param[in]  scale_x                           The scale value to apply on the source width
152 * @param[in]  scale_y                           The scale value to apply on the source height
153 */
154//! @endcond
155__kernel void scale_bilinear_nhwc(
156    TENSOR4D_RO_T(src, SRC_TENSOR_TYPE),
157    TENSOR4D_WO_T(dst, DST_TENSOR_TYPE),
158    const float scale_x,
159    const float scale_y)
160{
161    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
162    const int xo   = GET_SPATIAL_IDX(1, 1, 0);           // WIDTH
163#if defined(BATCHED_EXECUTION)
164    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
165    const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
166#else                                                  // defined(BATCHED_EXECUTION)
167    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
168    const int bout = 0;                        // BATCH SIZE IDX
169#endif                                                 // defined(BATCHED_EXECUTION)
170
171#ifdef SAMPLING_POLICY_TOP_LEFT
172    float xi_f = (xo * scale_x);
173    float yi_f = (yo * scale_y);
174#elif SAMPLING_POLICY_CENTER
175    float     xi_f = ((xo + 0.5f) * scale_x - 0.5f);
176    float     yi_f = ((yo + 0.5f) * scale_y - 0.5f);
177#else // SAMPLING_POLICY
178#error("Unsupported sampling policy");
179#endif // SAMPLING_POLICY
180
181    const int xi = (int)floor(xi_f);
182    const int yi = (int)floor(yi_f);
183
184    TILE(SRC_DATA_TYPE, 1, N0, in00);
185    TILE(SRC_DATA_TYPE, 1, N0, in01);
186    TILE(SRC_DATA_TYPE, 1, N0, in10);
187    TILE(SRC_DATA_TYPE, 1, N0, in11);
188
189    // Initialize the tiles to CONSTANT_VALUE
190    in00[0].v = CONSTANT_VALUE;
191    in01[0].v = CONSTANT_VALUE;
192    in10[0].v = CONSTANT_VALUE;
193    in11[0].v = CONSTANT_VALUE;
194
195#ifndef BORDER_MODE_REPLICATE
196    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
197    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
198    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
199    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
200#else  // BORDER_MODE_REPLICATE
201    const int xi0  = clamp(xi, 0, (int)src_w - 1);
202    const int yi0  = clamp(yi, 0, (int)src_h - 1);
203    const int xi1  = clamp(xi + 1, 0, (int)src_w - 1);
204    const int yi1  = clamp(yi + 1, 0, (int)src_h - 1);
205
206    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
207    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
208    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
209    T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
210#endif // BORDER_MODE_REPLICATE
211
212    TILE(DST_DATA_TYPE, 1, N0, out);
213
214#if defined(IS_FLOATING_POINT)
215    const SRC_DATA_TYPE a  = (SRC_DATA_TYPE)(xi_f - (float)xi);
216    const SRC_DATA_TYPE b  = (SRC_DATA_TYPE)(1.f - a);
217    const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
218    const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
219
220    // Calculate the output
221    out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
222#else  // defined(IS_FLOATING_POINT)
223
224    const float a  = (xi_f - (float)xi);
225    const float b  = (1.f - a);
226    const float a1 = (yi_f - (float)yi);
227    const float b1 = (1.f - a1);
228
229    out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
230                           (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
231                           (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
232                           (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1),
233                           VEC_DATA_TYPE(DST_DATA_TYPE, N0));
234#endif // defined(IS_FLOATING_POINT)
235
236    TILE(uint, 1, 1, dst_indirect_y);
237
238    // Calculate the destination indirect Y
239    dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
240
241    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
242
243    T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
244}
245#endif /* SCALE_BILINEAR */
246