1/* 2 * Copyright (c) 2016-2023 Arm Limited. 3 * 4 * SPDX-License-Identifier: MIT 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to 8 * deal in the Software without restriction, including without limitation the 9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 * sell copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in all 14 * copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24#include "helpers.h" 25#include "tile_helpers.h" 26 27#if defined(SCALE_NEAREST_NEIGHBOUR) 28//! @cond Doxygen_Suppress 29/** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC) 30 * 31 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT 32 * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) 33 * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) 34 * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) 35 * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) 36 * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) 37 * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) 38 * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time 39 * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time 40 * 41 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. 42 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 43 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 44 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 45 * @param[in] src_c The size of the channels dimension of the source tensor 46 * @param[in] src_w The size of the width dimension of the source tensor 47 * @param[in] src_h The size of the height dimension of the source tensor 48 * @param[in] src_n The size of the batches dimension of the source tensor 49 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 50 * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. 51 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 52 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 53 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 54 * @param[in] dst_c The size of the channels dimension of the destination tensor 55 * @param[in] dst_w The size of the width dimension of the destination tensor 56 * @param[in] dst_h The size of the height dimension of the destination tensor 57 * @param[in] dst_n The size of the batches dimension of the destination tensor 58 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 59 * @param[in] scale_x The scale value to apply on the source width 60 * @param[in] scale_y The scale value to apply on the source height 61 */ 62//! @endcond 63__kernel void scale_nearest_neighbour_nhwc( 64 TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), 65 TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), 66 const float scale_x, 67 const float scale_y) 68{ 69 const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM 70 const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH 71#if defined(BATCHED_EXECUTION) 72 const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT 73 const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX 74#else // defined(BATCHED_EXECUTION) 75 const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT 76 const int bout = 0; // BATCH SIZE IDX 77#endif // defined(BATCHED_EXECUTION) 78 79#ifdef SAMPLING_POLICY_TOP_LEFT 80 float xi_f = (xo * scale_x); 81 float yi_f = (yo * scale_y); 82#elif SAMPLING_POLICY_CENTER 83 float xi_f = ((xo + 0.5f) * scale_x); 84 float yi_f = ((yo + 0.5f) * scale_y); 85#else // SAMPLING_POLICY 86#error("Unsupported sampling policy"); 87#endif // SAMPLING_POLICY 88 89#ifdef ALIGN_CORNERS 90 xi_f = round(xi_f); 91 yi_f = round(yi_f); 92#endif // ALIGN_CORNERS 93 94 const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1); 95 const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1); 96 97 TILE(SRC_DATA_TYPE, 1, N0, in00); 98 99 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00); 100 101 TILE(uint, 1, 1, dst_indirect_y); 102 103 // Calculate the destination indirect Y 104 dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h); 105 106 bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; 107 108 T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y); 109} 110#endif /* SCALE_NEAREST_NEIGHBOUR */ 111 112#if defined(SCALE_BILINEAR) 113//! @cond Doxygen_Suppress 114/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC) 115 * 116 * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE 117 * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT 118 * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) 119 * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) 120 * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) 121 * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) 122 * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) 123 * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) 124 * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time 125 * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time 126 * 127 * @note In case of QASYMM8, the following extra information must be passed at compile time: 128 * - The source offset e.g. -DOFFSET=4 129 * - The source scale e.g. -DSCALE=4 130 * 131 * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE 132 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. 133 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) 134 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) 135 * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) 136 * @param[in] src_c The size of the channels dimension of the source tensor 137 * @param[in] src_w The size of the width dimension of the source tensor 138 * @param[in] src_h The size of the height dimension of the source tensor 139 * @param[in] src_n The size of the batches dimension of the source tensor 140 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor 141 * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE 142 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. 143 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) 144 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) 145 * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) 146 * @param[in] dst_c The size of the channels dimension of the destination tensor 147 * @param[in] dst_w The size of the width dimension of the destination tensor 148 * @param[in] dst_h The size of the height dimension of the destination tensor 149 * @param[in] dst_n The size of the batches dimension of the destination tensor 150 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor 151 * @param[in] scale_x The scale value to apply on the source width 152 * @param[in] scale_y The scale value to apply on the source height 153 */ 154//! @endcond 155__kernel void scale_bilinear_nhwc( 156 TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), 157 TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), 158 const float scale_x, 159 const float scale_y) 160{ 161 const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM 162 const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH 163#if defined(BATCHED_EXECUTION) 164 const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT 165 const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX 166#else // defined(BATCHED_EXECUTION) 167 const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT 168 const int bout = 0; // BATCH SIZE IDX 169#endif // defined(BATCHED_EXECUTION) 170 171#ifdef SAMPLING_POLICY_TOP_LEFT 172 float xi_f = (xo * scale_x); 173 float yi_f = (yo * scale_y); 174#elif SAMPLING_POLICY_CENTER 175 float xi_f = ((xo + 0.5f) * scale_x - 0.5f); 176 float yi_f = ((yo + 0.5f) * scale_y - 0.5f); 177#else // SAMPLING_POLICY 178#error("Unsupported sampling policy"); 179#endif // SAMPLING_POLICY 180 181 const int xi = (int)floor(xi_f); 182 const int yi = (int)floor(yi_f); 183 184 TILE(SRC_DATA_TYPE, 1, N0, in00); 185 TILE(SRC_DATA_TYPE, 1, N0, in01); 186 TILE(SRC_DATA_TYPE, 1, N0, in10); 187 TILE(SRC_DATA_TYPE, 1, N0, in11); 188 189 // Initialize the tiles to CONSTANT_VALUE 190 in00[0].v = CONSTANT_VALUE; 191 in01[0].v = CONSTANT_VALUE; 192 in10[0].v = CONSTANT_VALUE; 193 in11[0].v = CONSTANT_VALUE; 194 195#ifndef BORDER_MODE_REPLICATE 196 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00); 197 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01); 198 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10); 199 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11); 200#else // BORDER_MODE_REPLICATE 201 const int xi0 = clamp(xi, 0, (int)src_w - 1); 202 const int yi0 = clamp(yi, 0, (int)src_h - 1); 203 const int xi1 = clamp(xi + 1, 0, (int)src_w - 1); 204 const int yi1 = clamp(yi + 1, 0, (int)src_h - 1); 205 206 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00); 207 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01); 208 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10); 209 T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11); 210#endif // BORDER_MODE_REPLICATE 211 212 TILE(DST_DATA_TYPE, 1, N0, out); 213 214#if defined(IS_FLOATING_POINT) 215 const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi); 216 const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a); 217 const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi); 218 const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1); 219 220 // Calculate the output 221 out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1)); 222#else // defined(IS_FLOATING_POINT) 223 224 const float a = (xi_f - (float)xi); 225 const float b = (1.f - a); 226 const float a1 = (yi_f - (float)yi); 227 const float b1 = (1.f - a1); 228 229 out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 230 (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + 231 (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 232 (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), 233 VEC_DATA_TYPE(DST_DATA_TYPE, N0)); 234#endif // defined(IS_FLOATING_POINT) 235 236 TILE(uint, 1, 1, dst_indirect_y); 237 238 // Calculate the destination indirect Y 239 dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h); 240 241 bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; 242 243 T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y); 244} 245#endif /* SCALE_BILINEAR */ 246