xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/scale/neon/list.h (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H
25 #define SRC_CORE_NEON_KERNELS_SCALE_LIST_H
26 
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/Window.h"
29 #include "src/core/NEON/wrapper/wrapper.h"
30 #include "src/core/utils/ScaleUtils.h"
31 #include "support/Rounding.h"
32 
33 namespace arm_compute
34 {
35 namespace cpu
36 {
37 #define DECLARE_SCALE_KERNEL(func_name)                                                                                         \
38     void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,              \
39                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
40                    bool align_corners, const Window &window)
41 
42 DECLARE_SCALE_KERNEL(s16_neon_scale);
43 DECLARE_SCALE_KERNEL(u8_neon_scale);
44 DECLARE_SCALE_KERNEL(s8_neon_scale);
45 DECLARE_SCALE_KERNEL(qasymm8_neon_scale);
46 DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
47 
48 #undef DECLARE_SCALE_KERNEL
49 
50 template <typename T>
nearest_neon_scale(const ITensor * src,ITensor * dst,const ITensor * offsets,float sampling_offset,bool align_corners,const Window & window)51 void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
52                         bool align_corners, const Window &window)
53 {
54     ARM_COMPUTE_UNUSED(offsets);
55 
56     // Compute the ratio between source and destination dimensions
57     const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
58     const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
59 
60     const int in_stride_y  = src->info()->strides_in_bytes()[1];
61     const int in_stride_z  = src->info()->strides_in_bytes()[2];
62     const int in_stride_w  = src->info()->strides_in_bytes()[3];
63     const int out_stride_y = dst->info()->strides_in_bytes()[1];
64     const int out_stride_z = dst->info()->strides_in_bytes()[2];
65     const int out_stride_w = dst->info()->strides_in_bytes()[3];
66     const int out_dim_ch   = dst->info()->dimension(0);
67     const int step_cout    = 16 / sizeof(T);
68 
69     Window window_execution = window;
70     window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
71     Window win_in_out(window);
72     win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
73     win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
74     Iterator in(src, win_in_out);
75     Iterator out(dst, win_in_out);
76 
77     const int xo_start = window_execution.y().start();
78     const int xo_end   = window_execution.y().end();
79     const int xo_step  = window_execution.y().step();
80     const int yo_start = window_execution.z().start();
81     const int yo_end   = window_execution.z().end();
82     const int yo_step  = window_execution.z().step();
83     const int bo_start = window_execution[3].start();
84     const int bo_end   = window_execution[3].end();
85     const int bo_step  = window_execution[3].step();
86 
87     for(int bo = bo_start; bo < bo_end; bo += bo_step)
88     {
89         const uint8_t *in_ptr_base  = in.ptr() + bo * in_stride_w;
90         uint8_t       *out_ptr_base = out.ptr() + bo * out_stride_w;
91 
92         for(int yo = yo_start; yo < yo_end; yo += yo_step)
93         {
94             // Floating-point coordinate
95             float yi_f = ((yo + sampling_offset) * scale_y);
96             int   yi   = 0;
97             if(align_corners)
98             {
99                 yi = utils::rounding::round_half_away_from_zero(yi_f);
100             }
101             else
102             {
103                 yi = static_cast<int>(std::floor(yi_f));
104             }
105 
106             for(int xo = xo_start; xo < xo_end; xo += xo_step)
107             {
108                 // Floating-point coordinate
109                 float xi_f = ((xo + sampling_offset) * scale_x);
110                 int   xi   = 0;
111                 if(align_corners)
112                 {
113                     xi = utils::rounding::round_half_away_from_zero(xi_f);
114                 }
115                 else
116                 {
117                     xi = static_cast<int>(std::floor(xi_f));
118                 }
119 
120                 const uint8_t *in_ptr  = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
121                 uint8_t       *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
122 
123                 int cout = 0;
124                 for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
125                 {
126                     auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
127                     wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
128                 }
129 
130                 for(; cout < out_dim_ch; ++cout)
131                 {
132                     auto out0                                            = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
133                     *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
134                 }
135             }
136         }
137     }
138 }
139 
140 template <typename T>
bilinear_neon_scale(const ITensor * src,ITensor * dst,const ITensor * offsets,const ITensor * dx,const ITensor * dy,BorderMode border_mode,PixelValue constant_border_value,float sampling_offset,bool align_corners,const Window & window)141 void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
142                          BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
143                          bool align_corners, const Window &window)
144 {
145     ARM_COMPUTE_UNUSED(offsets);
146     ARM_COMPUTE_UNUSED(dx);
147     ARM_COMPUTE_UNUSED(dy);
148     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
149 
150     // Compute the ratio between source and destination dimensions
151     const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
152     const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
153 
154     const int in_stride_y  = src->info()->strides_in_bytes()[1];
155     const int in_stride_z  = src->info()->strides_in_bytes()[2];
156     const int in_stride_w  = src->info()->strides_in_bytes()[3];
157     const int out_stride_y = dst->info()->strides_in_bytes()[1];
158     const int out_stride_z = dst->info()->strides_in_bytes()[2];
159     const int out_stride_w = dst->info()->strides_in_bytes()[3];
160     const int in_dim_w     = src->info()->dimension(1);
161     const int in_dim_h     = src->info()->dimension(2);
162     const int out_dim_ch   = dst->info()->dimension(0);
163     const int step_cout    = 16 / sizeof(T);
164 
165     Window window_execution = window;
166     window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
167     Window win_in_out(window);
168     win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
169     win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
170     Iterator in(src, win_in_out);
171     Iterator out(dst, win_in_out);
172 
173     const int xo_start = window_execution.y().start();
174     const int xo_end   = window_execution.y().end();
175     const int xo_step  = window_execution.y().step();
176     const int yo_start = window_execution.z().start();
177     const int yo_end   = window_execution.z().end();
178     const int yo_step  = window_execution.z().step();
179     const int bo_start = window_execution[3].start();
180     const int bo_end   = window_execution[3].end();
181     const int bo_step  = window_execution[3].step();
182 
183     if(border_mode == BorderMode::CONSTANT)
184     {
185 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
186         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
187 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
188         using ConstType = T;
189 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
190         const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
191 
192         for(int bo = bo_start; bo < bo_end; bo += bo_step)
193         {
194             const uint8_t *in_ptr_base  = in.ptr() + bo * in_stride_w;
195             uint8_t       *out_ptr_base = out.ptr() + bo * out_stride_w;
196 
197             for(int yo = yo_start; yo < yo_end; yo += yo_step)
198             {
199                 // Floating-point coordinate
200                 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
201                 // Integer coordinate
202                 const auto yi = static_cast<int>(std::floor(yi_f));
203                 // Weight for the y coordinate
204                 const auto a1 = (yi_f - static_cast<float>(yi));
205                 const auto b1 = (1.f - a1);
206 
207                 for(int xo = xo_start; xo < xo_end; xo += xo_step)
208                 {
209                     // Floating-point coordinate
210                     const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
211                     // Integer coordinate
212                     const auto xi = static_cast<int>(std::floor(xi_f));
213                     // Weight for the x coordinate
214                     const auto a = (xi_f - static_cast<float>(xi));
215                     const auto b = (1.f - a);
216 
217                     const auto s00_s = static_cast<T>(b * b1);
218                     const auto s01_s = static_cast<T>(a * b1);
219                     const auto s10_s = static_cast<T>(b * a1);
220                     const auto s11_s = static_cast<T>(a * a1);
221 
222                     const uint8_t *in_ptr  = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
223                     uint8_t       *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
224 
225                     int cout = 0;
226                     for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
227                     {
228                         auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
229                         auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
230                         auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
231                         auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
232                         if((yi >= 0) && (yi < in_dim_h))
233                         {
234                             if((xi >= 0) && (xi < in_dim_w))
235                             {
236                                 in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
237                             }
238                             if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
239                             {
240                                 in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
241                             }
242                         }
243                         if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
244                         {
245                             if((xi >= 0) && (xi < in_dim_w))
246                             {
247                                 in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
248                             }
249                             if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
250                             {
251                                 in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
252                             }
253                         }
254 
255                         const auto s00  = wrapper::vdup_n(s00_s, ExactTagType{});
256                         const auto s01  = wrapper::vdup_n(s01_s, ExactTagType{});
257                         const auto s10  = wrapper::vdup_n(s10_s, ExactTagType{});
258                         const auto s11  = wrapper::vdup_n(s11_s, ExactTagType{});
259                         auto       out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
260                         out0            = wrapper::vmla(out0, in00, s00);
261                         out0            = wrapper::vmla(out0, in01, s01);
262                         out0            = wrapper::vmla(out0, in10, s10);
263                         out0            = wrapper::vmla(out0, in11, s11);
264                         wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
265                     }
266 
267                     for(; cout < out_dim_ch; ++cout)
268                     {
269                         auto in00 = static_cast<T>(const_border_value);
270                         auto in01 = static_cast<T>(const_border_value);
271                         auto in10 = static_cast<T>(const_border_value);
272                         auto in11 = static_cast<T>(const_border_value);
273                         if((yi >= 0) && (yi < in_dim_h))
274                         {
275                             if((xi >= 0) && (xi < in_dim_w))
276                             {
277                                 in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
278                             }
279                             if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
280                             {
281                                 in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
282                             }
283                         }
284                         if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
285                         {
286                             if((xi >= 0) && (xi < in_dim_w))
287                             {
288                                 in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
289                             }
290                             if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
291                             {
292                                 in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
293                             }
294                         }
295                         auto out0 = static_cast<T>(0);
296                         out0 += in00 * s00_s;
297                         out0 += in01 * s01_s;
298                         out0 += in10 * s10_s;
299                         out0 += in11 * s11_s;
300                         *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
301                     }
302                 }
303             }
304         }
305     }
306     else if(border_mode == BorderMode::REPLICATE)
307     {
308         for(int bo = bo_start; bo < bo_end; bo += bo_step)
309         {
310             const uint8_t *in_ptr  = in.ptr() + bo * in_stride_w;
311             uint8_t       *out_ptr = out.ptr() + bo * out_stride_w;
312 
313             for(int yo = yo_start; yo < yo_end; yo += yo_step)
314             {
315                 // Floating-point coordinate
316                 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
317                 // Integer coordinate
318                 const auto yi = static_cast<int>(std::floor(yi_f));
319                 // Weight for the y coordinate
320                 const auto a1 = (yi_f - static_cast<float>(yi));
321                 const auto b1 = (1.f - a1);
322 
323                 const int yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
324                 const int yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
325 
326                 const int yi0_offset = yi0 * in_stride_z;
327                 const int yi1_offset = yi1 * in_stride_z;
328 
329                 const int y_offset = yo * out_stride_z;
330                 for(int xo = xo_start; xo < xo_end; xo += xo_step)
331                 {
332                     // Floating-point coordinate
333                     const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
334                     // Integer coordinate
335                     const auto xi = static_cast<int>(std::floor(xi_f));
336                     // Weight for the x coordinate
337                     const auto a = (xi_f - static_cast<float>(xi));
338                     const auto b = (1.f - a);
339 
340                     const auto s00_s = static_cast<T>(b * b1);
341                     const auto s01_s = static_cast<T>(a * b1);
342                     const auto s10_s = static_cast<T>(b * a1);
343                     const auto s11_s = static_cast<T>(a * a1);
344 
345                     const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
346                     const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
347                     const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
348                     const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
349 
350                     const int xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
351                     const int xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
352 
353                     const int xi0_offset = xi0 * in_stride_y;
354                     const int xi1_offset = xi1 * in_stride_y;
355 
356                     const int offset = xo * out_stride_y + y_offset;
357 
358                     int cout = 0;
359                     for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
360                     {
361                         const auto in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
362                         const auto in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
363                         const auto in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
364                         const auto in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
365 
366                         auto out0 = wrapper::vmul(in00, s00);
367                         out0      = wrapper::vmla(out0, in01, s01);
368                         out0      = wrapper::vmla(out0, in10, s10);
369                         out0      = wrapper::vmla(out0, in11, s11);
370                         wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0);
371                     }
372 
373                     for(; cout < out_dim_ch; ++cout)
374                     {
375                         const T in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
376                         const T in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
377                         const T in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
378                         const T in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
379 
380                         T out0 = in00 * s00_s;
381                         out0 += in01 * s01_s;
382                         out0 += in10 * s10_s;
383                         out0 += in11 * s11_s;
384                         *(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T))) = out0;
385                     }
386                 }
387             }
388         }
389     }
390     else
391     {
392         ARM_COMPUTE_ERROR("Not implemented");
393     }
394 }
395 
396 template <typename T>
common_neon_scale(const ITensor * src,ITensor * dst,const ITensor * offsets,const ITensor * dx,const ITensor * dy,InterpolationPolicy policy,BorderMode border_mode,PixelValue constant_border_value,float sampling_offset,bool align_corners,const Window & window)397 void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
398                        InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
399                        bool align_corners, const Window &window)
400 {
401     if(policy == InterpolationPolicy::BILINEAR)
402     {
403         bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
404     }
405     else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
406     {
407         nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
408     }
409 }
410 } // namespace cpu
411 } // namespace arm_compute
412 
413 #endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */
414