xref: /aosp_15_r20/external/ComputeLibrary/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/ITensor.h"
29 #include "arm_compute/core/Types.h"
30 #include "arm_compute/core/Validate.h"
31 #include "arm_compute/core/Window.h"
32 #include "arm_compute/core/utils/misc/Traits.h"
33 #include "src/core/CPP/Validate.h"
34 #include "src/core/NEON/NEAsymm.h"
35 #include "src/core/NEON/NEFixedPoint.h"
36 #include "src/core/NEON/wrapper/wrapper.h"
37 #include "src/core/helpers/AutoConfiguration.h"
38 #include "src/core/helpers/WindowHelpers.h"
39 
#include <arm_neon.h>

#include <cstddef>
#include <cstdint>
#include <limits>
43 
44 namespace arm_compute
45 {
46 namespace cpu
47 {
48 namespace kernels
49 {
50 namespace
51 {
validate_arguments(const ITensorInfo * src,const ITensorInfo * bias,const ITensorInfo * dst,const DirectConvolutionLayerOutputStageKernelInfo & info)52 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
53                           const DirectConvolutionLayerOutputStageKernelInfo &info)
54 {
55     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
56     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
57     ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
58     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
59 
60     if(bias != nullptr)
61     {
62         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
63         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
64         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
65     }
66 
67     if(src->data_type() == DataType::S32)
68     {
69         ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
70     }
71 
72     // Checks performed when output is configured
73     if((dst != nullptr) && (dst->total_size() != 0))
74     {
75         if(is_data_type_float(src->data_type()))
76         {
77             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
78         }
79         else
80         {
81             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
82         }
83         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
84     }
85     else if(src->data_type() == DataType::S32)
86     {
87         // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
88         ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
89     }
90 
91     return Status{};
92 }
93 
94 template <typename T>
95 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
output_stage_nchw(ITensor * src,const ITensor * bias,const Window & window,ITensor * dst,int result_fixedpoint_multiplier,int result_shift,int result_offset_after_shift)96 output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
97                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
98 {
99     const bool has_bias = bias != nullptr;
100     /** SIMD vector tag type. */
101     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
102 
103     ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
104     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
105     ARM_COMPUTE_UNUSED(result_shift);
106     ARM_COMPUTE_UNUSED(result_offset_after_shift);
107 
108     const int window_start_x = window.x().start();
109     const int window_end_x   = window.x().end();
110     const int window_step_x  = 16 / src->info()->element_size();
111     Window    win            = window;
112     win.set(Window::DimX, Window::Dimension(0, 1, 1));
113 
114     Iterator in(src, win);
115     Iterator out(dst, win);
116     execute_window_loop(win, [&](const Coordinates & id)
117     {
118         int x = window_start_x;
119         for(; x <= (window_end_x - window_step_x); x += window_step_x)
120         {
121             // Get bias and pointer to input
122             const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
123             auto       v_in   = wrapper::vloadq(in_ptr);
124 
125             // Accumulate bias
126             if(has_bias)
127             {
128                 const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
129                 v_in          = wrapper::vadd(v_in, vb);
130             }
131 
132             const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
133             wrapper::vstore(out_ptr, v_in);
134         }
135 
136         // Left-overs loop
137         for(; x < window_end_x; ++x)
138         {
139             // Get bias and pointer to input
140             auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
141 
142             // Accumulate bias
143             if(has_bias)
144             {
145                 const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
146                 s_in += b;
147             }
148 
149             *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
150         }
151 
152     },
153     in, out);
154 }
155 
156 template <typename T>
157 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
output_stage_nhwc(ITensor * src,const ITensor * bias,const Window & window,ITensor * dst,int result_fixedpoint_multiplier,int result_shift,int result_offset_after_shift)158 output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
159                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
160 {
161     const bool has_bias = bias != nullptr;
162     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
163     ARM_COMPUTE_UNUSED(result_shift);
164     ARM_COMPUTE_UNUSED(result_offset_after_shift);
165 
166     Window window_bias = window;
167     window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
168     window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
169     window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
170     window_bias.set(3, Window::Dimension(0, 0, 0));
171 
172     const int window_start_x = window.x().start();
173     const int window_end_x   = window.x().end();
174     const int window_step_x  = 16 / src->info()->element_size();
175     Window    win            = window;
176     win.set(Window::DimX, Window::Dimension(0, 1, 1));
177 
178     Iterator in(src, win);
179     Iterator bi(bias, window_bias);
180     Iterator out(dst, win);
181 
182     execute_window_loop(win, [&](const Coordinates &)
183     {
184         int x = window_start_x;
185         for(; x <= (window_end_x - window_step_x); x += window_step_x)
186         {
187             // Get bias and pointer to input
188             const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
189             auto       v_in   = wrapper::vloadq(in_ptr + x);
190 
191             // Accumulate bias
192             if(has_bias)
193             {
194                 const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
195                 v_in                = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
196             }
197 
198             const auto out_ptr = reinterpret_cast<T *>(out.ptr());
199             wrapper::vstore(out_ptr + x, v_in);
200         }
201 
202         // Left-overs loop
203         for(; x < window_end_x; ++x)
204         {
205             // Get bias and pointer to input
206             auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
207 
208             // Accumulate bias
209             if(has_bias)
210             {
211                 const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
212                 s_in += *bias_ptr;
213             }
214 
215             const auto out_ptr = reinterpret_cast<T *>(out.ptr());
216             *(out_ptr + x)     = s_in;
217         }
218     },
219     in, bi, out);
220 }
221 
222 // Quantized case
223 template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
output_stage_nchw(ITensor * src,const ITensor * bias,const Window & window,ITensor * dst,int result_fixedpoint_multiplier,int result_shift,int result_offset_after_shift)224 void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
225                        int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
226 {
227     const bool has_bias = bias != nullptr;
228     using VectorType    = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
229     using TagType       = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
230 
231     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
232 
233     const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
234     const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
235 
236     const int window_start_x = window.x().start();
237     const int window_end_x   = window.x().end();
238     const int window_step_x  = 16 / src->info()->element_size();
239     Window    win            = window;
240     win.set(Window::DimX, Window::Dimension(0, 1, 1));
241 
242     Iterator in(src, win);
243     Iterator out(dst, win);
244 
245     execute_window_loop(win, [&](const Coordinates & id)
246     {
247 
248         int x = window_start_x;
249         for(; x <= (window_end_x - window_step_x); x += window_step_x)
250         {
251             // Get bias and pointer to input
252             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
253             int32x4x4_t v_in =
254             {
255                 {
256                     wrapper::vloadq(in_ptr),
257                     wrapper::vloadq(in_ptr + 4),
258                     wrapper::vloadq(in_ptr + 8),
259                     wrapper::vloadq(in_ptr + 12)
260                 }
261             };
262 
263             // Accumulate bias
264             if(has_bias)
265             {
266                 const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
267                 v_in =
268                 {
269                     {
270                         wrapper::vadd(v_in.val[0], vb),
271                         wrapper::vadd(v_in.val[1], vb),
272                         wrapper::vadd(v_in.val[2], vb),
273                         wrapper::vadd(v_in.val[3], vb)
274                     }
275                 };
276             }
277 
278             const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
279             wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
280                                                            min, max, false));
281         }
282 
283         // Left-overs loop
284         for(; x < window_end_x; ++x)
285         {
286             // Get bias and pointer to input
287             int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
288 
289             // Accumulate bias
290             if(has_bias)
291             {
292                 const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
293                 s_in += b;
294             }
295 
296             const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
297             *out_ptr           = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
298                                                        std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
299         }
300     },
301     in, out);
302 }
303 template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
output_stage_nhwc(ITensor * src,const ITensor * bias,const Window & window,ITensor * dst,int result_fixedpoint_multiplier,int result_shift,int result_offset_after_shift)304 void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
305                        int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
306 {
307     const bool has_bias = bias != nullptr;
308     using VectorType    = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
309     using TagType       = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
310 
311     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
312 
313     const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
314     const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
315 
316     Window window_bias = window;
317     window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
318     window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
319     window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
320     window_bias.set(3, Window::Dimension(0, 0, 0));
321 
322     const int window_start_x = window.x().start();
323     const int window_end_x   = window.x().end();
324     const int window_step_x  = 16 / src->info()->element_size();
325     Window    win            = window;
326     win.set(Window::DimX, Window::Dimension(0, 1, 1));
327 
328     Iterator in(src, win);
329     Iterator bi(bias, window_bias);
330     Iterator out(dst, win);
331 
332     execute_window_loop(win, [&](const Coordinates &)
333     {
334         int x = window_start_x;
335         for(; x <= (window_end_x - window_step_x); x += window_step_x)
336         {
337             // Get bias and pointer to input
338             const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
339             int32x4x4_t v_in =
340             {
341                 {
342                     wrapper::vloadq(in_ptr),
343                     wrapper::vloadq(in_ptr + 4),
344                     wrapper::vloadq(in_ptr + 8),
345                     wrapper::vloadq(in_ptr + 12),
346                 }
347             };
348 
349             // Accumulate bias
350             if(has_bias)
351             {
352                 const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
353 
354                 wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
355                 wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
356                 wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
357                 wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
358             }
359 
360             const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
361             wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
362         }
363 
364         // Left-overs loop
365         for(; x < window_end_x; ++x)
366         {
367             // Get bias and pointer to input
368             const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
369             int32_t    s_in   = *in_ptr;
370 
371             // Accumulate bias
372             if(has_bias)
373             {
374                 const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
375                 s_in += *bias_ptr;
376             }
377 
378             const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
379             *out_ptr           = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
380                                                        std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
381         }
382     },
383     in, bi, out);
384 }
385 } // namespace
386 
configure(ITensorInfo * src,const ITensorInfo * bias,ITensorInfo * dst,const DirectConvolutionLayerOutputStageKernelInfo & info)387 void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
388                                                  const DirectConvolutionLayerOutputStageKernelInfo &info)
389 {
390     ARM_COMPUTE_UNUSED(bias);
391     // Perform validation step
392     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
393     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
394 
395     _func                         = nullptr;
396     _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
397     _result_shift                 = info.result_shift;
398     _result_offset_after_shift    = info.result_offset_after_shift;
399 
400     // Auto-initialize output output if required
401     if(dst != nullptr)
402     {
403         // Work out expected output data type
404         const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
405         // Output tensor auto initialization if not yet initialized
406         auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
407     }
408 
409     Window win = calculate_max_window(*src, Steps());
410 
411     ICpuKernel::configure(win);
412 
413     const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
414 
415     // Set appropriate function
416     if(src->data_layout() == DataLayout::NCHW)
417     {
418         switch(src->data_type())
419         {
420             case DataType::S32:
421             {
422                 if(is_qasymm8_signed)
423                 {
424                     _func = &output_stage_nchw<int8_t>;
425                 }
426                 else
427                 {
428                     _func = &output_stage_nchw<uint8_t>;
429                 }
430                 break;
431             }
432 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
433             case DataType::F16:
434             {
435                 _func = &output_stage_nchw<float16_t>;
436                 break;
437             }
438 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
439             case DataType::F32:
440             {
441                 _func = &output_stage_nchw<float>;
442                 break;
443             }
444             default:
445             {
446                 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
447             }
448         }
449     }
450     else
451     {
452         switch(src->data_type())
453         {
454             case DataType::S32:
455             {
456                 if(is_qasymm8_signed)
457                 {
458                     _func = &output_stage_nhwc<int8_t>;
459                 }
460                 else
461                 {
462                     _func = &output_stage_nhwc<uint8_t>;
463                 }
464                 break;
465             }
466 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
467             case DataType::F16:
468             {
469                 _func = &output_stage_nhwc<float16_t>;
470                 break;
471             }
472 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
473             case DataType::F32:
474             {
475                 _func = &output_stage_nhwc<float>;
476                 break;
477             }
478             default:
479             {
480                 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
481             }
482         }
483     }
484 }
485 
validate(const ITensorInfo * src,const ITensorInfo * bias,const ITensorInfo * dst,const DirectConvolutionLayerOutputStageKernelInfo & info)486 Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
487                                                   const DirectConvolutionLayerOutputStageKernelInfo &info)
488 {
489     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
490     return Status{};
491 }
492 
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)493 void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
494 {
495     ARM_COMPUTE_UNUSED(info);
496     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
497     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
498     ARM_COMPUTE_ERROR_ON(_func == nullptr);
499 
500     auto src  = tensors.get_tensor(TensorType::ACL_SRC_0);
501     auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
502     auto dst  = tensors.get_tensor(TensorType::ACL_DST);
503 
504     (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
505 }
506 
name() const507 const char *CpuDirectConv2dOutputStageKernel::name() const
508 {
509     return "CpuDirectConv2dOutputStageKernel";
510 }
511 } // namespace kernels
512 } // namespace cpu
513 } // namespace arm_compute
514