/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
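// Validate the (src, bias, dst) combination for the output stage: floating-point
// sources pass through with an optional bias, while S32 sources must be requantized
// to QASYMM8/QASYMM8_SIGNED and therefore cannot be computed in place.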
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
                          const DirectConvolutionLayerOutputStageKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);

    if(bias != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
    }

    if(src->data_type() == DataType::S32)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
    }

    // Checks performed when output is configured
    if((dst != nullptr) && (dst->total_size() != 0))
    {
        if(is_data_type_float(src->data_type()))
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
        }
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
    }
    else if(src->data_type() == DataType::S32)
    {
        // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
        ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
    }

    return Status{};
}

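// Floating-point output stage for NCHW: the per-channel bias (selected by the Z
// coordinate) is broadcast across a vector and added to each element. The fixed-point
// requantization parameters are unused in this overload.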
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
    const bool has_bias = bias != nullptr;
    /** SIMD vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
    ARM_COMPUTE_UNUSED(result_shift);
    ARM_COMPUTE_UNUSED(result_offset_after_shift);

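    // Vectorize along X in steps of one 128-bit register (16 bytes / element size lanes);
    // the scalar left-overs loop below handles the tail.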
    const int window_start_x = window.x().start();
    const int window_end_x = window.x().end();
    const int window_step_x = 16 / src->info()->element_size();
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, win);
    Iterator out(dst, win);
    execute_window_loop(win, [&](const Coordinates & id)
    {
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
            auto v_in = wrapper::vloadq(in_ptr);

            // Accumulate bias
            if(has_bias)
            {
                const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
                v_in = wrapper::vadd(v_in, vb);
            }

            const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
            wrapper::vstore(out_ptr, v_in);
        }

        // Left-overs loop
        for(; x < window_end_x; ++x)
        {
            // Get bias and pointer to input
            auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);

            // Accumulate bias
            if(has_bias)
            {
                const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
                s_in += b;
            }

            *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
        }
    },
    in, out);
}

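// Floating-point output stage for NHWC: the channel dimension is innermost (X), so the
// bias is loaded as a contiguous vector and added element-wise instead of broadcast.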
template <typename T>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
    const bool has_bias = bias != nullptr;
    ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
    ARM_COMPUTE_UNUSED(result_shift);
    ARM_COMPUTE_UNUSED(result_offset_after_shift);

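    // Pin the bias window to the origin in every dimension: the 1D bias is indexed
    // manually along X (the channel dimension) inside the loop below.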
    Window window_bias = window;
    window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
    window_bias.set(3, Window::Dimension(0, 0, 0));

    const int window_start_x = window.x().start();
    const int window_end_x = window.x().end();
    const int window_step_x = 16 / src->info()->element_size();
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, win);
    Iterator bi(bias, window_bias);
    Iterator out(dst, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
            auto v_in = wrapper::vloadq(in_ptr + x);

            // Accumulate bias
            if(has_bias)
            {
                const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
                v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
            }

            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
            wrapper::vstore(out_ptr + x, v_in);
        }

        // Left-overs loop
        for(; x < window_end_x; ++x)
        {
            // Get bias and pointer to input
            auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);

            // Accumulate bias
            if(has_bias)
            {
                const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
                s_in += *bias_ptr;
            }

            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
            *(out_ptr + x) = s_in;
        }
    },
    in, bi, out);
}

// Quantized case
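// The S32 accumulators are converted to the 8-bit output with finalize_quantization()
// (see src/core/NEON/NEAsymm.h): a fixed-point multiply by result_fixedpoint_multiplier,
// an arithmetic right shift by result_shift, an addition of result_offset_after_shift,
// and a clamp to the output type's representable range.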
template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
    const bool has_bias = bias != nullptr;
    using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
    using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;

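    // Broadcast the post-shift offset and the output type's saturation bounds across vector lanes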
    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);

    const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
    const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});

    const int window_start_x = window.x().start();
    const int window_end_x = window.x().end();
    const int window_step_x = 16 / src->info()->element_size();
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, win);
    Iterator out(dst, win);

    execute_window_loop(win, [&](const Coordinates & id)
    {
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
            int32x4x4_t v_in =
            {
                {
                    wrapper::vloadq(in_ptr),
                    wrapper::vloadq(in_ptr + 4),
                    wrapper::vloadq(in_ptr + 8),
                    wrapper::vloadq(in_ptr + 12)
                }
            };

            // Accumulate bias
            if(has_bias)
            {
                const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
                v_in =
                {
                    {
                        wrapper::vadd(v_in.val[0], vb),
                        wrapper::vadd(v_in.val[1], vb),
                        wrapper::vadd(v_in.val[2], vb),
                        wrapper::vadd(v_in.val[3], vb)
                    }
                };
            }

            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
            wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
                                                           min, max, false));
        }

        // Left-overs loop
        for(; x < window_end_x; ++x)
        {
            // Get bias and pointer to input
            int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);

            // Accumulate bias
            if(has_bias)
            {
                const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
                s_in += b;
            }

            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
            *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
                                             std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
        }
    },
    in, out);
}
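// Quantized output stage for NHWC: the per-channel bias is loaded as contiguous vectors
// along X before the S32 accumulators are requantized to the 8-bit output type.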
template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
    const bool has_bias = bias != nullptr;
    using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
    using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;

    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);

    const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
    const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});

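    // As in the floating-point NHWC path, pin the bias window so the 1D bias can be
    // indexed manually along the channel (X) dimension.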
    Window window_bias = window;
    window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
    window_bias.set(3, Window::Dimension(0, 0, 0));

    const int window_start_x = window.x().start();
    const int window_end_x = window.x().end();
    const int window_step_x = 16 / src->info()->element_size();
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, win);
    Iterator bi(bias, window_bias);
    Iterator out(dst, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
            int32x4x4_t v_in =
            {
                {
                    wrapper::vloadq(in_ptr),
                    wrapper::vloadq(in_ptr + 4),
                    wrapper::vloadq(in_ptr + 8),
                    wrapper::vloadq(in_ptr + 12)
                }
            };

            // Accumulate bias
            if(has_bias)
            {
                const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;

                v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
                v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
                v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
                v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
            }

            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
            wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
        }

        // Left-overs loop
        for(; x < window_end_x; ++x)
        {
            // Get bias and pointer to input
            const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
            int32_t s_in = *in_ptr;

            // Accumulate bias
            if(has_bias)
            {
                const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
                s_in += *bias_ptr;
            }

            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
            *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
                                             std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
        }
    },
    in, bi, out);
}
} // namespace

void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
                                                 const DirectConvolutionLayerOutputStageKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(bias);
    // Perform validation step
    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));

    _func = nullptr;
    _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
    _result_shift = info.result_shift;
    _result_offset_after_shift = info.result_offset_after_shift;

    // Auto-initialize the output if required
    if(dst != nullptr)
    {
        // Work out expected output data type
        const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
        // Output tensor auto initialization if not yet initialized
        auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
    }

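    // The kernel window spans the whole output with unit steps; no border is required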
    Window win = calculate_max_window(*src, Steps());

    ICpuKernel::configure(win);

    const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;

    // Set appropriate function
    if(src->data_layout() == DataLayout::NCHW)
    {
        switch(src->data_type())
        {
            case DataType::S32:
            {
                if(is_qasymm8_signed)
                {
                    _func = &output_stage_nchw<int8_t>;
                }
                else
                {
                    _func = &output_stage_nchw<uint8_t>;
                }
                break;
            }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
            {
                _func = &output_stage_nchw<float16_t>;
                break;
            }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
            {
                _func = &output_stage_nchw<float>;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
            }
        }
    }
    else
    {
        switch(src->data_type())
        {
            case DataType::S32:
            {
                if(is_qasymm8_signed)
                {
                    _func = &output_stage_nhwc<int8_t>;
                }
                else
                {
                    _func = &output_stage_nhwc<uint8_t>;
                }
                break;
            }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
            {
                _func = &output_stage_nhwc<float16_t>;
                break;
            }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
            {
                _func = &output_stage_nhwc<float>;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
            }
        }
    }
}

Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
                                                  const DirectConvolutionLayerOutputStageKernelInfo &info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
    return Status{};
}

void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
    auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto dst = tensors.get_tensor(TensorType::ACL_DST);

    (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}

const char *CpuDirectConv2dOutputStageKernel::name() const
{
    return "CpuDirectConv2dOutputStageKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute