1 /*
2 * Copyright (c) 2021-2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24 #include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
25
26 #include "arm_compute/core/Utils.h"
27 #include "arm_compute/core/Validate.h"
28 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
29 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
30 #include "src/core/CPP/Validate.h"
31 #include "src/core/helpers/AutoConfiguration.h"
32 #include "src/core/helpers/WindowHelpers.h"
33 #include "src/core/utils/AssemblyUtils.h"
34
35 #include "src/core/NEON/kernels/assembly/depthwise.hpp"
36
37 #include "depthwise_common.hpp"
38
39 #include <arm_neon.h>
40
41 namespace arm_compute
42 {
43 namespace cpu
44 {
45 namespace kernels
46 {
47 using namespace arm_compute::misc::shape_calculator;
48
49 namespace
50 {
// Indices of the tensor dimensions read by the helpers in this file:
// channels at 0, width at 1, height at 2 and batches at 3.
constexpr unsigned int idx_channels{ 0 };
constexpr unsigned int idx_width{ 1 };
constexpr unsigned int idx_height{ 2 };
constexpr unsigned int idx_batches{ 3 };
55
56 template <typename TSrc, typename TWeights, typename TDst>
create_arm_dwc(const ITensorInfo * src,const ITensorInfo * weights,ITensorInfo * dst,const ConvolutionInfo & info,const CPUInfo & cpu_info,std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> & kernel,std::string & _name)57 void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
58 const ConvolutionInfo &info, const CPUInfo &cpu_info,
59 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
60 {
61 unsigned int stride_cols{};
62 unsigned int stride_rows{};
63 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
64
65 const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
66
67 const unsigned int n_batches = src->dimension(idx_batches);
68 const unsigned int src_rows = src->dimension(idx_height);
69 const unsigned int src_cols = src->dimension(idx_width);
70 const unsigned int n_channels = src->dimension(idx_channels);
71 const unsigned int dst_rows = dst->dimension(idx_height);
72 const unsigned int dst_cols = dst->dimension(idx_width);
73
74 const unsigned int kernel_cols = weights->dimension(idx_width);
75 const unsigned int kernel_rows = weights->dimension(idx_height);
76
77 const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
78
79 arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
80 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
81 padding, activation, nullptr);
82
83 // Configure assembly pooling kernel
84 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
85 if(dwc_kernel_asm == nullptr)
86 {
87 // Configuration not supported: Leave function unconfigured:
88 return;
89 }
90
91 _name = dwc_kernel_asm->name();
92 kernel = std::move(dwc_kernel_asm);
93 }
94
95 template <typename TSrc, typename TWeights, typename TDst>
create_arm_dwc_quant(const ITensorInfo * src,const ITensorInfo * weights,ITensorInfo * dst,const ConvolutionInfo & info,const CPUInfo & cpu_info,std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> & kernel,std::vector<int32_t> & multipliers,std::vector<int32_t> & right_shifts,std::vector<int32_t> & left_shifts,std::string & _name)96 void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
97 const ConvolutionInfo &info, const CPUInfo &cpu_info,
98 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
99 std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
100 std::string &_name)
101 {
102 unsigned int stride_cols{};
103 unsigned int stride_rows{};
104 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
105
106 const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
107
108 const unsigned int n_batches = src->dimension(idx_batches);
109 const unsigned int src_rows = src->dimension(idx_height);
110 const unsigned int src_cols = src->dimension(idx_width);
111 const unsigned int n_channels = src->dimension(idx_channels);
112 const unsigned int dst_rows = dst->dimension(idx_height);
113 const unsigned int dst_cols = dst->dimension(idx_width);
114
115 const unsigned int kernel_cols = weights->dimension(idx_width);
116 const unsigned int kernel_rows = weights->dimension(idx_height);
117
118 const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
119
120 arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
121 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
122 padding, activation, nullptr);
123
124 const auto src_qinfo = src->quantization_info().uniform();
125 const auto weights_qinfo = weights->quantization_info();
126 const auto dst_qinfo = dst->quantization_info().uniform();
127
128 const unsigned int num_filters = weights_qinfo.scale().size();
129
130 multipliers.resize(num_filters);
131 std::vector<int32_t> dst_shifts(num_filters);
132 quantization::compute_quantized_multipliers_and_shifts(src,
133 weights,
134 dst,
135 multipliers.data(),
136 dst_shifts.data());
137
138 // Quantize activation bounds
139 int32_t min_activation = std::numeric_limits<TSrc>::lowest();
140 int32_t max_activation = std::numeric_limits<TSrc>::max();
141 if(info.act_info.enabled())
142 {
143 std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
144 }
145
146 // Set quantization parameters for assembly kernels
147 arm_gemm::Requantize32 requant_args{};
148 if(is_data_type_quantized_per_channel(weights->data_type()))
149 {
150 left_shifts.resize(num_filters);
151 right_shifts.resize(num_filters);
152 bool need_left_shift = false; // Select more optimized path if left shift is not needed
153 for(unsigned int i = 0; i < num_filters; ++i)
154 {
155 left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
156 right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
157 if(dst_shifts[i] < 0 && !need_left_shift)
158 {
159 need_left_shift = true;
160 }
161 }
162
163 requant_args = arm_gemm::Requantize32(nullptr,
164 0,
165 src_qinfo.offset,
166 weights_qinfo.uniform().offset,
167 dst_qinfo.offset,
168 (need_left_shift) ? left_shifts.data() : nullptr,
169 right_shifts.data(),
170 multipliers.data(),
171 static_cast<TSrc>(min_activation),
172 static_cast<TSrc>(max_activation));
173 }
174 else
175 {
176 requant_args = arm_gemm::Requantize32(nullptr,
177 0,
178 src_qinfo.offset,
179 weights_qinfo.uniform().offset,
180 dst_qinfo.offset,
181 -dst_shifts[0],
182 multipliers[0],
183 static_cast<TSrc>(min_activation),
184 static_cast<TSrc>(max_activation));
185 }
186
187 // Configure assembly pooling kernel with requantization
188 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
189 if(dwc_kernel_asm == nullptr)
190 {
191 // Configuration not supported: Leave function unconfigured:
192 return;
193 }
194 _name = dwc_kernel_asm->name();
195 kernel = std::move(dwc_kernel_asm);
196 }
197 } // namespace
198
CpuDepthwiseConv2dAssemblyWrapperKernel()199 CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
200 : _kernel_asm(nullptr),
201 _multipliers(),
202 _left_shifts(),
203 _right_shifts(),
204 _name()
205 {
206 }
207
208 CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
209
configure(const ITensorInfo * src,const ITensorInfo * weights,const ITensorInfo *,ITensorInfo * dst,const ConvolutionInfo & info,const CPUInfo & cpu_info)210 void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
211 const ConvolutionInfo &info, const CPUInfo &cpu_info)
212 {
213 ARM_COMPUTE_UNUSED(cpu_info);
214 ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
215
216 // Destination initialization if not yet initialized
217 const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
218 auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
219 _name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
220 std::string asm_kernel_name("");
221 #if defined(__aarch64__)
222 switch(src->data_type())
223 {
224 case DataType::QASYMM8:
225 if(is_data_type_quantized_per_channel(weights->data_type()))
226 {
227 create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
228 }
229 else
230 {
231 create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
232 }
233 break;
234 case DataType::QASYMM8_SIGNED:
235 create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
236 break;
237 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
238 case DataType::F16:
239 create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
240 break;
241 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
242 case DataType::F32:
243 create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
244 break;
245 default:
246 break;
247 }
248 #endif // defined(__aarch64__)
249
250 Window win = calculate_max_window(*dst, Steps());
251 ICpuKernel::configure(win);
252 if(_kernel_asm != nullptr)
253 {
254 _name += "/" + asm_kernel_name;
255 }
256 }
257
validate(const ITensorInfo * src,const ITensorInfo * weights,const ITensorInfo * bias,const ITensorInfo * dst,const ConvolutionInfo & info)258 Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
259 {
260 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
261
262 #if !defined(__aarch64__)
263 ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
264 #endif // !defined(__aarch64__)
265 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
266 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
267 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
268 ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)");
269
270 if(is_data_type_quantized_per_channel(weights->data_type()))
271 {
272 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
273 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
274 }
275 else
276 {
277 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
278 }
279
280 if(bias != nullptr)
281 {
282 ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
283 ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
284
285 if(is_data_type_quantized(src->data_type()))
286 {
287 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
288 }
289 else
290 {
291 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
292 }
293 }
294
295 if(dst->total_size() > 0)
296 {
297 const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
298 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
299 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
300 }
301 return Status{};
302 }
303
run_op(ITensorPack & tensors,const Window & window,const ThreadInfo & info)304 void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
305 {
306 ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
307 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
308 ARM_COMPUTE_UNUSED(window);
309 ARM_COMPUTE_UNUSED(info);
310
311 ARM_COMPUTE_ERROR_ON(tensors.empty());
312
313 const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
314 ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
315 ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
316 ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
317
318 const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
319 auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
320 auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
321 auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
322
323 const auto src_shape = src->info()->tensor_shape();
324 const auto dst_shape = dst->info()->tensor_shape();
325 const auto src_padding = src->info()->padding();
326 const auto dst_padding = dst->info()->padding();
327
328 const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
329 const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
330 const size_t ld_src_batch = ld_src_row * src_shape[2];
331 const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
332 const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
333 const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
334
335 _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
336 parameters_ptr,
337 dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
338 working_space, info.thread_id, info.num_threads);
339 }
340
pack_parameters(void * parameters_ptr,void * bias_ptr,void * weights_ptr,size_t ld_weights_col,size_t ld_weight_row)341 void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
342 {
343 _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
344 }
345
get_storage_size() const346 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const
347 {
348 return _kernel_asm->get_storage_size();
349 }
350
get_working_size(unsigned int num_threads,unsigned int num_input_channels) const351 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
352 {
353 return _kernel_asm->get_working_size(num_threads, num_input_channels);
354 }
355
is_configured() const356 bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const
357 {
358 return _kernel_asm != nullptr;
359 }
360
name() const361 const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const
362 {
363 return _name.c_str();
364 }
365
get_mws(const CPUInfo & platform,size_t thread_count) const366 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
367 {
368 ARM_COMPUTE_UNUSED(thread_count);
369 ARM_COMPUTE_UNUSED(platform);
370
371 return ICPPKernel::default_mws;
372 }
373 } // namespace kernels
374 } // namespace cpu
375 } // namespace arm_compute
376