/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"

#include "src/common/utils/Log.h"
#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"
#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h"
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"

using namespace arm_compute::misc::shape_calculator;
using namespace arm_compute::experimental;

namespace arm_compute
{
namespace cpu
{
namespace
{
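// Translate the public GEMMInfo descriptor into the AsmGemmInfo descriptor consumed by
// CpuGemmAssemblyDispatch (activation, output stage, 3D reinterpretation, fast-math and
// reshape-B-once flags).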
cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    cpu::AsmGemmInfo asm_info;
    asm_info.method                      = cpu::AsmConvMethod::Im2Col;
    asm_info.reinterpret_input_as_3d     = info.reinterpret_input_as_3d();
    asm_info.depth_output_gemm3d         = info.depth_output_gemm3d();
    asm_info.activation_info             = info.activation_info();
    asm_info.output_stage                = info.gemmlowp_output_stage();
    asm_info.fast_mode                   = info.fast_math();
    asm_info.reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();

    return asm_info;
}
} // namespace

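/*
 * Typical call sequence for this operator (illustrative sketch only, not part of the library;
 * tensor creation, allocation and workspace management are assumed to be handled by the caller):
 *
 *   CpuGemmLowpMatrixMultiplyCore gemm;
 *   gemm.configure(&a_info, &b_info, nullptr, &dst_info, GEMMInfo()); // static shape/type setup
 *
 *   ITensorPack pack{ { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &dst } };
 *   // Auxiliary buffers described by gemm.workspace() must also be added to the pack using
 *   // their offset_int_vec() slot ids before running.
 *   gemm.run(pack); // run() invokes prepare() internally on first use
 */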
CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore()
    : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()),
      _mm_kernel(),
      _mtx_a_reshape_kernel(),
      _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(),
      _mtx_b_reduction_kernel(),
      _offset_contribution_kernel(),
      _offset_contribution_output_stage_kernel(),
      _activation_func(),
      _convert_to_signed_asymm(),
      _convert_from_signed_asymm(),
      _vector_sum_col(),
      _vector_sum_row(),
      _tmp_a(),
      _tmp_b(),
      _mm_result_s32(),
      _signed_a(),
      _signed_output(),
      _a_offset(0),
      _b_offset(0),
      _run_vector_matrix_multiplication(false),
      _assembly_path(false),
      _fused_assembly_path(false),
      _reshape_b_only_on_first_run(false),
      _is_prepared(false),
      _fuse_output_stage(false),
      _run_activation(false),
      _flip_signedness(false),
      _gemm_info(),
      _aux_mem(Count)
{
}
CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;

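// Configure the operator: decide between the fused assembly path, the plain assembly path and
// the reference kernel path, and record which auxiliary buffers each choice needs.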
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
    ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info);

    const ITensorInfo *matrix_a = a;
    const ITensorInfo *matrix_b = b;
    GEMMInfo           info     = gemm_info;

    // Set internal variables
    _a_offset                         = a->quantization_info().uniform().offset;
    _b_offset                         = b->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->dimension(1) < 2;
    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
    _is_prepared                      = false;
    _fused_assembly_path              = false;
    _flip_signedness                  = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _gemm_info                        = gemm_info;

    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();

    const ITensorInfo *a_to_use = a;

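    // When B is quantized per-channel and A is unsigned QASYMM8, the LHS is converted to
    // QASYMM8_SIGNED for the computation: the quantization offsets and the output-stage bounds
    // are shifted by 128 so the numerical result is preserved, and the output is converted back
    // to the unsigned domain at the end of run().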
    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
    if(_flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        _signed_a                = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = _signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
    }

    // Initialize assembly kernel meta-data
    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
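    // On AArch64, try the optimized assembly kernels first. If the output stage is
    // QUANTIZE_DOWN_FIXEDPOINT on asymmetric inputs, requantization can be fused directly into
    // the assembly GEMM (_fused_assembly_path); otherwise the assembly GEMM produces S32 results
    // and the offset contribution/output stage run as separate kernels.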
#ifdef __aarch64__
    switch(a->data_type())
    {
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::U8:
        case DataType::S8:
        {
            if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
            {
                auto c_info_to_use = c == nullptr ? nullptr : c;
                _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
                _fused_assembly_path = _asm_glue->is_configured();
            }
            else
            {
                auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
                _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
            }
            _assembly_path = _asm_glue->is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
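    // Reference (non-assembly) path: unless this is a vector-by-matrix product, A is interleaved
    // in 4x4 blocks and B is transposed 1xW so CpuGemmLowpMatrixMultiplyKernel can stream both
    // operands efficiently.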
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());

        // Configure interleave kernel
        _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
    }

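    // For quantized inputs the product expands as
    //   dst(i,j) = sum_k (A(i,k) - a_offset) * (B(k,j) - b_offset)
    //            = sum_k A(i,k)*B(k,j) - b_offset * sum_k A(i,k) - a_offset * sum_k B(k,j)
    //              + k * a_offset * b_offset
    // so the row sums of A are only needed when b_offset != 0 and the column sums of B only when
    // a_offset != 0; the offset contribution kernels add these terms to the raw S32 result.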
    if(!_fused_assembly_path)
    {
        // Build reduction info
        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
            }

            _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                                _flip_signedness ? &_signed_output : dst,
                                                                a->dimension(0),
                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
                _convert_from_signed_asymm->configure(&_signed_output, dst);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, dst);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
                                                   _a_offset, _b_offset);
        }
    }
    // Configure activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    _run_activation                       = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
    if(_run_activation)
    {
        _activation_func = std::make_unique<CpuActivation>();
        _activation_func->configure(dst, nullptr, activation);
    }

    if(_assembly_path)
    {
        auto asm_mem_req           = _asm_glue->workspace();
        _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
        _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
    }

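    // Buffers that are filled once in prepare() (the reshaped B and its column sums when
    // reshape_b_only_on_first_run is set) are requested as Persistent so they survive across
    // run() calls; everything else is Temporary workspace for a single run.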
    // Request memory for LHS and RHS reshape matrix
    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
                                        && _reshape_b_only_on_first_run ?
                                        MemoryLifetime::Persistent :
                                        MemoryLifetime::Temporary,
                                        _vector_sum_col.total_size());
    _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
    _aux_mem[TmpA]         = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
    _aux_mem[TmpB]         = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
    _aux_mem[MMResultS32]  = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
    _aux_mem[SignedA]      = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
    _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
}

Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

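    // Mirror the configure() decision tree on cloned TensorInfo objects only; nothing is
    // configured or allocated here.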
    GEMMInfo           info          = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;
    if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
    {
        run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
        run_optimised_requantized = run_optimised;
    }
    else
    {
        run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                          a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                          b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                          c,
                                                                                                          flip_signedness ? &signed_output : output,
                                                                                                          a_offset, b_offset,
                                                                                                          info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
                                                                                               a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                               b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                               a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
    }

    return Status{};
}

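// Execute the operator: optional signedness conversion of A, assembly or reference GEMM,
// the A/B reductions and offset contribution (with or without fused output stage), conversion
// back to the unsigned domain if needed, and finally the fused activation when it could not be
// handled inside the assembly kernel.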
void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
{
    prepare(tensors);

    auto a        = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto b        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto c        = tensors.get_const_tensor(TensorType::ACL_SRC_2);
    auto dst      = tensors.get_tensor(TensorType::ACL_DST);
    auto a_to_use = a;
    auto matrix_a = a;
    auto matrix_b = b;

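    // Bind the auxiliary workspace tensors provided in the pack (offset_int_vec slots) to the
    // TensorInfo objects captured at configure() time.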
    CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false);
    CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false);
    CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false);
    CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true);
    CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false);
    CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false);
    CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, a },
            { TensorType::ACL_DST, signed_a.get() }
        };
        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
        a_to_use = signed_a.get();
        matrix_a = signed_a.get();
    }

    // Run GEMM
    if(_asm_glue->is_configured())
    {
        ITensorPack asm_glue_tensors = tensors;
        auto        output_to_use    = (_fuse_output_stage ? mm_result_s32.get() : dst);
        if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst);
        }
        else
        {
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
            asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
            asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
        }
        _asm_glue->run(asm_glue_tensors);
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            matrix_a = tmp_a.get();
            matrix_b = tmp_b.get();
            // Run interleave kernel
            ITensorPack pack_a =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, tmp_a.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);

            if(!_reshape_b_only_on_first_run)
            {
                ITensorPack pack_b =
                {
                    { TensorType::ACL_SRC, b },
                    { TensorType::ACL_DST, tmp_b.get() }
                };
                // Run transpose kernel
                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
            }
        }
        ITensorPack pack_mm =
        {
            { TensorType::ACL_SRC_0, matrix_a },
            { TensorType::ACL_SRC_1, matrix_b }
        };
        if(_fuse_output_stage)
        {
            pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
        }
        else
        {
            pack_mm.add_tensor(TensorType::ACL_DST, dst);
        }
        NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
    }

    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, a_to_use },
                { TensorType::ACL_DST, vector_sum_row.get() }
            };
            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            ITensorPack pack =
            {
                { TensorType::ACL_SRC, b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }

        if(_fuse_output_stage)
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_SRC_3, c);
            pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
        }
        else
        {
            ITensorPack pack;
            pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get());
            pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get());
            pack.add_tensor(TensorType::ACL_DST, dst);

            // Run offset contribution kernel
            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, signed_output.get() },
            { TensorType::ACL_DST, dst }
        };
        NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
    }

    // Run fused activation unless already run in the fused assembly
    if(_run_activation)
    {
        ITensorPack pack =
        {
            { TensorType::ACL_SRC, dst },
            { TensorType::ACL_DST, dst }
        };
        _activation_func->run(pack);
    }
}

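// One-off preparation: let the assembly dispatch pretranspose B, or run the Transpose1xW reshape
// and the B column-sum reduction when B is only reshaped on the first run, so subsequent run()
// calls can reuse the persistent buffers.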
void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
{
    if(!_is_prepared)
    {
        auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
        // Run assembly reshape
        if(_asm_glue->is_configured())
        {
            _asm_glue->prepare(tensors);
        }
        // Run non-assembly reshape
        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
        {
            // Run reshape kernel and mark original weights tensor as unused
            ITensor            *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
            CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, tmp_b.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
        {
            ITensor            *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
            CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
            ITensorPack         pack =
            {
                { TensorType::ACL_SRC, original_b },
                { TensorType::ACL_DST, vector_sum_col.get() }
            };
            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
        }
        _is_prepared = true;
    }
}
experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const
{
    return _aux_mem;
}
} // namespace cpu
} // namespace arm_compute