/*
 * Copyright (c) 2017-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/MemoryHelpers.h"
#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/cpu/operators/CpuGemm.h"
#include "tests/NEON/Accessor.h"
#include "tests/NEON/Helper.h"
#include "tests/PaddingCalculator.h"
#include "tests/datasets/LargeGEMMDataset.h"
#include "tests/datasets/SmallGEMMDataset.h"
#include "tests/datasets/TinyGEMMDataset.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
#include "tests/framework/datasets/Datasets.h"
#include "tests/validation/Validation.h"
#include "tests/validation/fixtures/GEMMFixture.h"
#include "tests/validation/fixtures/GEMMInterleave4x4Fixture.h"
#include "tests/validation/fixtures/GEMMTranspose1xWFixture.h"

namespace arm_compute
{
namespace test
{
namespace validation
{
namespace
{
constexpr AbsoluteTolerance<float> tolerance_f(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for FP32 data types */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
RelativeTolerance<half_float::half> rel_tolerance_f16(half(0.2)); /**< Relative tolerance value for comparing reference's output against implementation's output for FP16 data types */
const AbsoluteTolerance<float>      abs_tolerance_f16(0.2f);      /**< Absolute tolerance value for comparing reference's output against implementation's output for FP16 data types */
constexpr float                     tolerance_num = 0.07f;        /**< Tolerance number for FP16 data types */
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
/** CNN data types */
const auto CNNDataTypes = framework::dataset::make("DataType",
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    DataType::F16,
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    DataType::F32,
});

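// M/N shape sweeps fed to the interleave and transpose kernel tests below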
const auto data_interleave = framework::dataset::make("M", 8, 12) * framework::dataset::make("N", 8, 12);
const auto data_transpose  = framework::dataset::make("M", 8, 14) * framework::dataset::make("N", 7, 14);

/** Zero padding test */
template <typename FunctionType>
bool validate_zero_padding(unsigned int dim0_value, unsigned int dim1_value)
{
    const TensorShape in_shape(dim0_value, dim1_value);
    TensorInfo        in(in_shape, 1, DataType::U32);
    TensorInfo        dst;

    ARM_COMPUTE_EXPECT(in.is_resizable(), framework::LogLevel::ERRORS);

    // Validate zero-padding
    FunctionType func;

    func.configure(&in, &dst);

    return in.padding().empty();
}

/** Zero padding test for GEMM kernels */
bool validate_gemm_zero_padding(const TensorShape shape0, const TensorShape shape1)
{
    // Create tensors
    TensorInfo in0(shape0, 1, DataType::F32);
    TensorInfo in1(shape1, 1, DataType::F32);
    TensorInfo dst;

    // Validate zero-padding
    cpu::kernels::CpuGemmMatrixMultiplyKernel gemm;
    gemm.configure(&in0, &in1, &dst, 1.0f, false);

    return in0.padding().empty() && in1.padding().empty() && dst.padding().empty();
}
} // namespace

TEST_SUITE(NEON)
TEST_SUITE(GEMM)

/** Test case for memory injection in @ref cpu::CpuGemm.
 *
 * Configure the operator once and inject memory at run-time in multiple executions.
 *
 * Checks performed in order:
 * - Both runs compute the same output
 */
TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
{
    auto       gemm      = std::make_unique<cpu::CpuGemm>();
    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto gemm_info = GEMMInfo{};
    gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info);

    // The source tensors are created once here; only the destination is re-created
    // on each call of the run_conv lambda below.
    auto lhs = create_tensor<Tensor>(lhs_info);
    auto rhs = create_tensor<Tensor>(rhs_info);
    auto c   = create_tensor<Tensor>(c_info);
    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    c.allocator()->allocate();

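    // run_pack carries every tensor needed at run-time; prep_pack carries only the
    // constant tensors (RHS and C) that prepare() may pre-transform once.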
    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };
    ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } };

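    // Allocate the auxiliary workspace tensors requested by the operator through a memory group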
    auto mg = MemoryGroup{};
    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);

    auto run_conv = [&]() -> Tensor
    {
        auto dst = create_tensor<Tensor>(dst_info);
        dst.allocator()->allocate();
        run_pack.add_tensor(TensorType::ACL_DST, &dst);

        library->fill_tensor_value(Accessor(lhs), 1.f);
        library->fill_tensor_value(Accessor(rhs), 2.f);
        library->fill_tensor_value(Accessor(c), 3.f);
        // This operator is configured once and captured by this lambda.
        gemm->prepare(prep_pack);
        gemm->run(run_pack);
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();
    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
    {
        ARM_COMPUTE_EXPECT(reinterpret_cast<float *>(result_0.buffer())[i] == reinterpret_cast<float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
    }
}

/** Test case for memory injection in @ref NEGEMM.
 *
 * Make sure @ref NEGEMM still works when the memory is injected at configure time through the old API.
 *
 * Checks performed in order:
 * - Both runs compute the same output
 */
TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
{
    auto       gemm      = std::make_unique<NEGEMM>();
    const auto lhs_info  = TensorInfo(TensorShape(3U, 3U), 1, DataType::F32);
    const auto rhs_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto c_info    = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    auto       dst_info  = TensorInfo(TensorShape(4U, 3U), 1, DataType::F32);
    const auto gemm_info = GEMMInfo{};
    auto       run_conv  = [&]()
    {
        auto lhs = create_tensor<Tensor>(lhs_info);
        auto rhs = create_tensor<Tensor>(rhs_info);
        auto c   = create_tensor<Tensor>(c_info);
        auto dst = create_tensor<Tensor>(dst_info);
        gemm->configure(&lhs, &rhs, &c, &dst, 1.f, 1.f, gemm_info);
        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        c.allocator()->allocate();
        dst.allocator()->allocate();
        library->fill_tensor_value(Accessor(lhs), 1.f);
        library->fill_tensor_value(Accessor(rhs), 2.f);
        library->fill_tensor_value(Accessor(c), 3.f);
        gemm->run();
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();
    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
    {
        ARM_COMPUTE_EXPECT(reinterpret_cast<float *>(result_0.buffer())[i] == reinterpret_cast<float *>(result_1.buffer())[i], framework::LogLevel::ERRORS);
    }
}

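/** Static validation test: @ref NEGEMM::validate() must reject unsupported data types (S32) and accept F32. */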
// *INDENT-OFF*
// clang-format off
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
               framework::dataset::make("LhsInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::S32), // Unsupported data type
                                                     TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),
                                                   }),
               framework::dataset::make("RhsInfo",{ TensorInfo(TensorShape(8U, 27U), 1, DataType::S32),
                                                    TensorInfo(TensorShape(8U, 27U), 1, DataType::F32),
                                                  })),
               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(8U, 13U), 1, DataType::S32),
                                                       TensorInfo(TensorShape(8U, 13U), 1, DataType::F32),
                                                     })),
               framework::dataset::make("Expected", { false, true })),
               lhs_info, rhs_info, output_info, expected)
{
    constexpr float alpha     = 1.0f;
    constexpr float beta      = 0.0f;
    const auto      gemm_info = GEMMInfo();
    bool is_valid = bool(NEGEMM::validate(&lhs_info.clone()->set_is_resizable(true), &rhs_info.clone()->set_is_resizable(true), nullptr, &output_info.clone()->set_is_resizable(true), alpha, beta, gemm_info));
    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
}
// clang-format on
// *INDENT-ON*
TEST_SUITE(KERNEL_SELECTION)
DATA_TEST_CASE(KernelSelection_mul_and_add, framework::DatasetMode::ALL,
               combine(framework::dataset::make("CpuExt", std::string("NEON")),
                       framework::dataset::make("DataType", { DataType::F32,
                                                              DataType::F16
                                                            })),
               cpu_ext, data_type)
{
    using namespace cpu::kernels;

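    // Describe the ISA corresponding to the requested extension/data type and check
    // that the preferred micro-kernel implementation carries the expected name.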
    cpuinfo::CpuIsaInfo cpu_isa{};
    cpu_isa.neon = (cpu_ext == "NEON");
    cpu_isa.fp16 = (data_type == DataType::F16);

    const auto *selected_impl_mul = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);

    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_mul);

    std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_mul";
    std::string actual   = selected_impl_mul->name;

    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);

    const auto *selected_impl_add = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ data_type, cpu_isa }, cpu::KernelSelectionType::Preferred);

    ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl_add);

    expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_gemm_matrix_add";
    actual   = selected_impl_add->name;

    ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS);
}
TEST_SUITE_END() // KERNEL_SELECTION

TEST_SUITE(TRANSPOSE_1XW)
using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
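// The kernel is wrapped in a synthetic function so it can be configured and run like a complete operator.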
DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
                   framework::dataset::make("N", { 1, 23, 63, 101 }),
                   framework::dataset::make("K", { 1, 47, 29, 27 })),
               n_value, k_value)
{
    bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

TEST_SUITE(U32)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint32_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U32))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U32

TEST_SUITE(U16)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint16_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U16))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U16

TEST_SUITE(U8)
using CpuGemmTranspose1xWFixture = GEMMTranspose1xWValidationFixture<Tensor, Accessor, CpuGemmTranspose1xW, uint8_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::U8))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U8

TEST_SUITE_END() // TRANSPOSE_1XW

TEST_SUITE(INTERLEAVE_4X4)
using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;

DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
                   framework::dataset::make("M", { 1, 23, 63, 101 }),
                   framework::dataset::make("K", { 1, 47, 29, 27 })),
               m_value, k_value)
{
    bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

TEST_SUITE(U32)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint32_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U32))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U32

TEST_SUITE(U16)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint16_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U16))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U16

TEST_SUITE(U8)
using CpuGemmInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<Tensor, Accessor, CpuGemmInterleave4x4, uint8_t>;
FIXTURE_DATA_TEST_CASE(RunSmall, CpuGemmInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::U8))
{
    // Validate output
    validate(Accessor(_target), _reference);
}
TEST_SUITE_END() // U8

TEST_SUITE_END() // INTERLEAVE_4X4

template <typename T>
using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;

template <typename T>
using NEBatchedMatMulFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T, true, false, false, false, false, true>;
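// Note: the trailing boolean template arguments configure GEMMValidationFixture for the
// batched matrix-multiply path; see GEMMFixture.h for the meaning of each flag.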

TEST_SUITE(Float)
DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(framework::dataset::make("In0", { TensorShape(21U, 13U),
                                                                                                       TensorShape(31U, 1U),
                                                                                                       TensorShape(31U, 1U),
                                                                                                       TensorShape(8U, 2U),
                                                                                                       TensorShape(38U, 12U),
                                                                                                       TensorShape(32U, 1U)
                                                                                                     }),
                                                                     framework::dataset::make("In1", { TensorShape(33U, 21U),
                                                                                                       TensorShape(23U, 31U),
                                                                                                       TensorShape(23U, 31U),
                                                                                                       TensorShape(16U, 8U),
                                                                                                       TensorShape(21U, 38U),
                                                                                                       TensorShape(17U, 32U)
                                                                                                     })),
               shape0, shape1)
{
    bool status = validate_gemm_zero_padding(shape0, shape1);
    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
                                                                                                          framework::dataset::make("ReshapeWeights", { true, false })),
                                                                                                  framework::dataset::make("DataType", DataType::F16)))
{
    // Validate output
    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}

TEST_SUITE(BATCHED_MATMUL)

FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
                                                                                                                   framework::dataset::make("ReshapeWeights", { false })),
                                                                                                           framework::dataset::make("DataType", DataType::F16)))
{
    // Validate output
    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}
TEST_SUITE_END() // BATCHED_MATMUL

FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
                                                                                                        framework::dataset::make("ReshapeWeights", { true, false })),
                                                                                                framework::dataset::make("DataType", DataType::F16)))
{
    // Validate output
    validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}
TEST_SUITE_END() // FP16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),
                                                                                                           framework::dataset::make("ReshapeWeights", { true, false })),
                                                                                                   framework::dataset::make("DataType", DataType::F32)))
{
    // Validate output
    validate(Accessor(_target), _reference, tolerance_f);
}
FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeGEMMDataset(),
                                                                                                         framework::dataset::make("ReshapeWeights", { true, false })),
                                                                                                 framework::dataset::make("DataType", DataType::F32)))
{
    // Validate output
    validate(Accessor(_target), _reference, tolerance_f);
}

TEST_SUITE(BATCHED_MATMUL)

TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, NEBatchedMatMulFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallBatchedMatMulDataset(),
                                                                                                                    framework::dataset::make("ReshapeWeights", { false })),
                                                                                                            framework::dataset::make("DataType", DataType::F32)))
{
    // Validate output
    validate(Accessor(_target), _reference, tolerance_f);
}
TEST_SUITE_END() // FP32

TEST_SUITE_END() // BATCHED_MATMUL

TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float

TEST_SUITE_END() // GEMM
TEST_SUITE_END() // NEON
} // namespace validation
} // namespace test
} // namespace arm_compute