1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 //
9 // Auto-generated file. Do not edit!
10 // Specification: test/f32-gemm-minmax.yaml
11 // Generator: tools/generate-gemm-test.py
12
13
14 #include <gtest/gtest.h>
15
16 #include <xnnpack/allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/isa-checks.h>
19 #include <xnnpack/microparams-init.h>
20
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/ppmm.h>
24 #include "gemm-microkernel-tester.h"
25
26
27 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
40
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and cn_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cn_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
54
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and a_stride set to 7.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .a_stride(7);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
68
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] (outer) and m over [1, 4] (inner) at k=4,
  // with iterations(1) for each combination.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(4)
        .iterations(1);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
86
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps m over [1, 4] with n=8 and k=4, using iterations(1).
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
102
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] with m=4 and k=4, using iterations(1).
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(n_size).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
118
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
131
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8 and a_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
145
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] (outer) and m over [1, 4] (inner) at k=8,
  // with iterations(1) for each combination.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(8)
        .iterations(1);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
163
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7] with m=4 and n=8.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
178
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7] with m=4, n=8 and a_stride set to 11.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(11);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
194
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7], then n over [1, 8], then m over [1, 4],
  // with iterations(1) for each combination.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
214
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15] with m=4 and n=8.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
229
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15] with m=4, n=8 and a_stride set to 19.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(19);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
245
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15], then n over [1, 8], then m over [1, 4],
  // with iterations(1) for each combination.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
265
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4) with m=4 and n=8.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
280
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4) with m=4, n=8 and a_stride set to 43.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(43);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
      xnn_init_f32_minmax_scalar_params);
  }
}
296
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4), then n over [1, 8], then m over
  // [1, 4], with iterations(1) for each combination.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
316
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
333
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and cn_stride set to 11.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
351
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and a_stride set to 23.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(23);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
369
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15], then k over 1, 6, 11, 16, then m over [1, 4],
  // with iterations(1) for each combination.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
389
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
406
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and cn_stride set to 11.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
424
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and a_stride set to 23.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(23);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
442
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24, then k over 1, 6, 11, 16, then m over [1, 4],
  // with iterations(1) for each combination.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
462
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 1, 6, 11, 16, then n over [1, 8], then m over [1, 4],
  // with cm_stride set to 11 and iterations(1) for each combination.
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
483
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and qmin set to 128.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmin(128);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
497
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and qmax set to 128.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmax(128);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
511
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and cm_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cm_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55,
    xnn_init_f32_minmax_scalar_params);
}
525 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
526
527
528 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
541
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and cn_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cn_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
555
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and a_stride set to 7.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .a_stride(7);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
569
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] (outer) and m over [1, 4] (inner) at k=4,
  // with iterations(1) for each combination.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(4)
        .iterations(1);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
587
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps m over [1, 4] with n=8 and k=4, using iterations(1).
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
603
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] with m=4 and k=4, using iterations(1).
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(n_size).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
619
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
632
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8 and a_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
646
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] (outer) and m over [1, 4] (inner) at k=8,
  // with iterations(1) for each combination.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(8)
        .iterations(1);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
664
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7] with m=4 and n=8.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
679
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7] with m=4, n=8 and a_stride set to 11.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(11);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
695
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [1, 7], then n over [1, 8], then m over [1, 4],
  // with iterations(1) for each combination.
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
715
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15] with m=4 and n=8.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
730
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15] with m=4, n=8 and a_stride set to 19.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(19);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
746
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over [9, 15], then n over [1, 8], then m over [1, 4],
  // with iterations(1) for each combination.
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
766
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4) with m=4 and n=8.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
781
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4) with m=4, n=8 and a_stride set to 43.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(43);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
      xnn_init_f32_minmax_scalar_params);
  }
}
797
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 12, 16, ..., 40 (step 4), then n over [1, 8], then m over
  // [1, 4], with iterations(1) for each combination.
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
817
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
834
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and cn_stride set to 11.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
852
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15] (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and a_stride set to 23.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(23);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
870
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [9, 15], then k over 1, 6, 11, 16, then m over [1, 4],
  // with iterations(1) for each combination.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
890
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
907
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and cn_stride set to 11.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
925
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24 (outer) and k over 1, 6, 11, 16 (inner) with m=4
  // and a_stride set to 23.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(23);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
943
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over 16, 24, then k over 1, 6, 11, 16, then m over [1, 4],
  // with iterations(1) for each combination.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
963
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps k over 1, 6, 11, 16, then n over [1, 8], then m over [1, 4],
  // with cm_stride set to 11 and iterations(1) for each combination.
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1);
        tester.Test(
          xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
984
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and qmin set to 128.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmin(128);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
998
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and qmax set to 128.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmax(128);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
1012
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and cm_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cm_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75,
    xnn_init_f32_minmax_scalar_params);
}
1026 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1027
1028
1029 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
    xnn_init_f32_minmax_scalar_params);
}
1042
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and cn_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cn_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
    xnn_init_f32_minmax_scalar_params);
}
1056
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=4 and a_stride set to 7.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .a_stride(7);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
    xnn_init_f32_minmax_scalar_params);
}
1070
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] (outer) and m over [1, 4] (inner) at k=4,
  // with iterations(1) for each combination.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      auto tester = GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(4)
        .iterations(1);
      tester.Test(
        xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
        xnn_init_f32_minmax_scalar_params);
    }
  }
}
1088
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps m over [1, 4] with n=8 and k=4, using iterations(1).
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
      xnn_init_f32_minmax_scalar_params);
  }
}
1104
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  // Sweeps n over [1, 8] with m=4 and k=4, using iterations(1).
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    auto tester = GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(n_size).k(4)
      .iterations(1);
    tester.Test(
      xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
      xnn_init_f32_minmax_scalar_params);
  }
}
1120
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
    xnn_init_f32_minmax_scalar_params);
}
1133
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Single run with m=4, n=8, k=8 and a_stride set to 11.
  auto tester = GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11);
  tester.Test(
    xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
    xnn_init_f32_minmax_scalar_params);
}
1147
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // Every (n, m) pair with n in [1, 8] and m in [1, 4] at k=8; one iteration each.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1165
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweep k over [1, 7] with m=4 and n=8.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1180
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweep k over [1, 7] with m=4, n=8 and a_stride set to 11.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1196
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // k in [1, 7] crossed with n in [1, 8] and m in [1, 4]; one iteration each.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1216
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // Sweep k over [9, 15] with m=4 and n=8.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1231
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // Sweep k over [9, 15] with m=4, n=8 and a_stride set to 19.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1247
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // k in [9, 15] crossed with n in [1, 8] and m in [1, 4]; one iteration each.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1267
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  // k takes the multiples of 4 from 12 through 40; m=4, n=8.
  for (size_t cur_k = 12; cur_k <= 40; cur_k += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1282
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // k takes the multiples of 4 from 12 through 40; m=4, n=8, a_stride set to 43.
  for (size_t cur_k = 12; cur_k <= 40; cur_k += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(cur_k)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1298
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // k = 12, 16, ..., 40 crossed with n in [1, 8] and m in [1, 4]; one iteration each.
  for (size_t cur_k = 12; cur_k <= 40; cur_k += 4) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1318
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  // n in [9, 15] crossed with k = 1, 6, 11, 16; m=4.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1335
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // n in [9, 15] crossed with k = 1, 6, 11, 16; m=4, cn_stride set to 11.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1353
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // n in [9, 15] crossed with k = 1, 6, 11, 16; m=4, a_stride set to 23.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1371
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // n in [9, 15], k = 1, 6, 11, 16, m in [1, 4]; one iteration each.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1391
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  // n = 16, 24 crossed with k = 1, 6, 11, 16; m=4.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1408
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  // n = 16, 24 crossed with k = 1, 6, 11, 16; m=4, cn_stride set to 11.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1426
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  // n = 16, 24 crossed with k = 1, 6, 11, 16; m=4, a_stride set to 23.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cur_n).k(cur_k)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1444
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // n = 16, 24, k = 1, 6, 11, 16, m in [1, 4]; one iteration each.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1464
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  // k = 1, 6, 11, 16, n in [1, 8], m in [1, 4], cm_stride set to 11; one iteration each.
  for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1485
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, qmin) {
  TEST_REQUIRES_ARM_NEON;
  // Single configuration: m=4, n=8, k=4 with qmin set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1499
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, qmax) {
  TEST_REQUIRES_ARM_NEON;
  // Single configuration: m=4, n=8, k=4 with qmax set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1513
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_PRFM_CORTEX_A53, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  // Single configuration: m=4, n=8, k=4 with cm_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1527 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
1528
1529
1530 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1543
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8 with cn_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1557
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8 with a_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1571
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [1, 8] at k=8; the m loop covers only m=1. One iteration each.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1589
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The m loop covers only m=1; n=8, k=8, one iteration.
  for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(cur_m).n(8).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1605
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep n over [1, 8] with m=1 and k=8; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(cur_n).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1621
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=16.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(16)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1634
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=16 with a_stride set to 19.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(16)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
1648
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [1, 8] at k=16; the m loop covers only m=1. One iteration each.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(16)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1666
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep k over [1, 15] with m=1 and n=8.
  for (size_t cur_k = 1; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1681
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep k over [1, 15] with m=1, n=8 and a_stride set to 19.
  for (size_t cur_k = 1; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1697
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k in [1, 15] crossed with n in [1, 8]; the m loop covers only m=1. One iteration each.
  for (size_t cur_k = 1; cur_k < 16; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1717
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep k over [17, 31] with m=1 and n=8.
  for (size_t cur_k = 17; cur_k < 32; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1732
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep k over [17, 31] with m=1, n=8 and a_stride set to 37.
  for (size_t cur_k = 17; cur_k < 32; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(37)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1748
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k in [17, 31] crossed with n in [1, 8]; the m loop covers only m=1. One iteration each.
  for (size_t cur_k = 17; cur_k < 32; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1768
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k takes the multiples of 8 from 24 through 80; m=1, n=8.
  for (size_t cur_k = 24; cur_k <= 80; cur_k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1783
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k takes the multiples of 8 from 24 through 80; m=1, n=8, a_stride set to 83.
  for (size_t cur_k = 24; cur_k <= 80; cur_k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(83)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
1799
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 24, 32, ..., 80 crossed with n in [1, 8]; the m loop covers only m=1. One iteration each.
  for (size_t cur_k = 24; cur_k <= 80; cur_k += 8) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1819
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [9, 15] crossed with k = 1, 10, 19, 28, 37; m=1.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1836
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [9, 15] crossed with k = 1, 10, 19, 28, 37; m=1, cn_stride set to 11.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1854
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [9, 15] crossed with k = 1, 10, 19, 28, 37; m=1, a_stride set to 43.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1872
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [9, 15], k = 1, 10, 19, 28, 37; the m loop covers only m=1. One iteration each.
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1892
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 16, 24 crossed with k = 1, 10, 19, 28, 37; m=1.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1909
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 16, 24 crossed with k = 1, 10, 19, 28, 37; m=1, cn_stride set to 11.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1927
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 16, 24 crossed with k = 1, 10, 19, 28, 37; m=1, a_stride set to 43.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(cur_n).k(cur_k)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
1945
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 16, 24, k = 1, 10, 19, 28, 37; the m loop covers only m=1. One iteration each.
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1965
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 1, 10, 19, 28, 37, n in [1, 8], m loop covers only m=1; cm_stride set to 11, one iteration each.
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
1986
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8 with qmin set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2000
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8 with qmax set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2014
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=8 with cm_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2028 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2029
2030
2031 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=2.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2044
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=2 with cn_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2058
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Single configuration: m=1, n=8, k=2 with a_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2072
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [1, 8] at k=2; the m loop covers only m=1. One iteration each.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(2)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2090
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The m loop covers only m=1; n=8, k=2, one iteration.
  for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(cur_m).n(8).k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2106
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // Sweep n over [1, 8] with m=1 and k=2; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(cur_n).k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2122
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The k loop covers only k=1; m=1, n=8.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2137
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The k loop covers only k=1; m=1, n=8, a_stride set to 5.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(5)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2153
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [1, 8]; the k and m loops each cover a single value (k=1, m=1). One iteration each.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2173
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The k loop covers only k=3; m=1, n=8.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2188
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // The k loop covers only k=3; m=1, n=8, a_stride set to 7.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2204
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in [1, 8]; the k and m loops each cover a single value (k=3, m=1). One iteration each.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 1; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2224
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k takes the even values from 4 through 20; m=1, n=8.
  for (size_t cur_k = 4; cur_k <= 20; cur_k += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2239
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k takes the even values from 4 through 20; m=1, n=8, a_stride set to 23.
  for (size_t cur_k = 4; cur_k <= 20; cur_k += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(cur_k)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
2255
// 1x8 LD64 kernel: k = 4, 6, ..., 20, n in [1, 8], m in [1, 1]; one iteration per combination.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 4; kk <= 20; kk += 2) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2275
// 1x8 LD64 kernel: m=1, n in [9, 16), k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2292
// 1x8 LD64 kernel: m=1, cn_stride=11, n in [9, 16), k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2310
// 1x8 LD64 kernel: m=1, a_stride=13, n in [9, 16), k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .a_stride(13)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2328
// 1x8 LD64 kernel: n in [9, 16), k = 1, 4, 7, 10, m in [1, 1]; one iteration per combination.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2348
// 1x8 LD64 kernel: m=1, n = 16, 24, k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2365
// 1x8 LD64 kernel: m=1, cn_stride=11, n = 16, 24, k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2383
// 1x8 LD64 kernel: m=1, a_stride=13, n = 16, 24, k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .a_stride(13)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2401
// 1x8 LD64 kernel: n = 16, 24, k = 1, 4, 7, 10, m in [1, 1]; one iteration per combination.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 10; kk += 3) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2421
// 1x8 LD64 kernel: cm_stride=11, k = 1, 4, 7, 10, n in [1, 8], m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk <= 10; kk += 3) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2442
// 1x8 LD64 kernel: single run with m=1, n=8, k=2 and qmin=128.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2456
// 1x8 LD64 kernel: single run with m=1, n=8, k=2 and qmax=128.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2470
// 1x8 LD64 kernel: single run with m=1, n=8, k=2 and cm_stride=11.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(2)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
}
2484 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2485
2486
2487 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2500
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8 and cn_stride=11.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2514
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8 and a_stride=11.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2528
// 1x8 PRFM Cortex-A53 kernel: k=8, n in [1, 8], m in [1, 1]; one iteration per combination.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 1; nn <= 8; ++nn) {
    for (uint32_t mm = 1; mm <= 1; ++mm) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mm).n(nn).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2546
// 1x8 PRFM Cortex-A53 kernel: n=8, k=8, m in [1, 1]; one iteration per value.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mm = 1; mm <= 1; ++mm) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(mm).n(8).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2562
// 1x8 PRFM Cortex-A53 kernel: m=1, k=8, n in [1, 8]; one iteration per value.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 1; nn <= 8; ++nn) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nn).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2578
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=16.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(16)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2591
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=16 and a_stride=19.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(16)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2605
// 1x8 PRFM Cortex-A53 kernel: k=16, n in [1, 8], m in [1, 1]; one iteration per combination.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 1; nn <= 8; ++nn) {
    for (uint32_t mm = 1; mm <= 1; ++mm) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mm).n(nn).k(16)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2623
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, with k swept over [1, 16).
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 16; ++kk) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2638
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, a_stride=19, with k swept over [1, 16).
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 16; ++kk) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2654
// 1x8 PRFM Cortex-A53 kernel: k in [1, 16), n in [1, 8], m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 16; ++kk) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2674
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, with k swept over [17, 32).
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 17; kk < 32; ++kk) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2689
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, a_stride=37, with k swept over [17, 32).
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 17; kk < 32; ++kk) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .a_stride(37)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2705
// 1x8 PRFM Cortex-A53 kernel: k in [17, 32), n in [1, 8], m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 17; kk < 32; ++kk) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2725
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, with k = 24, 32, ..., 80.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 24; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2740
// 1x8 PRFM Cortex-A53 kernel: m=1, n=8, a_stride=83, with k = 24, 32, ..., 80.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 24; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kk)
        .a_stride(83)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
              xnn_init_f32_minmax_scalar_params);
  }
}
2756
// 1x8 PRFM Cortex-A53 kernel: k = 24, 32, ..., 80, n in [1, 8], m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 24; kk <= 80; kk += 8) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2776
// 1x8 PRFM Cortex-A53 kernel: m=1, n in [9, 16), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2793
// 1x8 PRFM Cortex-A53 kernel: m=1, cn_stride=11, n in [9, 16), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2811
// 1x8 PRFM Cortex-A53 kernel: m=1, a_stride=43, n in [9, 16), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2829
// 1x8 PRFM Cortex-A53 kernel: n in [9, 16), k = 1, 10, ..., 37, m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2849
// 1x8 PRFM Cortex-A53 kernel: m=1, n = 16, 24, k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2866
// 1x8 PRFM Cortex-A53 kernel: m=1, cn_stride=11, n = 16, 24, k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2884
// 1x8 PRFM Cortex-A53 kernel: m=1, a_stride=43, n = 16, 24, k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nn).k(kk)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
2902
// 1x8 PRFM Cortex-A53 kernel: n = 16, 24, k = 1, 10, ..., 37, m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2922
// 1x8 PRFM Cortex-A53 kernel: cm_stride=11, k = 1, 10, ..., 37, n in [1, 8], m in [1, 1]; one iteration each.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk <= 40; kk += 9) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
2943
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8 and qmin=128.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2957
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8 and qmax=128.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2971
// 1x8 PRFM Cortex-A53 kernel: single run with m=1, n=8, k=8 and cm_stride=11.
TEST(F32_GEMM_MINMAX_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53,
            xnn_init_f32_minmax_scalar_params);
}
2985 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2986
2987
2988 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// 4x2 Cortex-A75 kernel: single run with m=4, n=2, k=8.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(8)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
}
3001
// 4x2 Cortex-A75 kernel: single run with m=4, n=2, k=8 and cn_stride=5.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(8)
      .cn_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
}
3015
// 4x2 Cortex-A75 kernel: single run with m=4, n=2, k=8 and a_stride=11.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(8)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
}
3029
// 4x2 Cortex-A75 kernel: k=8, n in [1, 2], m in [1, 4]; one iteration per combination.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 1; nn <= 2; ++nn) {
    for (uint32_t mm = 1; mm <= 4; ++mm) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(mm).n(nn).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3047
// 4x2 Cortex-A75 kernel: n=2, k=8, m in [1, 4]; one iteration per value.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mm = 1; mm <= 4; ++mm) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(mm).n(2).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3063
// 4x2 Cortex-A75 kernel: m=4, k=8, n in [1, 2]; one iteration per value.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 1; nn <= 2; ++nn) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(nn).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3079
// 4x2 Cortex-A75 kernel: m=4, n=2, with k swept over [1, 8).
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3094
// 4x2 Cortex-A75 kernel: m=4, n=2, a_stride=11, with k swept over [1, 8).
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3110
// 4x2 Cortex-A75 kernel: k in [1, 8), n in [1, 2], m in [1, 4]; one iteration per combination.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 1; kk < 8; ++kk) {
    for (uint32_t nn = 1; nn <= 2; ++nn) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3130
// 4x2 Cortex-A75 kernel: m=4, n=2, with k swept over [9, 16).
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3145
// 4x2 Cortex-A75 kernel: m=4, n=2, a_stride=19, with k swept over [9, 16).
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3161
// 4x2 Cortex-A75 kernel: k in [9, 16), n in [1, 2], m in [1, 4]; one iteration per combination.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 9; kk < 16; ++kk) {
    for (uint32_t nn = 1; nn <= 2; ++nn) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3181
// 4x2 Cortex-A75 kernel: m=4, n=2, with k = 16, 24, ..., 80.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3196
// 4x2 Cortex-A75 kernel: m=4, n=2, a_stride=83, with k = 16, 24, ..., 80.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(kk)
        .a_stride(83)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
  }
}
3212
// 4x2 Cortex-A75 kernel: k = 16, 24, ..., 80, n in [1, 2], m in [1, 4]; one iteration each.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    for (uint32_t nn = 1; nn <= 2; ++nn) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3232
// 4x2 Cortex-A75 kernel: m=4, n in [3, 4), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 3; nn < 4; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3249
// 4x2 Cortex-A75 kernel: m=4, cn_stride=5, n in [3, 4), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 3; nn < 4; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(nn).k(kk)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3267
// 4x2 Cortex-A75 kernel: m=4, a_stride=43, n in [3, 4), k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 3; nn < 4; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(nn).k(kk)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3285
// 4x2 Cortex-A75 kernel: n in [3, 4), k = 1, 10, ..., 37, m in [1, 4]; one iteration each.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 3; nn < 4; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3305
// 4x2 Cortex-A75 kernel: m=4, n = 4, 6, k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 4; nn <= 6; nn += 2) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(nn).k(kk)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3322
// 4x2 Cortex-A75 kernel: m=4, cn_stride=5, n = 4, 6, k = 1, 10, ..., 37.
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nn = 4; nn <= 6; nn += 2) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(nn).k(kk)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
3340
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3358
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, n_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in {4, 6}, k in {1, 10, 19, 28, 37}, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3378
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k in {1, 10, 19, 28, 37}, n from 1 to 2, m from 1 to 4, with cm_stride set to 5.
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.cm_stride(5).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3399
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with qmin set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3413
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with qmax set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3427
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with cm_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.cm_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3441 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3442
3443
3444 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3457
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2, with cn_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.cn_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3471
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2, with a_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.a_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3485
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 2, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(cur_m).n(cur_n).k(2);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3503
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 2, k = 2, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(cur_m).n(2).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3519
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, k = 2, n from 1 to 2; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(cur_n).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3535
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2; the loop covers k = 1 only.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3550
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2; the loop covers k = 1 only, with a_stride set to 5.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(5);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3566
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 1 only, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 1; cur_k < 2; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3586
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2; the loop covers k = 3 only.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3601
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2; the loop covers k = 3 only, with a_stride set to 7.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3617
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 3 only, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 3; cur_k < 4; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3637
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k stepping from 4 to 20 in increments of 2.
  for (size_t cur_k = 4; cur_k <= 20; cur_k += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3652
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k stepping from 4 to 20 in increments of 2, with a_stride set to 23.
  for (size_t cur_k = 4; cur_k <= 20; cur_k += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(23);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
3668
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k from 4 to 20 in steps of 2, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 4; cur_k <= 20; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3688
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 4, 7, 10}.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3705
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 4, 7, 10}, with cn_stride set to 5.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3723
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 4, 7, 10}, with a_stride set to 13.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3741
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 3, k in {1, 4, 7, 10}, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3761
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 4, 7, 10}.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3778
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 4, 7, 10}, with cn_stride set to 5.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3796
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 4, 7, 10}, with a_stride set to 13.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3814
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, n_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in {4, 6}, k in {1, 4, 7, 10}, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3834
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k in {1, 4, 7, 10}, n from 1 to 2, m from 1 to 4, with cm_stride set to 5.
  for (size_t cur_k = 1; cur_k <= 10; cur_k += 3) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.cm_stride(5).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
3855
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2, with qmin set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3869
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2, with qmax set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3883
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 2, with cm_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(2);
  tester.cm_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
3897 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3898
3899
3900 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3913
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with cn_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.cn_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3927
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with a_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.a_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
3941
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 8, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(cur_m).n(cur_n).k(8);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
3959
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 2, k = 8, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(cur_m).n(2).k(8);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
3975
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, k = 8, n from 1 to 2; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(cur_n).k(8);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
3991
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k from 1 to 7.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4006
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k from 1 to 7, with a_stride set to 11.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4022
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k from 1 to 7, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4042
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k from 9 to 15.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4057
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k from 9 to 15, with a_stride set to 19.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(19);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4073
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k from 9 to 15, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4093
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k stepping from 16 to 80 in increments of 8.
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4108
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k stepping from 16 to 80 in increments of 8, with a_stride set to 83.
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(cur_k);
    tester.a_stride(83);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4124
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k from 16 to 80 in steps of 8, n from 1 to 2, m from 1 to 4; one iteration per configuration.
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4144
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 10, 19, 28, 37}.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4161
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 10, 19, 28, 37}, with cn_stride set to 5.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4179
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 3, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4197
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 3, k in {1, 10, 19, 28, 37}, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4217
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 10, 19, 28, 37}.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4234
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 10, 19, 28, 37}, with cn_stride set to 5.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4252
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n in {4, 6}, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4270
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n in {4, 6}, k in {1, 10, 19, 28, 37}, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 4; cur_n <= 6; cur_n += 2) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4290
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k in {1, 10, 19, 28, 37}, n from 1 to 2, m from 1 to 4, with cm_stride set to 5.
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.cm_stride(5).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4311
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with qmin set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4325
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with qmax set to 128.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4339
TEST(F32_GEMM_MINMAX_4X2__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 2, k = 8, with cm_stride set to 5.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(8);
  tester.cm_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4353 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4354
4355
4356 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 8, k = 4.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4369
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 8, k = 4, with cn_stride set to 11.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4383
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // m = 4, n = 8, k = 4, with a_stride set to 7.
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4397
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // k = 4, n from 1 to 8, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(cur_m).n(cur_n).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4415
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  // n = 8, k = 4, m from 1 to 4; one iteration per configuration.
  for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(cur_m).n(8).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4431
// Sweeps n over [1, 8] with m=4 and k=4, with iterations(1) per value.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(nc).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4447
// Single run with m=4, n=8, k=8 on the mr=4, nr=8, kr=1, sr=1 configuration.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4460
// Single run with m=4, n=8, k=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4474
// Sweeps n over [1, 8] and m over [1, 4] at k=8, with iterations(1) per combination.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4492
// Runs m=4, n=8 for every k in [1, 7].
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4507
// Runs m=4, n=8 for every k in [1, 7] with a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4523
// For every k in [1, 7], sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4543
// Runs m=4, n=8 for every k in [9, 15].
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4558
// Runs m=4, n=8 for every k in [9, 15] with a_stride set to 19.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4574
// For every k in [9, 15], sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4594
// Runs m=4, n=8 for k = 12, 16, ..., 40 (step 4).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 12; kc <= 40; kc += 4) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4609
// Runs m=4, n=8 for k = 12, 16, ..., 40 (step 4) with a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 12; kc <= 40; kc += 4) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
4625
// For k = 12, 16, ..., 40 (step 4), sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 12; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4645
// For every n in [9, 15], runs m=4 with k = 1, 6, 11, 16 (step 5 up to 20).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4662
// For every n in [9, 15], runs m=4 with k = 1, 6, 11, 16 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4680
// For every n in [9, 15], runs m=4 with k = 1, 6, 11, 16 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4698
// For every n in [9, 15] and k = 1, 6, 11, 16, sweeps m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4718
// For n = 16 and 24, runs m=4 with k = 1, 6, 11, 16 (step 5 up to 20).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4735
// For n = 16 and 24, runs m=4 with k = 1, 6, 11, 16 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4753
// For n = 16 and 24, runs m=4 with k = 1, 6, 11, 16 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4771
// For n = 16 and 24 and k = 1, 6, 11, 16, sweeps m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4791
// For k = 1, 6, 11, 16, sweeps n over [1, 8] and m over [1, 4] with cm_stride set to 11
// and iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
4812
// Single run with m=4, n=8, k=4 and qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4826
// Single run with m=4, n=8, k=4 and qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4840
// Single run with m=4, n=8, k=4 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
4854 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4855
4856
4857 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single run with m=4, n=8, k=8 on the mr=4, nr=8, kr=1, sr=1 configuration.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4870
// Single run with m=4, n=8, k=8 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4884
// Single run with m=4, n=8, k=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4898
// Sweeps n over [1, 8] and m over [1, 4] at k=8, with iterations(1) per combination.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4916
// Sweeps m over [1, 4] with n=8 and k=8, with iterations(1) per value.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mc = 1; mc <= 4; ++mc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(mc).n(8).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4932
// Sweeps n over [1, 8] with m=4 and k=8, with iterations(1) per value.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(nc).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
4948
// Single run with m=4, n=8, k=16 on the mr=4, nr=8, kr=1, sr=1 configuration.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(16)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4961
// Single run with m=4, n=8, k=16 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(16)
    .a_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
4975
// Sweeps n over [1, 8] and m over [1, 4] at k=16, with iterations(1) per combination.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(16)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
4993
// Runs m=4, n=8 for every k in [1, 15].
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5008
// Runs m=4, n=8 for every k in [1, 15] with a_stride set to 19.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5024
// For every k in [1, 15], sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5044
// Runs m=4, n=8 for every k in [17, 31].
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5059
// Runs m=4, n=8 for every k in [17, 31] with a_stride set to 37.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(37)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5075
// For every k in [17, 31], sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5095
// Runs m=4, n=8 for k = 24, 32, ..., 80 (step 8).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5110
// Runs m=4, n=8 for k = 24, 32, ..., 80 (step 8) with a_stride set to 83.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .a_stride(83)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5126
// For k = 24, 32, ..., 80 (step 8), sweeps n over [1, 8] and m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5146
// For every n in [9, 15], runs m=4 with k = 1, 10, 19, 28, 37 (step 9 up to 40).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5163
// For every n in [9, 15], runs m=4 with k = 1, 10, 19, 28, 37 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5181
// For every n in [9, 15], runs m=4 with k = 1, 10, 19, 28, 37 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5199
// For every n in [9, 15] and k = 1, 10, 19, 28, 37, sweeps m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5219
// For n = 16 and 24, runs m=4 with k = 1, 10, 19, 28, 37 (step 9 up to 40).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5236
// For n = 16 and 24, runs m=4 with k = 1, 10, 19, 28, 37 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5254
// For n = 16 and 24, runs m=4 with k = 1, 10, 19, 28, 37 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(nc).k(kc)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5272
// For n = 16 and 24 and k = 1, 10, 19, 28, 37, sweeps m over [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5292
// For k = 1, 10, 19, 28, 37, sweeps n over [1, 8] and m over [1, 4] with cm_stride set to 11
// and iterations(1).
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5313
// Single run with m=4, n=8, k=8 and qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5327
// Single run with m=4, n=8, k=8 and qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5341
// Single run with m=4, n=8, k=8 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5355 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5356
5357
5358 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single run with m=4, n=8, k=4 on the mr=4, nr=8, kr=1, sr=1 configuration.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5371
// Single run with m=4, n=8, k=4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5385
// Single run with m=4, n=8, k=4 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5399
// Sweeps n over [1, 8] and m over [1, 4] at k=4, with iterations(1) per combination.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5417
// Sweeps m over [1, 4] with n=8 and k=4, with iterations(1) per value.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mc = 1; mc <= 4; ++mc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(mc).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5433
// Sweeps n over [1, 8] with m=4 and k=4, with iterations(1) per value.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(nc).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5449
// Single run with m=4, n=8, k=8 on the mr=4, nr=8, kr=1, sr=1 configuration.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5462
// Single run with m=4, n=8, k=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5476
// Sweeps n over [1, 8] and m over [1, 4] at k=8, with iterations(1) per combination.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5494
// Runs m=4, n=8 for every k in [1, 7].
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5509
// Sweeps k over 1..7 with m=4, n=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5525
// Sweeps k over 1..7, n over 1..8 and m over 1..4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5545
// Sweeps k over 9..15 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5560
// Sweeps k over 9..15 with m=4, n=8 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5576
// Sweeps k over 9..15, n over 1..8 and m over 1..4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5596
// Sweeps k over 12..40 in steps of 4 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5611
// Sweeps k over 12..40 in steps of 4 with m=4, n=8 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
  }
}
5627
// Sweeps k over 12..40 (step 4), n over 1..8 and m over 1..4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5647
// Sweeps n over 9..15 and k over 1..20 (step 5) with m=4.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5664
// Sweeps n over 9..15 and k over 1..20 (step 5) with m=4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5682
// Sweeps n over 9..15 and k over 1..20 (step 5) with m=4 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5700
// Sweeps n over 9..15, k over 1..20 (step 5) and m over 1..4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5720
// Sweeps n over {16, 24} and k over 1..20 (step 5) with m=4.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5737
// Sweeps n over {16, 24} and k over 1..20 (step 5) with m=4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5755
// Sweeps n over {16, 24} and k over 1..20 (step 5) with m=4 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5773
// Sweeps n over {16, 24}, k over 1..20 (step 5) and m over 1..4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5793
// Sweeps k over 1..20 (step 5), n over 1..8 and m over 1..4 with cm_stride set to 11;
// one tester iteration per shape.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
5814
// Single shape: m=4, n=8, k=4, with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5828
// Single shape: m=4, n=8, k=4, with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5842
// Single shape: m=4, n=8, k=4, with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A53, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53, xnn_init_f32_minmax_scalar_params);
}
5856 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5857
5858
5859 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single shape: m=5, n=8, k=8.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5872
// Single shape: m=5, n=8, k=8, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5886
// Single shape: m=5, n=8, k=8, with a_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5900
// Sweeps n over 1..8 (outer) and m over 1..5 (inner) at k=8; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5918
// Sweeps m over 1..5 with n=8 and k=8; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5934
// Sweeps n over 1..8 with m=5 and k=8; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(n_size).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
5950
// Single shape: m=5, n=8, k=16.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(16)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5963
// Single shape: m=5, n=8, k=16, with a_stride set to 19.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(16)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
5977
// Sweeps n over 1..8 (outer) and m over 1..5 (inner) at k=16; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(16)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
5995
// Sweeps k over 1..15 with m=5 and n=8.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6010
// Sweeps k over 1..15 with m=5, n=8 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .a_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6026
// Sweeps k over 1..15, n over 1..8 and m over 1..5; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6046
// Sweeps k over 17..31 with m=5 and n=8.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 17; k_size < 32; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6061
// Sweeps k over 17..31 with m=5, n=8 and a_stride set to 37.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 17; k_size < 32; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .a_stride(37)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6077
// Sweeps k over 17..31, n over 1..8 and m over 1..5; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 17; k_size < 32; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6097
// Sweeps k over 24..80 in steps of 8 with m=5 and n=8.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 24; k_size <= 80; k_size += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6112
// Sweeps k over 24..80 in steps of 8 with m=5, n=8 and a_stride set to 83.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 24; k_size <= 80; k_size += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_size)
        .a_stride(83)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
6128
// Sweeps k over 24..80 (step 8), n over 1..8 and m over 1..5; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 24; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6148
// Sweeps n over 9..15 and k over 1..40 (step 9) with m=5.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6165
// Sweeps n over 9..15 and k over 1..40 (step 9) with m=5 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6183
// Sweeps n over 9..15 and k over 1..40 (step 9) with m=5 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6201
// Sweeps n over 9..15, k over 1..40 (step 9) and m over 1..5; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6221
// Sweeps n over {16, 24} and k over 1..40 (step 9) with m=5.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6238
// Sweeps n over {16, 24} and k over 1..40 (step 9) with m=5 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6256
// Sweeps n over {16, 24} and k over 1..40 (step 9) with m=5 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_size).k(k_size)
          .a_stride(43)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6274
// Sweeps n over {16, 24}, k over 1..40 (step 9) and m over 1..5; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6294
// Sweeps k over 1..40 (step 9), n over 1..8 and m over 1..5 with cm_stride set to 11;
// one tester iteration per shape.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(5).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6315
// Single shape: m=5, n=8, k=8, with qmin set to 128.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
6329
// Single shape: m=5, n=8, k=8, with qmax set to 128.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
6343
// Single shape: m=5, n=8, k=8, with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(8)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
6357 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6358
6359
6360 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single shape: m=6, n=8, k=4.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
6373
// Single shape: m=6, n=8, k=4, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
6387
// Single shape: m=6, n=8, k=4, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
6401
// Sweeps n over 1..8 (outer) and m over 1..6 (inner) at k=4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6419
// Sweeps m over 1..6 with n=8 and k=4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
6435
// Sweeps n over 1..8 with m=6 and k=4; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
6451
// Single shape: m=6, n=8, k=8.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(8)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
6464
// Single shape: m=6, n=8, k=8, with a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(8)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
6478
// Sweeps n over 1..8 (outer) and m over 1..6 (inner) at k=8; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
6496
// Sweeps k over 1..7 with m=6 and n=8.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
6511
// Sweeps k over 1..7 with m=6, n=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
6527
// Sweeps k over 1..7, n over 1..8 and m over 1..6; one tester iteration per shape.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6547
// Sweeps k over 9..15 with m=6 and n=8.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
6562
// Runs the cortex_a55 kernel with m=6, n=8 and a_stride=19 for every k from 9 up to (not including) 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
            xnn_init_f32_minmax_scalar_params);
  }
}
6578
// Sweeps k in [9, 16), n in [1, 8] and m in [1, 6] on the cortex_a55 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6598
// Runs the cortex_a55 kernel with m=6, n=8 for k = 12, 16, ..., 40 (step 4).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 12; k_dim <= 40; k_dim += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
            xnn_init_f32_minmax_scalar_params);
  }
}
6613
// Runs the cortex_a55 kernel with m=6, n=8 and a_stride=43 for k = 12, 16, ..., 40 (step 4).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 12; k_dim <= 40; k_dim += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
            xnn_init_f32_minmax_scalar_params);
  }
}
6629
// Sweeps k = 12, 16, ..., 40 with n in [1, 8] and m in [1, 6] on the cortex_a55 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 12; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6649
// Runs the cortex_a55 kernel with m=6 for n in [9, 16) and k = 1, 6, 11, 16 (step 5, up to 20).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6666
// Runs the cortex_a55 kernel with m=6 and cn_stride=11 for n in [9, 16) and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6684
// Runs the cortex_a55 kernel with m=6 and a_stride=23 for n in [9, 16) and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6702
// Sweeps n in [9, 16), k = 1, 6, 11, 16 and m in [1, 6] on the cortex_a55 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6722
// Runs the cortex_a55 kernel with m=6 for n = 16 and 24, and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6739
// Runs the cortex_a55 kernel with m=6 and cn_stride=11 for n = 16 and 24, and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6757
// Runs the cortex_a55 kernel with m=6 and a_stride=23 for n = 16 and 24, and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6775
// Sweeps n = 16 and 24, k = 1, 6, 11, 16 and m in [1, 6] on the cortex_a55 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6795
// Sweeps k = 1, 6, 11, 16 with n in [1, 8] and m in [1, 6] on the cortex_a55 kernel,
// using cm_stride=11 and iterations(1) for every combination.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
6816
// Single cortex_a55 run with m=6, n=8, k=4 and qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
6830
// Single cortex_a55 run with m=6, n=8, k=4 and qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
6844
// Single cortex_a55 run with m=6, n=8, k=4 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
6858 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6859
6860
6861 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single cortex_a75 run with m=6, n=8, k=8.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
6874
// Single cortex_a75 run with m=6, n=8, k=8 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
6888
// Single cortex_a75 run with m=6, n=8, k=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .a_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
6902
// Sweeps n in [1, 8] and m in [1, 6] at k=8 on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6920
// Sweeps m in [1, 6] with n=8, k=8 on the cortex_a75 kernel, using iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(m_dim).n(8).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
6936
// Sweeps n in [1, 8] with m=6, k=8 on the cortex_a75 kernel, using iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
6952
// Single cortex_a75 run with m=6, n=8, k=16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
6965
// Single cortex_a75 run with m=6, n=8, k=16 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .a_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
6979
// Sweeps n in [1, 8] and m in [1, 6] at k=16 on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(16)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
6997
// Runs the cortex_a75 kernel with m=6, n=8 for every k from 1 up to (not including) 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7012
// Runs the cortex_a75 kernel with m=6, n=8 and a_stride=19 for every k from 1 up to (not including) 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7028
// Sweeps k in [1, 16), n in [1, 8] and m in [1, 6] on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7048
// Runs the cortex_a75 kernel with m=6, n=8 for every k from 17 up to (not including) 32.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7063
// Runs the cortex_a75 kernel with m=6, n=8 and a_stride=37 for every k from 17 up to (not including) 32.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(37)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7079
// Sweeps k in [17, 32), n in [1, 8] and m in [1, 6] on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7099
// Runs the cortex_a75 kernel with m=6, n=8 for k = 24, 32, ..., 80 (step 8).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7114
// Runs the cortex_a75 kernel with m=6, n=8 and a_stride=83 for k = 24, 32, ..., 80 (step 8).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(83)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
7130
// Sweeps k = 24, 32, ..., 80 with n in [1, 8] and m in [1, 6] on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7150
// Runs the cortex_a75 kernel with m=6 for n in [9, 16) and k = 1, 10, 19, 28, 37 (step 9, up to 40).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7167
// Runs the cortex_a75 kernel with m=6 and cn_stride=11 for n in [9, 16) and k = 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7185
// Runs the cortex_a75 kernel with m=6 and a_stride=43 for n in [9, 16) and k = 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7203
// Sweeps n in [9, 16), k = 1, 10, 19, 28, 37 and m in [1, 6] on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7223
// Runs the cortex_a75 kernel with m=6 for n = 16 and 24, and k = 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7240
// Runs the cortex_a75 kernel with m=6 and cn_stride=11 for n = 16 and 24, and k = 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7258
// Runs the cortex_a75 kernel with m=6 and a_stride=43 for n = 16 and 24, and k = 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7276
// Sweeps n = 16 and 24, k = 1, 10, 19, 28, 37 and m in [1, 6] on the cortex_a75 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7296
// Sweeps k = 1, 10, 19, 28, 37 with n in [1, 8] and m in [1, 6] on the cortex_a75 kernel,
// using cm_stride=11 and iterations(1) for every combination.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7317
// Single cortex_a75 run with m=6, n=8, k=8 and qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
7331
// Single cortex_a75 run with m=6, n=8, k=8 and qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
7345
// Single cortex_a75 run with m=6, n=8, k=8 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
7359 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7360
7361
7362 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single ld64 run with m=6, n=8, k=2.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
          xnn_init_f32_minmax_scalar_params);
}
7375
// Single ld64 run with m=6, n=8, k=2 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
          xnn_init_f32_minmax_scalar_params);
}
7389
// Single ld64 run with m=6, n=8, k=2 and a_stride set to 5.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .a_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
          xnn_init_f32_minmax_scalar_params);
}
7403
// Sweeps n in [1, 8] and m in [1, 6] at k=2 on the ld64 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7421
// Sweeps m in [1, 6] with n=8, k=2 on the ld64 kernel, using iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(m_dim).n(8).k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7437
// Sweeps n in [1, 8] with m=6, k=2 on the ld64 kernel, using iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(n_dim).k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7453
// Runs the ld64 kernel with m=6, n=8 for k in [1, 2), i.e. only k=1.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7468
// Runs the ld64 kernel with m=6, n=8 and a_stride=5 for k in [1, 2), i.e. only k=1.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7484
// Sweeps k in [1, 2) (only k=1), n in [1, 8] and m in [1, 6] on the ld64 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7504
// Runs the ld64 kernel with m=6, n=8 for k in [3, 4), i.e. only k=3.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7519
// Runs the ld64 kernel with m=6, n=8 and a_stride=7 for k in [3, 4), i.e. only k=3.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7535
// Sweeps k in [3, 4) (only k=3), n in [1, 8] and m in [1, 6] on the ld64 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7555
// Runs the ld64 kernel with m=6, n=8 for k = 4, 6, ..., 20 (step 2).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7570
// Runs the ld64 kernel with m=6, n=8 and a_stride=23 for k = 4, 6, ..., 20 (step 2).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(23)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
            xnn_init_f32_minmax_scalar_params);
  }
}
7586
// Sweeps k = 4, 6, ..., 20 with n in [1, 8] and m in [1, 6] on the ld64 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7606
// Runs the ld64 kernel with m=6 for n in [9, 16) and k = 1, 4, 7, 10 (step 3).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7623
// Runs the ld64 kernel with m=6 and cn_stride=11 for n in [9, 16) and k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7641
// Runs the ld64 kernel with m=6 and a_stride=13 for n in [9, 16) and k = 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
7659
// Sweeps n in [9, 16), k = 1, 4, 7, 10 and m in [1, 6] on the ld64 kernel;
// every combination is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7679
// Full m = 6 tile; n is a multiple of 8 (16 and 24) and k takes 1, 4, 7, 10.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 10; kc += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
7696
// n in {16, 24}, k in 1, 4, 7, 10, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 10; kc += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
7714
// n in {16, 24}, k in 1, 4, 7, 10, with a_stride set to 13.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 10; kc += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
7732
// n in {16, 24}, k in 1, 4, 7, 10, and every m from 1 to 6; one iteration per configuration.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 10; kc += 3) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7752
// Every (m, n) with m in 1..6 and n in 1..8, for k in 1, 4, 7, 10, with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 10; kc += 3) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7773
// Single 6x8 tile at k = 2 with the tester's qmin set to 128.
// NOTE(review): presumably this tightens the lower output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
7787
// Single 6x8 tile at k = 2 with the tester's qmax set to 128.
// NOTE(review): presumably this tightens the upper output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
7801
// Single 6x8 tile at k = 2 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(2)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld64, xnn_init_f32_minmax_scalar_params);
}
7815 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7816
7817
7818 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single full 6x8 tile with k = 4.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
7831
// Single full 6x8 tile with k = 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
7845
// Single full 6x8 tile with k = 4 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
7859
// k = 4 for every (m, n) with n in 1..8 (outer loop) and m in 1..6 (inner loop); one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 6; ++mc) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
7877
// k = 4, n fixed at 8, m swept from 1 to 6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mc = 1; mc <= 6; ++mc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(mc).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7893
// k = 4, m fixed at 6, n swept from 1 to 8; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(nc).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7909
// Full 6x8 tile with k in 1..3.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 4; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7924
// Full 6x8 tile with k in 1..3 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 4; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7940
// k in 1..3 for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 4; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
7960
// Full 6x8 tile with k in 5..7.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7975
// Full 6x8 tile with k in 5..7 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
7991
// k in 5..7 for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8011
// Full 6x8 tile with k a multiple of 4, from 8 through 40.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
8026
// Full 6x8 tile with k a multiple of 4 (8..40) and a_stride set to 43.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
8042
// k a multiple of 4 (8..40) for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8062
// Full m = 6 tile; n sweeps 9..15 (outer loop) and k takes 1, 6, 11, 16 (inner loop).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8079
// n in 9..15, k in 1, 6, 11, 16, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8097
// n in 9..15, k in 1, 6, 11, 16, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8115
// n in 9..15, k in 1, 6, 11, 16, and every m from 1 to 6; one iteration per configuration.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8135
// Full m = 6 tile; n is a multiple of 8 (16 and 24) and k takes 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8152
// n in {16, 24}, k in 1, 6, 11, 16, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8170
// n in {16, 24}, k in 1, 6, 11, 16, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8188
// n in {16, 24}, k in 1, 6, 11, 16, and every m from 1 to 6; one iteration per configuration.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8208
// Every (m, n) with m in 1..6 and n in 1..8, for k in 1, 6, 11, 16, with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8229
// Single 6x8 tile at k = 4 with the tester's qmin set to 128.
// NOTE(review): presumably this tightens the lower output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
8243
// Single 6x8 tile at k = 4 with the tester's qmax set to 128.
// NOTE(review): presumably this tightens the upper output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
8257
// Single 6x8 tile at k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
8271 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8272
8273
8274 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
// Single full 6x8 tile with k = 8.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8287
// Single full 6x8 tile with k = 8 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8301
// Single full 6x8 tile with k = 8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .a_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8315
// k = 8 for every (m, n) with n in 1..8 (outer loop) and m in 1..6 (inner loop); one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 6; ++mc) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8333
// k = 8, n fixed at 8, m swept from 1 to 6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mc = 1; mc <= 6; ++mc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(mc).n(8).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8349
// k = 8, m fixed at 6, n swept from 1 to 8; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(nc).k(8)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8365
// Single full 6x8 tile with k = 16.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8378
// Single full 6x8 tile with k = 16 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .a_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8392
// k = 16 for every (m, n) with n in 1..8 (outer loop) and m in 1..6 (inner loop); one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 6; ++mc) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(mc).n(nc).k(16)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8410
// Full 6x8 tile with k in 1..15.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8425
// Full 6x8 tile with k in 1..15 and a_stride set to 19.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8441
// k in 1..15 for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8461
// Full 6x8 tile with k in 17..31.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8476
// Full 6x8 tile with k in 17..31 and a_stride set to 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(37)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8492
// k in 17..31 for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 17; kc < 32; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8512
// Full 6x8 tile with k a multiple of 8, from 24 through 80.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8527
// Full 6x8 tile with k a multiple of 8 (24..80) and a_stride set to 83.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kc)
      .a_stride(83)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
8543
// k a multiple of 8 (24..80) for every (m, n) with n in 1..8 and m in 1..6; one iteration each.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 24; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8563
// Full m = 6 tile; n sweeps 9..15 (outer loop) and k takes 1, 10, 19, 28, 37 (inner loop).
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8580
// n in 9..15, k in 1, 10, 19, 28, 37, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8598
// n in 9..15, k in 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8616
// n in 9..15, k in 1, 10, 19, 28, 37, and every m from 1 to 6; one iteration per configuration.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8636
// Full m = 6 tile; n is a multiple of 8 (16 and 24) and k takes 1, 10, 19, 28, 37.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8653
// n in {16, 24}, k in 1, 10, 19, 28, 37, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8671
// n in {16, 24}, k in 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nc).k(kc)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8689
// n in {16, 24}, k in 1, 10, 19, 28, 37, and every m from 1 to 6; one iteration per configuration.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8709
// Every (m, n) with m in 1..6 and n in 1..8, for k in 1, 10, 19, 28, 37, with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8730
// Single 6x8 tile at k = 8 with the tester's qmin set to 128.
// NOTE(review): presumably this tightens the lower output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8744
// Single 6x8 tile at k = 8 with the tester's qmax set to 128.
// NOTE(review): presumably this tightens the upper output clamp - confirm in GemmMicrokernelTester.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8758
// Runs the 6x8 kernel with m = 6, n = 8, k = 8 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(8);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
8772 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8773
8774
8775 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8 and k = 2.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
8788
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8, k = 2 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
8802
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8, k = 2 and a_stride set to 5.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.a_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
8816
// Sweeps n over 1..8 (m loop covers only m = 1) at k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(m).n(n).k(2);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
8834
// Loops m over its only value (1) with n = 8 and k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(m).n(8).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8850
// Sweeps n over 1..8 with m = 1 and k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(n).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8866
// Runs m = 1, n = 8 for every k below 2 (i.e. k = 1).
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8881
// Runs m = 1, n = 8 for every k below 2 (i.e. k = 1) with a_stride set to 5.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(5);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8897
// For k = 1, sweeps n over 1..8 (m loop covers only m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8917
// Runs m = 1, n = 8 for k in [3, 4), i.e. k = 3.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8932
// Runs m = 1, n = 8 for k in [3, 4), i.e. k = 3, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8948
// For k = 3, sweeps n over 1..8 (m loop covers only m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
8968
// Runs m = 1, n = 8 for every even k from 4 through 20.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8983
// Runs m = 1, n = 8 for every even k from 4 through 20 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(23);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
8999
// For every even k from 4 through 20, sweeps n over 1..8 (m loop covers only
// m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9019
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9036
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9054
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9072
// Sweeps n over 9..15 and k over {1, 4, 7, 10} (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9092
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9109
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9127
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9145
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9165
// Sweeps k over {1, 4, 7, 10} and n over 1..8 (m loop covers only m = 1) with
// cm_stride set to 11, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(11);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9186
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8, k = 2 and qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
9200
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8, k = 2 and qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
9214
// Runs the 1x8 dup_ld64 kernel with m = 1, n = 8, k = 2 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_DUP_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
9228 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9229
9230
9231 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8 and k = 2.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9244
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8, k = 2 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9258
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8, k = 2 and a_stride set to 5.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.a_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9272
// Sweeps n over 1..8 (m loop covers only m = 1) at k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(m).n(n).k(2);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9290
// Loops m over its only value (1) with n = 8 and k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(m).n(8).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9306
// Sweeps n over 1..8 with m = 1 and k = 2, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(n).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9322
// Runs m = 1, n = 8 for every k below 2 (i.e. k = 1).
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9337
// Runs m = 1, n = 8 for every k below 2 (i.e. k = 1) with a_stride set to 5.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(5);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9353
// For k = 1, sweeps n over 1..8 (m loop covers only m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9373
// Runs m = 1, n = 8 for k in [3, 4), i.e. k = 3.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9388
// Runs m = 1, n = 8 for k in [3, 4), i.e. k = 3, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9404
// For k = 3, sweeps n over 1..8 (m loop covers only m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9424
// Runs m = 1, n = 8 for every even k from 4 through 20.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9439
// Runs m = 1, n = 8 for every even k from 4 through 20 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k);
    tester.a_stride(23);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
9455
// For every even k from 4 through 20, sweeps n over 1..8 (m loop covers only
// m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9475
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9492
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9510
// Sweeps n over 9..15 and k over {1, 4, 7, 10} with m = 1 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9528
// Sweeps n over 9..15 and k over {1, 4, 7, 10} (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9548
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9565
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9583
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 1 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9601
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9621
// Sweeps k over {1, 4, 7, 10} and n over 1..8 (m loop covers only m = 1) with
// cm_stride set to 11, one iteration each.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(11);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9642
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8, k = 2 and qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9656
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8, k = 2 and qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9670
// Runs the 1x8 lane_ld64 kernel with m = 1, n = 8, k = 2 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__NEON_LANE_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(2);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
9684 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
9685
9686
9687 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 1x8s4 kernel (sr = 4) with m = 1, n = 8 and k = 4.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(4);
  tester.m(1).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
9700
// Runs the 1x8s4 kernel (sr = 4) with m = 1, n = 8, k = 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(4);
  tester.m(1).n(8).k(4);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
9714
// Runs the 1x8s4 kernel (sr = 4) with m = 1, n = 8, k = 4 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(8).kr(1).sr(4);
  tester.m(1).n(8).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
9728
// Sweeps n over 1..8 (m loop covers only m = 1) at k = 4, one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(4);
      tester.m(m).n(n).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9746
// Loops m over its only value (1) with n = 8 and k = 4, one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(m).n(8).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9762
// Sweeps n over 1..8 with m = 1 and k = 4, one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(n).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9778
// Runs m = 1, n = 8 for every k from 1 through 3.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9793
// Runs m = 1, n = 8 for every k from 1 through 3 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9809
// For every k from 1 through 3, sweeps n over 1..8 (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9829
// Runs m = 1, n = 8 for every k from 5 through 7.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9844
// Runs m = 1, n = 8 for every k from 5 through 7 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9860
// For every k from 5 through 7, sweeps n over 1..8 (m loop covers only m = 1),
// one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9880
// Runs m = 1, n = 8 for every multiple of 4 from k = 8 through k = 40.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9895
// Runs m = 1, n = 8 for every multiple of 4 from k = 8 through k = 40 with
// a_stride set to 43.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(4);
    tester.m(1).n(8).k(k);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
9911
// For every multiple of 4 from k = 8 through k = 40, sweeps n over 1..8
// (m loop covers only m = 1), one iteration each.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
9931
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 1.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(4);
      tester.m(1).n(n).k(k);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9948
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(4);
      tester.m(1).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
9966
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=1, with a_stride(23).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
9984
// Tests n in 9..15, k in 1, 6, 11, 16 and m in 1..1 with iterations(1) on the
// 1x8s4 NEON microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10004
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1 on the 1x8s4 NEON
// microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10021
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1, with cn_stride(11).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10039
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1, with a_stride(23).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10057
// Tests n in 16, 24, k in 1, 6, 11, 16 and m in 1..1 with iterations(1) on the
// 1x8s4 NEON microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10077
// Tests k in 1, 6, 11, 16, n in 1..8 and m in 1..1 with cm_stride(11) and
// iterations(1) on the 1x8s4 NEON microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEON, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10098
// Single m=1, n=8, k=4 run of the 1x8s4 NEON microkernel with qmin(128).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
            xnn_init_f32_minmax_scalar_params);
}
10112
// Single m=1, n=8, k=4 run of the 1x8s4 NEON microkernel with qmax(128).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
            xnn_init_f32_minmax_scalar_params);
}
10126
// Single m=1, n=8, k=4 run of the 1x8s4 NEON microkernel with cm_stride(11).
TEST(F32_GEMM_MINMAX_1X8S4__NEON, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neon,
            xnn_init_f32_minmax_scalar_params);
}
10140 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10141
10142
10143 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10156
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel with
// cn_stride(11).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10170
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel with a_stride(7).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10184
// At k=4, tests every n in 1..8 and m in 1..1 with iterations(1) on the
// 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(m_dim).n(n_dim).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10202
// At n=8, k=4, tests every m in 1..1 with iterations(1) on the 1x8s4 NEON-FMA
// microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(m_dim).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10218
// At m=1, k=4, tests every n in 1..8 with iterations(1) on the 1x8s4 NEON-FMA
// microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(n_dim).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10234
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 1..3.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10249
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 1..3, with
// a_stride(7).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10265
// For each k in 1..3, tests every n in 1..8 and m in 1..1 with iterations(1)
// on the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10285
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 5..7.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10300
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 5..7, with
// a_stride(11).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10316
// For each k in 5..7, tests every n in 1..8 and m in 1..1 with iterations(1)
// on the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10336
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10351
// Runs the 1x8s4 NEON-FMA microkernel at m=1, n=8 for each k in 8, 12, ..., 40,
// with a_stride(43).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(4)
        .m(1).n(8).k(k_dim)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
10367
// For each k in 8, 12, ..., 40, tests every n in 1..8 and m in 1..1 with
// iterations(1) on the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10387
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=1 on the 1x8s4 NEON-FMA
// microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10404
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=1, with cn_stride(11), on
// the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10422
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=1, with a_stride(23), on
// the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10440
// Tests n in 9..15, k in 1, 6, 11, 16 and m in 1..1 with iterations(1) on the
// 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10460
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1 on the 1x8s4 NEON-FMA
// microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10477
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1, with cn_stride(11), on
// the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10495
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=1, with a_stride(23), on
// the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10513
// Tests n in 16, 24, k in 1, 6, 11, 16 and m in 1..1 with iterations(1) on the
// 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10533
// Tests k in 1, 6, 11, 16, n in 1..8 and m in 1..1 with cm_stride(11) and
// iterations(1) on the 1x8s4 NEON-FMA microkernel.
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10554
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel with qmin(128).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10568
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel with qmax(128).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10582
// Single m=1, n=8, k=4 run of the 1x8s4 NEON-FMA microkernel with
// cm_stride(11).
TEST(F32_GEMM_MINMAX_1X8S4__NEONFMA, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(4)
      .m(1).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
10596 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10597
10598
10599 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
10612
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel with
// cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
10626
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel with
// a_stride(7).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
10640
// At k=4, tests every n in 1..8 and m in 1..4 with iterations(1) on the 4x8
// NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10658
// At n=8, k=4, tests every m in 1..4 with iterations(1) on the 4x8 NEON
// dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_dim).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10674
// At m=4, k=4, tests every n in 1..8 with iterations(1) on the 4x8 NEON
// dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_dim).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10690
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in 1..3.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10705
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in 1..3,
// with a_stride(7).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10721
// For each k in 1..3, tests every n in 1..8 and m in 1..4 with iterations(1)
// on the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10741
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in 5..7.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10756
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in 5..7,
// with a_stride(11).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10772
// For each k in 5..7, tests every n in 1..8 and m in 1..4 with iterations(1)
// on the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10792
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in
// 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10807
// Runs the 4x8 NEON dup-ld128 microkernel at m=4, n=8 for each k in
// 8, 12, ..., 40, with a_stride(43).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
10823
// For each k in 8, 12, ..., 40, tests every n in 1..8 and m in 1..4 with
// iterations(1) on the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10843
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=4 on the 4x8 NEON dup-ld128
// microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10860
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=4, with cn_stride(11), on
// the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10878
// Tests n in 9..15 against k in 1, 6, 11, 16 at m=4, with a_stride(23), on
// the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10896
// Tests n in 9..15, k in 1, 6, 11, 16 and m in 1..4 with iterations(1) on the
// 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10916
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=4 on the 4x8 NEON
// dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10933
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=4, with cn_stride(11), on
// the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10951
// Tests n in 16, 24 against k in 1, 6, 11, 16 at m=4, with a_stride(23), on
// the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
10969
// Tests n in 16, 24, k in 1, 6, 11, 16 and m in 1..4 with iterations(1) on the
// 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
10989
// Tests k in 1, 6, 11, 16, n in 1..8 and m in 1..4 with cm_stride(11) and
// iterations(1) on the 4x8 NEON dup-ld128 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11010
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel with
// qmin(128).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
11024
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel with
// qmax(128).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
11038
// Single m=4, n=8, k=4 run of the 4x8 NEON dup-ld128 microkernel with
// cm_stride(11).
TEST(F32_GEMM_MINMAX_4X8__NEON_DUP_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_dup_ld128,
            xnn_init_f32_minmax_scalar_params);
}
11052 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11053
11054
11055 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single m=4, n=8, k=2 run of the 4x8 NEON lane-ld64 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
            xnn_init_f32_minmax_scalar_params);
}
11068
// Single m=4, n=8, k=2 run of the 4x8 NEON lane-ld64 microkernel with
// cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
            xnn_init_f32_minmax_scalar_params);
}
11082
// Single m=4, n=8, k=2 run of the 4x8 NEON lane-ld64 microkernel with
// a_stride(5).
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
            xnn_init_f32_minmax_scalar_params);
}
11096
// At k=2, tests every n in 1..8 and m in 1..4 with iterations(1) on the 4x8
// NEON lane-ld64 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(2)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
11114
// At n=8, k=2, tests every m in 1..4 with iterations(1) on the 4x8 NEON
// lane-ld64 microkernel.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_dim).n(8).k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64,
              xnn_init_f32_minmax_scalar_params);
  }
}
11130
// Varies n over [1, 8] while m stays at 4 and k at 2; one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(n_dim).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11146
// Covers k below 2 on the m=4, n=8 problem; the loop bounds leave k=1 as the only value visited.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11161
// k below 2 (only k=1 is visited) on the m=4, n=8 problem with a_stride set to 5.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(5);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11177
// k below 2 (only k=1 is visited) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11197
// Covers k in [3, 4) on the m=4, n=8 problem, which means k=3 is the only value visited.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11212
// k in [3, 4) (only k=3 is visited) on the m=4, n=8 problem with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11228
// k in [3, 4) (only k=3 is visited) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11248
// Covers k = 4, 6, ..., 20 (step 2) on the m=4, n=8 problem.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11263
// k = 4, 6, ..., 20 (step 2) on the m=4, n=8 problem with a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(23);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
11279
// k = 4, 6, ..., 20 (step 2) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11299
// n from 9 through 15 (above nr=8) with k in {1, 4, 7, 10} and m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11316
// n from 9 through 15 with k in {1, 4, 7, 10}, m fixed at 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11334
// n from 9 through 15 with k in {1, 4, 7, 10}, m fixed at 4 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11352
// n from 9 through 15 with k in {1, 4, 7, 10} and every m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11372
// n in {16, 24} (multiples of nr=8) with k in {1, 4, 7, 10} and m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11389
// n in {16, 24} with k in {1, 4, 7, 10}, m fixed at 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11407
// n in {16, 24} with k in {1, 4, 7, 10}, m fixed at 4 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11425
// n in {16, 24} with k in {1, 4, 7, 10} and every m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11445
// cm_stride=11 across k in {1, 4, 7, 10}, n in [1, 8] and m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11466
// m=4, n=8, k=2 with the tester's qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
11480
// m=4, n=8, k=2 with the tester's qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
11494
// m=4, n=8, k=2 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
11508 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11509
11510
11511 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the ld128 NEON lane microkernel once with m=4, n=8 (equal to mr and nr) and k=4.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11524
// Same m=4, n=8, k=4 problem as k_eq_4, additionally setting cn_stride to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11538
// m=4, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11552
// Sweeps every (m, n) pair with m in [1, 4] and n in [1, 8] at k=4;
// each configuration is limited to a single iteration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(m_dim).n(n_dim).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11570
// Varies m over [1, 4] while n stays at 8 and k at 4; one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(m_dim).n(8).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11586
// Varies n over [1, 8] while m stays at 4 and k at 4; one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(n_dim).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11602
// Covers k = 1, 2, 3 (every k below 4) on the m=4, n=8 problem.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11617
// k = 1, 2, 3 on the m=4, n=8 problem with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11633
// k = 1, 2, 3 combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11653
// Covers k = 5, 6, 7 on the m=4, n=8 problem.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11668
// k = 5, 6, 7 on the m=4, n=8 problem with a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11684
// k = 5, 6, 7 combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11704
// Covers k = 8, 12, ..., 40 (step 4) on the m=4, n=8 problem.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11719
// k = 8, 12, ..., 40 (step 4) on the m=4, n=8 problem with a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
11735
// k = 8, 12, ..., 40 (step 4) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11755
// n from 9 through 15 (above nr=8) with k in {1, 6, 11, 16} and m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11772
// n from 9 through 15 with k in {1, 6, 11, 16}, m fixed at 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11790
// n from 9 through 15 with k in {1, 6, 11, 16}, m fixed at 4 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11808
// n from 9 through 15 with k in {1, 6, 11, 16} and every m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11828
// n in {16, 24} (multiples of nr=8) with k in {1, 6, 11, 16} and m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11845
// n in {16, 24} with k in {1, 6, 11, 16}, m fixed at 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11863
// n in {16, 24} with k in {1, 6, 11, 16}, m fixed at 4 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
11881
// n in {16, 24} with k in {1, 6, 11, 16} and every m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11901
// cm_stride=11 across k in {1, 6, 11, 16}, n in [1, 8] and m in [1, 4];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
11922
// m=4, n=8, k=4 with the tester's qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11936
// m=4, n=8, k=4 with the tester's qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11950
// m=4, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEON_LANE_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(4);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
11964 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11965
11966
11967 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the neonfma dup ld64 microkernel once with m=4, n=8 (equal to mr and nr) and k=2.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
11980
// Same m=4, n=8, k=2 problem as k_eq_2, additionally setting cn_stride to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
11994
// m=4, n=8, k=2 with a_stride set to 5.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(2);
  tester.a_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
12008
// Sweeps every (m, n) pair with m in [1, 4] and n in [1, 8] at k=2;
// each configuration is limited to a single iteration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(m_dim).n(n_dim).k(2);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12026
// Varies m over [1, 4] while n stays at 8 and k at 2; one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(m_dim).n(8).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12042
// Varies n over [1, 8] while m stays at 4 and k at 2; one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(n_dim).k(2);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12058
// Covers k below 2 on the m=4, n=8 problem; the loop bounds leave k=1 as the only value visited.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12073
// k below 2 (only k=1 is visited) on the m=4, n=8 problem with a_stride set to 5.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(5);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12089
// k below 2 (only k=1 is visited) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 2; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12109
// Covers k in [3, 4) on the m=4, n=8 problem, which means k=3 is the only value visited.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12124
// k in [3, 4) (only k=3 is visited) on the m=4, n=8 problem with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12140
// k in [3, 4) (only k=3 is visited) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 3; k_dim < 4; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12160
// Covers k = 4, 6, ..., 20 (step 2) on the m=4, n=8 problem.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12175
// k = 4, 6, ..., 20 (step 2) on the m=4, n=8 problem with a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k_dim);
    tester.a_stride(23);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
12191
// k = 4, 6, ..., 20 (step 2) combined with every m in [1, 4] and n in [1, 8];
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 4; k_dim <= 20; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m_dim).n(n_dim).k(k_dim);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12211
// n from 9 through 15 (above nr=8) with k in {1, 4, 7, 10} and m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12228
// n from 9 through 15 with k in {1, 4, 7, 10}, m fixed at 4 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12246
// n from 9 through 15 with k in {1, 4, 7, 10}, m fixed at 4 and a_stride set to 13.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 10; k_dim += 3) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n_dim).k(k_dim);
      tester.a_stride(13);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12264
// Sweeps n over 9..15, k over {1, 4, 7, 10} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12284
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12301
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m fixed at 4 and
// cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12319
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m fixed at 4 and
// a_stride set to 13.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(13)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12337
// Sweeps n over {16, 24}, k over {1, 4, 7, 10} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12357
// Sweeps k over {1, 4, 7, 10}, n over 1..8 and m over 1..4 with cm_stride
// set to 11; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size <= 10; k_size += 3) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12378
// Single configuration: m=4, n=8, k=2 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
12392
// Single configuration: m=4, n=8, k=2 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
12406
// Single configuration: m=4, n=8, k=2 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_DUP_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
12420 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12421
12422
12423 #if XNN_ARCH_ARM64
// Single configuration: m=4, n=8, k=4.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12436
// Single configuration: m=4, n=8, k=4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12450
// Single configuration: m=4, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12464
// Sweeps every (m, n) pair with n in 1..8 and m in 1..4 at k=4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12482
// Sweeps m over 1..4 with n=8 and k=4; iterations is set to 1.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12498
// Sweeps n over 1..8 with m=4 and k=4; iterations is set to 1.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12514
// Sweeps k over 1, 2, 3 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12529
// Sweeps k over 1, 2, 3 with m=4, n=8 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12545
// Sweeps k over 1, 2, 3 and, for each k, every (m, n) pair with m in 1..4
// and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12565
// Sweeps k over 5, 6, 7 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12580
// Sweeps k over 5, 6, 7 with m=4, n=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12596
// Sweeps k over 5, 6, 7 and, for each k, every (m, n) pair with m in 1..4
// and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12616
// Sweeps k over 8, 12, ..., 40 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12631
// Sweeps k over 8, 12, ..., 40 with m=4, n=8 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
12647
// Sweeps k over 8, 12, ..., 40 and, for each k, every (m, n) pair with
// m in 1..4 and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12667
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12684
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4 and
// cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12702
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4 and
// a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12720
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12740
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12757
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4 and
// cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12775
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4 and
// a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12793
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12813
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..4 with cm_stride
// set to 11; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
12834
// Single configuration: m=4, n=8, k=4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12848
// Single configuration: m=4, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12862
// Single configuration: m=4, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__NEONFMA_LANE_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__neonfma_lane_ld128, xnn_init_f32_minmax_scalar_params);
}
12876 #endif // XNN_ARCH_ARM64
12877
12878
12879 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: m=4, n=8, k=4.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
12892
// Single configuration: m=4, n=8, k=4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
12906
// Single configuration: m=4, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
12920
// Sweeps every (m, n) pair with n in 1..8 and m in 1..4 at k=4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
12938
// Sweeps m over 1..4 with n=8 and k=4; iterations is set to 1.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
12954
// Sweeps n over 1..8 with m=4 and k=4; iterations is set to 1.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
12970
// Sweeps k over 1, 2, 3 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
12985
// Sweeps k over 1, 2, 3 with m=4, n=8 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
13001
// Sweeps k over 1, 2, 3 and, for each k, every (m, n) pair with m in 1..4
// and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13021
// Sweeps k over 5, 6, 7 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
13036
// Sweeps k over 5, 6, 7 with m=4, n=8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
13052
// Sweeps k over 5, 6, 7 and, for each k, every (m, n) pair with m in 1..4
// and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13072
// Sweeps k over 8, 12, ..., 40 with m=4 and n=8.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
13087
// Sweeps k over 8, 12, ..., 40 with m=4, n=8 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
  }
}
13103
// Sweeps k over 8, 12, ..., 40 and, for each k, every (m, n) pair with
// m in 1..4 and n in 1..8; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13123
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13140
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4 and
// cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13158
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 4 and
// a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13176
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13196
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13213
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4 and
// cn_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13231
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 4 and
// a_stride set to 23.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13249
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..4;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13269
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..4 with cm_stride
// set to 11; iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13290
// Single configuration: m=4, n=8, k=4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
13304
// Single configuration: m=4, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
13318
// Single configuration: m=4, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8S4__NEON, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__neon, xnn_init_f32_minmax_scalar_params);
}
13332 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13333
13334
13335 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: m=5, n=8, k=2.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(2)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13348
// Single configuration: m=5, n=8, k=2 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(2)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13362
// Single configuration: m=5, n=8, k=2 with a_stride set to 5.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(2)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13376
// Sweeps every (m, n) pair with n in 1..8 and m in 1..5 at k=2;
// iterations is set to 1 for each configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(2)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13394
// Sweeps m over 1..5 with n=8 and k=2; iterations is set to 1.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 5; ++m_size) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13410
// Sweeps n in [1, 8] with m fixed at 5 and k = 2; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(n)
      .k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13426
// Full tile (m = 5, n = 8) for k below 2; the loop bounds leave k = 1 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13441
// Full tile (m = 5, n = 8) for k below 2 (only k = 1) with a_stride set to 5.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13457
// Sweeps every m in [1, 5] and n in [1, 8] for k below 2 (only k = 1);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13477
// Full tile (m = 5, n = 8) for k in [3, 4); the loop bounds leave k = 3 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13492
// Full tile (m = 5, n = 8) for k in [3, 4) (only k = 3) with a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13508
// Sweeps every m in [1, 5] and n in [1, 8] for k in [3, 4) (only k = 3);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13528
// Full tile (m = 5, n = 8) for even k from 4 through 20 (step 2).
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13543
// Full tile (m = 5, n = 8) for even k from 4 through 20 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(5)
      .n(8)
      .k(k)
      .a_stride(23)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13559
// Sweeps every m in [1, 5] and n in [1, 8] for even k from 4 through 20;
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13579
// m = 5 with n in [9, 16) and k in {1, 4, 7, 10}.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13596
// m = 5 with n in [9, 16), k in {1, 4, 7, 10}, and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13614
// m = 5 with n in [9, 16), k in {1, 4, 7, 10}, and a_stride set to 13.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13632
// Sweeps m in [1, 5] for n in [9, 16) and k in {1, 4, 7, 10};
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13652
// m = 5 with n in {16, 24} and k in {1, 4, 7, 10}.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13669
// m = 5 with n in {16, 24}, k in {1, 4, 7, 10}, and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13687
// m = 5 with n in {16, 24}, k in {1, 4, 7, 10}, and a_stride set to 13.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13705
// Sweeps m in [1, 5] for n in {16, 24} and k in {1, 4, 7, 10};
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13725
// Sweeps m in [1, 5], n in [1, 8], and k in {1, 4, 7, 10} with cm_stride
// set to 11; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13746
// Full tile (m = 5, n = 8) with k = 2 and qmin set to 128.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(5)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(5)
    .n(8)
    .k(2)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13760
// Full tile (m = 5, n = 8) with k = 2 and qmax set to 128.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(5)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(5)
    .n(8)
    .k(2)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13774
// Full tile (m = 5, n = 8) with k = 2 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__NEON_LANE_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(5)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(5)
    .n(8)
    .k(2)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13788 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13789
13790
13791 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 6x2 neon_lane_ld64 kernel on a full tile (m = 6, n = 2) with k = 2.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13804
// Full tile (m = 6, n = 2) with k = 2 and cn_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .cn_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13818
// Full tile (m = 6, n = 2) with k = 2 and a_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .a_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
13832
// Sweeps every m in [1, 6] and n in [1, 2] at k = 2; each configuration
// is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 2; n++) {
    for (uint32_t m = 1; m <= 6; m++) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
13850
// Sweeps m in [1, 6] with n fixed at 2 and k = 2; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 6; m++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(m)
      .n(2)
      .k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13866
// Sweeps n in [1, 2] with m fixed at 6 and k = 2; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 2; n++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(n)
      .k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13882
// Full tile (m = 6, n = 2) for k below 2; the loop bounds leave k = 1 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13897
// Full tile (m = 6, n = 2) for k below 2 (only k = 1) with a_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13913
// Sweeps every m in [1, 6] and n in [1, 2] for k below 2 (only k = 1);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13933
// Full tile (m = 6, n = 2) for k in [3, 4); the loop bounds leave k = 3 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13948
// Full tile (m = 6, n = 2) for k in [3, 4) (only k = 3) with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13964
// Sweeps every m in [1, 6] and n in [1, 2] for k in [3, 4) (only k = 3);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
13984
// Full tile (m = 6, n = 2) for even k from 4 through 20 (step 2).
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
13999
// Full tile (m = 6, n = 2) for even k from 4 through 20 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(2)
      .kr(1)
      .sr(1)
      .m(6)
      .n(2)
      .k(k)
      .a_stride(23)
      .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14015
// Sweeps every m in [1, 6] and n in [1, 2] for even k from 4 through 20;
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14035
// m = 6 with n in [3, 4) (only n = 3) and k in {1, 4, 7, 10}.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14052
// m = 6 with n in [3, 4) (only n = 3), k in {1, 4, 7, 10}, and cn_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14070
// m = 6 with n in [3, 4) (only n = 3), k in {1, 4, 7, 10}, and a_stride set to 13.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14088
// Sweeps m in [1, 6] for n in [3, 4) (only n = 3) and k in {1, 4, 7, 10};
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14108
// m = 6 with n in {4, 6} and k in {1, 4, 7, 10}.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14125
// m = 6 with n in {4, 6}, k in {1, 4, 7, 10}, and cn_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14143
// m = 6 with n in {4, 6}, k in {1, 4, 7, 10}, and a_stride set to 13.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(2)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14161
// Sweeps m in [1, 6] for n in {4, 6} and k in {1, 4, 7, 10};
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, n_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14181
// Sweeps m in [1, 6], n in [1, 2], and k in {1, 4, 7, 10} with cm_stride
// set to 5; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(2)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(5)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14202
// Full tile (m = 6, n = 2) with k = 2 and qmin set to 128.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
14216
// Full tile (m = 6, n = 2) with k = 2 and qmax set to 128.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
14230
// Full tile (m = 6, n = 2) with k = 2 and cm_stride set to 5.
TEST(F32_GEMM_MINMAX_6X2__NEON_LANE_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(2)
    .kr(1)
    .sr(1)
    .m(6)
    .n(2)
    .k(2)
    .cm_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_6x2__neon_lane_ld64, xnn_init_f32_minmax_scalar_params);
}
14244 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14245
14246
14247 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 6x8 neon_dup_ld64 kernel on a full tile (m = 6, n = 8) with k = 2.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(6)
    .n(8)
    .k(2)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
14260
// Full tile (m = 6, n = 8) with k = 2 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(6)
    .n(8)
    .k(2)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
14274
// Full tile (m = 6, n = 8) with k = 2 and a_stride set to 5.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(6)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(6)
    .n(8)
    .k(2)
    .a_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
}
14288
// Sweeps every m in [1, 6] and n in [1, 8] at k = 2; each configuration
// is run with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    for (uint32_t m = 1; m <= 6; m++) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(2)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14306
// Sweeps m in [1, 6] with n fixed at 8 and k = 2; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 6; m++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(m)
      .n(8)
      .k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14322
// Sweeps n in [1, 8] with m fixed at 6 and k = 2; iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(n)
      .k(2)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14338
// Full tile (m = 6, n = 8) for k below 2; the loop bounds leave k = 1 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14353
// Full tile (m = 6, n = 8) for k below 2 (only k = 1) with a_stride set to 5.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .a_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14369
// Sweeps every m in [1, 6] and n in [1, 8] for k below 2 (only k = 1);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 2; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14389
// Full tile (m = 6, n = 8) for k in [3, 4); the loop bounds leave k = 3 as
// the only value exercised.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14404
// Full tile (m = 6, n = 8) for k in [3, 4) (only k = 3) with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14420
// Sweeps every m in [1, 6] and n in [1, 8] for k in [3, 4) (only k = 3);
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14440
// Full tile (m = 6, n = 8) for even k from 4 through 20 (step 2).
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14455
// Full tile (m = 6, n = 8) for even k from 4 through 20 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .a_stride(23)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
  }
}
14471
// Sweeps every m in [1, 6] and n in [1, 8] for even k from 4 through 20;
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14491
// m = 6 with n in [9, 16) and k in {1, 4, 7, 10}.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14508
// m = 6 with n in [9, 16), k in {1, 4, 7, 10}, and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14526
// m = 6 with n in [9, 16), k in {1, 4, 7, 10}, and a_stride set to 13.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .a_stride(13)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
    }
  }
}
14544
// Sweeps m in [1, 6] for n in [9, 16) and k in {1, 4, 7, 10};
// iterations(1) per configuration.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14564
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m fixed at 6.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14581
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 6 and cn_stride = 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14599
// Sweeps n over {16, 24} and k over {1, 4, 7, 10} with m = 6 and a_stride = 13.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(13)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14617
// Sweeps n over {16, 24}, k over {1, 4, 7, 10} and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14637
// Sweeps k over {1, 4, 7, 10}, n over 1..8 and m over 1..6 with cm_stride = 11,
// using iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 10; k_size += 3) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14658
// Single configuration: m = 6, n = 8, k = 2 with qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(2)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
            xnn_init_f32_minmax_scalar_params);
}
14672
// Single configuration: m = 6, n = 8, k = 2 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(2)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
            xnn_init_f32_minmax_scalar_params);
}
14686
// Single configuration: m = 6, n = 8, k = 2 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_DUP_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(2)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_dup_ld64,
            xnn_init_f32_minmax_scalar_params);
}
14700 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14701
14702
14703 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: m = 6, n = 8, k = 4.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
14716
// Single configuration: m = 6, n = 8, k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
14730
// Single configuration: m = 6, n = 8, k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
14744
// Sweeps n over 1..8 and m over 1..6 at k = 4, using iterations(1) for each
// combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14762
// Sweeps m over 1..6 at n = 8, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14778
// Sweeps n over 1..8 at m = 6, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14794
// Sweeps k over 1..3 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14809
// Sweeps k over 1..3 with m = 6, n = 8 and a_stride = 7.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14825
// Sweeps k over 1..3, n over 1..8 and m over 1..6, using iterations(1) for
// each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14845
// Sweeps k over 5..7 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14860
// Sweeps k over 5..7 with m = 6, n = 8 and a_stride = 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14876
// Sweeps k over 5..7, n over 1..8 and m over 1..6, using iterations(1) for
// each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14896
// Sweeps k over 8, 12, ..., 40 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14911
// Sweeps k over 8, 12, ..., 40 with m = 6, n = 8 and a_stride = 43.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
              xnn_init_f32_minmax_scalar_params);
  }
}
14927
// Sweeps k over 8, 12, ..., 40, n over 1..8 and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
14947
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 6.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14964
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 6 and cn_stride = 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
14982
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 6 and a_stride = 23.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15000
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15020
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 6.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15037
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m = 6 and cn_stride = 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15055
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m = 6 and a_stride = 23.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15073
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15093
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..6 with
// cm_stride = 11, using iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15114
// Single configuration: m = 6, n = 8, k = 4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
15128
// Single configuration: m = 6, n = 8, k = 4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
15142
// Single configuration: m = 6, n = 8, k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__NEON_LANE_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128,
            xnn_init_f32_minmax_scalar_params);
}
15156 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15157
15158
15159 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: m = 6, n = 8, k = 4.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15172
// Single configuration: m = 6, n = 8, k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15186
// Single configuration: m = 6, n = 8, k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15200
// Sweeps n over 1..8 and m over 1..6 at k = 4, using iterations(1) for each
// combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15218
// Sweeps m over 1..6 at n = 8, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15234
// Sweeps n over 1..8 at m = 6, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15250
// Sweeps k over 1..3 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15265
// Sweeps k over 1..3 with m = 6, n = 8 and a_stride = 7.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15281
// Sweeps k over 1..3, n over 1..8 and m over 1..6, using iterations(1) for
// each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15301
// Sweeps k over 5..7 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15316
// Sweeps k over 5..7 with m = 6, n = 8 and a_stride = 11.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15332
// Sweeps k over 5..7, n over 1..8 and m over 1..6, using iterations(1) for
// each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15352
// Sweeps k over 8, 12, ..., 40 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15367
// Sweeps k over 8, 12, ..., 40 with m = 6, n = 8 and a_stride = 43.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15383
// Sweeps k over 8, 12, ..., 40, n over 1..8 and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15403
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m fixed at 6.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15420
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 6 and cn_stride = 11.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15438
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m = 6 and a_stride = 23.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15456
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15476
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m fixed at 6.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15493
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m = 6 and cn_stride = 11.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15511
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m = 6 and a_stride = 23.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15529
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..6, using
// iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15549
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..6 with
// cm_stride = 11, using iterations(1) for each combination.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
                  xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15570
// Single configuration: m = 6, n = 8, k = 4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15584
// Single configuration: m = 6, n = 8, k = 4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15598
// Single configuration: m = 6, n = 8, k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8S4__NEONFMA, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15612 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15613
15614
15615 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: m = 8, n = 8, k = 4.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(4)
      .m(8).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15628
// Single configuration: m = 8, n = 8, k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(4)
      .m(8).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15642
// Single configuration: m = 8, n = 8, k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(4)
      .m(8).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
            xnn_init_f32_minmax_scalar_params);
}
15656
// Sweeps n over 1..8 and m over 1..8 at k = 4, using iterations(1) for each
// combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 8; ++m_size) {
      GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
                xnn_init_f32_minmax_scalar_params);
    }
  }
}
15674
// Sweeps m over 1..8 at n = 8, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_size = 1; m_size <= 8; ++m_size) {
    GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(4)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15690
// Sweeps n over 1..8 at m = 8, k = 4, using iterations(1) for each value.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(4)
        .m(8).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma,
              xnn_init_f32_minmax_scalar_params);
  }
}
15706
// 8x8s4 NEON-FMA kernel: m=8, n=8, with k swept over [1, 3].
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15721
// 8x8s4 NEON-FMA kernel: m=8, n=8, k swept over [1, 3], a_stride set to 7.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15737
// 8x8s4 NEON-FMA kernel: k in [1, 3], n in [1, 8], m in [1, 8];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 3; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15757
// 8x8s4 NEON-FMA kernel: m=8, n=8, with k swept over [5, 7].
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15772
// 8x8s4 NEON-FMA kernel: m=8, n=8, k swept over [5, 7], a_stride set to 11.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15788
// 8x8s4 NEON-FMA kernel: k in [5, 7], n in [1, 8], m in [1, 8];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 5; kc <= 7; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15808
// 8x8s4 NEON-FMA kernel: m=8, n=8, with k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15823
// 8x8s4 NEON-FMA kernel: m=8, n=8, k = 8, 12, ..., 40, a_stride set to 43.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(8).kr(1).sr(4);
    tester.m(8).n(8).k(kc);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
  }
}
15839
// 8x8s4 NEON-FMA kernel: k = 8, 12, ..., 40, n in [1, 8], m in [1, 8];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15859
// 8x8s4 NEON-FMA kernel: m=8, n in [9, 15], k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15876
// 8x8s4 NEON-FMA kernel: m=8, n in [9, 15], k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15894
// 8x8s4 NEON-FMA kernel: m=8, n in [9, 15], k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15912
// 8x8s4 NEON-FMA kernel: n in [9, 15], k in {1, 6, 11, 16}, m in [1, 8];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
15932
// 8x8s4 NEON-FMA kernel: m=8, n in {16, 24}, k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15949
// 8x8s4 NEON-FMA kernel: m=8, n in {16, 24}, k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15967
// 8x8s4 NEON-FMA kernel: m=8, n in {16, 24}, k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(8).kr(1).sr(4);
      tester.m(8).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
    }
  }
}
15985
// 8x8s4 NEON-FMA kernel: n in {16, 24}, k in {1, 6, 11, 16}, m in [1, 8];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
16005
// 8x8s4 NEON-FMA kernel: k in {1, 6, 11, 16}, n in [1, 8], m in [1, 8],
// with cm_stride set to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 8; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(8).kr(1).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
16026
// 8x8s4 NEON-FMA kernel: m=8, n=8, k=4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(8).kr(1).sr(4);
  tester.m(8).n(8).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
}
16040
// 8x8s4 NEON-FMA kernel: m=8, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(8).kr(1).sr(4);
  tester.m(8).n(8).k(4);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
}
16054
// 8x8s4 NEON-FMA kernel: m=8, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_8X8S4__NEONFMA, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(8).kr(1).sr(4);
  tester.m(8).n(8).k(4);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma, xnn_init_f32_minmax_scalar_params);
}
16068 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16069
16070
16071 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 1x8 SSE2-dup kernel: m=1, n=8, k=4.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_eq_4) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16084
// 1x8 SSE2-dup kernel: m=1, n=8, k=4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16098
// 1x8 SSE2-dup kernel: m=1, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_eq_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16112
// 1x8 SSE2-dup kernel: k=4 for every (n, m) with n in [1, 8] and m in [1, 1];
// iterations is set to 1 for each shape.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_eq_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(mc).n(nc).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16130
// 1x8 SSE2-dup kernel: k=4, n=8, with m swept over [1, 1]; iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_eq_4_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(mc).n(8).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16146
// 1x8 SSE2-dup kernel: k=4, m=1, with n swept over [1, 8]; iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_eq_4_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(nc).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16162
// 1x8 SSE2-dup kernel: m=1, n=8, with k swept over [1, 3].
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_lt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16177
// 1x8 SSE2-dup kernel: m=1, n=8, k swept over [1, 3], a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_lt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16193
// 1x8 SSE2-dup kernel: k in [1, 3], n in [1, 8], m in [1, 1];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_lt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 3; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16213
// 1x8 SSE2-dup kernel: m=1, n=8, with k swept over [5, 7].
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16228
// 1x8 SSE2-dup kernel: m=1, n=8, k swept over [5, 7], a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16244
// 1x8 SSE2-dup kernel: k in [5, 7], n in [1, 8], m in [1, 1];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 5; kc <= 7; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16264
// 1x8 SSE2-dup kernel: m=1, n=8, with k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16279
// 1x8 SSE2-dup kernel: m=1, n=8, k = 8, 12, ..., 40, a_stride set to 43.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(kc);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
  }
}
16295
// 1x8 SSE2-dup kernel: k = 8, 12, ..., 40, n in [1, 8], m in [1, 1];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, k_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16315
// 1x8 SSE2-dup kernel: m=1, n in [9, 15], k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16332
// 1x8 SSE2-dup kernel: m=1, n in [9, 15], k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16350
// 1x8 SSE2-dup kernel: m=1, n in [9, 15], k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16368
// 1x8 SSE2-dup kernel: n in [9, 15], k in {1, 6, 11, 16}, m in [1, 1];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16388
// 1x8 SSE2-dup kernel: m=1, n in {16, 24}, k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16405
// 1x8 SSE2-dup kernel: m=1, n in {16, 24}, k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16423
// 1x8 SSE2-dup kernel: m=1, n in {16, 24}, k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16441
// 1x8 SSE2-dup kernel: n in {16, 24}, k in {1, 6, 11, 16}, m in [1, 1];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16461
// 1x8 SSE2-dup kernel: k in {1, 6, 11, 16}, n in [1, 8], m in [1, 1],
// with cm_stride set to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16482
// 1x8 SSE2-dup kernel: m=1, n=8, k=4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16496
// 1x8 SSE2-dup kernel: m=1, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16510
// 1x8 SSE2-dup kernel: m=1, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__SSE2_DUP, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(4);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__sse2_dup, xnn_init_f32_minmax_sse_params);
}
16524 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16525
16526
16527 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 3x8 SSE-dup kernel: m=3, n=8, k=4.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_eq_4) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(8).kr(1).sr(1);
  tester.m(3).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
}
16540
// 3x8 SSE-dup kernel: m=3, n=8, k=4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(8).kr(1).sr(1);
  tester.m(3).n(8).k(4);
  tester.cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
}
16554
// 3x8 SSE-dup kernel: m=3, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_eq_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(8).kr(1).sr(1);
  tester.m(3).n(8).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
}
16568
// 3x8 SSE-dup kernel: k=4 for every (n, m) with n in [1, 8] and m in [1, 3];
// iterations is set to 1 for each shape.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_eq_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 3; ++mc) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(mc).n(nc).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16586
// 3x8 SSE-dup kernel: k=4, n=8, with m swept over [1, 3]; iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_eq_4_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t mc = 1; mc <= 3; ++mc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(mc).n(8).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16602
// 3x8 SSE-dup kernel: k=4, m=3, with n swept over [1, 8]; iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_eq_4_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(nc).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16618
// 3x8 SSE-dup kernel: m=3, n=8, with k swept over [1, 3].
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_lt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16633
// 3x8 SSE-dup kernel: m=3, n=8, k swept over [1, 3], a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_lt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 1; kc <= 3; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16649
// 3x8 SSE-dup kernel: k in [1, 3], n in [1, 8], m in [1, 3];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_lt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 1; kc <= 3; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16669
// 3x8 SSE-dup kernel: m=3, n=8, with k swept over [5, 7].
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_gt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16684
// 3x8 SSE-dup kernel: m=3, n=8, k swept over [5, 7], a_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 5; kc <= 7; ++kc) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16700
// 3x8 SSE-dup kernel: k in [5, 7], n in [1, 8], m in [1, 3];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 5; kc <= 7; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16720
// 3x8 SSE-dup kernel: m=3, n=8, with k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_div_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16735
// 3x8 SSE-dup kernel: m=3, n=8, k = 8, 12, ..., 40, a_stride set to 43.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(8).kr(1).sr(1);
    tester.m(3).n(8).k(kc);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
  }
}
16751
// 3x8 SSE-dup kernel: k = 8, 12, ..., 40, n in [1, 8], m in [1, 3];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, k_div_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t kc = 8; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16771
// 3x8 SSE-dup kernel: m=3, n in [9, 15], k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16788
// 3x8 SSE-dup kernel: m=3, n in [9, 15], k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16806
// 3x8 SSE-dup kernel: m=3, n in [9, 15], k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16824
// 3x8 SSE-dup kernel: n in [9, 15], k in {1, 6, 11, 16}, m in [1, 3];
// iterations is set to 1 for each combination.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16844
// 3x8 SSE-dup kernel: m=3, n in {16, 24}, k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16861
// 3x8 SSE-dup kernel: m=3, n in {16, 24}, k in {1, 6, 11, 16}, cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16879
// 3x8 SSE-dup kernel: m=3, n in {16, 24}, k in {1, 6, 11, 16}, a_stride set to 23.
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(8).kr(1).sr(1);
      tester.m(3).n(nc).k(kc);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup, xnn_init_f32_minmax_sse_params);
    }
  }
}
16897
// Sweep: n in {16, 24}, k in {1, 6, 11, 16}, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16917
// Sweep: k in {1, 6, 11, 16}, n in 1..8, m in 1..3, cm_stride(11), iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
16938
// Single run: m=3, n=8, k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
16952
// Single run: m=3, n=8, k=4 with qmax(128).
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
16966
// Single run: m=3, n=8, k=4 with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_DUP, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
16980 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16981
16982
16983 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run: m=3, n=8, k=1.
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_eq_1) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
16996
// Single run: m=3, n=8, k=1 with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17010
// Single run: m=3, n=8, k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17024
// Sweep: n in 1..8, m in 1..3, k fixed at 1, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_eq_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17042
// Sweep: m in 1..3 with n=8, k=1 and iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(m_dim).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17058
// Sweep: n in 1..8 with m=3, k=1 and iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17074
// Sweep: k in 2..9 with m=3, n=8.
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_gt_1) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17089
// Sweep: k in 2..9 with m=3, n=8 and a_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(8).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17105
// Sweep: k in 2..9, n in 1..8, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, k_gt_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17125
// Sweep: n in 9..15, k in {1, 3, 5}, with m fixed at 3.
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17142
// Sweep: n in 9..15, k in {1, 3, 5}, m fixed at 3, cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17160
// Sweep: n in 9..15, k in {1, 3, 5}, m fixed at 3, a_stride(7).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17178
// Sweep: n in 9..15, k in {1, 3, 5}, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17198
// Sweep: n in {16, 24}, k in {1, 3, 5}, with m fixed at 3.
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17215
// Sweep: n in {16, 24}, k in {1, 3, 5}, m fixed at 3, cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17233
// Sweep: n in {16, 24}, k in {1, 3, 5}, m fixed at 3, a_stride(7).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17251
// Sweep: n in {16, 24}, k in {1, 3, 5}, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17271
// Sweep: k in {1, 3, 5}, n in 1..8, m in 1..3, cm_stride(11), iterations(1).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17292
// Single run: m=3, n=8, k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17306
// Single run: m=3, n=8, k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17320
// Single run: m=3, n=8, k=1 with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8__SSE_LOAD1, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17334 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17335
17336
17337 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run: m=3, n=8, k=4 (sr=4).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_eq_4) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17350
// Single run: m=3, n=8, k=4 with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17364
// Single run: m=3, n=8, k=4 with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_eq_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17378
// Sweep: n in 1..8, m in 1..3, k fixed at 4, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_eq_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_dim).n(n_dim).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17396
// Sweep: m in 1..3 with n=8, k=4 and iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_eq_4_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(m_dim).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17412
// Sweep: n in 1..8 with m=3, k=4 and iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_eq_4_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_dim).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17428
// Sweep: k in 1..3 with m=3, n=8.
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_lt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17443
// Sweep: k in 1..3 with m=3, n=8 and a_stride(7).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_lt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17459
// Sweep: k in 1..3, n in 1..8, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_lt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 3; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17479
// Sweep: k in 5..7 with m=3, n=8.
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_gt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17494
// Sweep: k in 5..7 with m=3, n=8 and a_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17510
// Sweep: k in 5..7, n in 1..8, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 5; k_dim <= 7; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17530
// Sweep: k in 8..40 in steps of 4 with m=3, n=8.
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_div_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17545
// Sweep: k in 8..40 in steps of 4 with m=3, n=8 and a_stride(43).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(8).k(k_dim)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
              xnn_init_f32_minmax_sse_params);
  }
}
17561
// Sweep: k in 8..40 in steps of 4, n in 1..8, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, k_div_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17581
// Sweep: n in 9..15, k in {1, 6, 11, 16}, with m fixed at 3.
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17598
// Sweep: n in 9..15, k in {1, 6, 11, 16}, m fixed at 3, cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17616
// Sweep: n in 9..15, k in {1, 6, 11, 16}, m fixed at 3, a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17634
// Sweep: n in 9..15, k in {1, 6, 11, 16}, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17654
// Sweep: n in {16, 24}, k in {1, 6, 11, 16}, with m fixed at 3.
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17671
// Sweep: n in {16, 24}, k in {1, 6, 11, 16}, m fixed at 3, cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17689
// Sweep: n in {16, 24}, k in {1, 6, 11, 16}, m fixed at 3, a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17707
// Sweep: n in {16, 24}, k in {1, 6, 11, 16}, m in 1..3, with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17727
// Sweep: k in {1, 6, 11, 16}, n in 1..8, m in 1..3, cm_stride(11), iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(8).kr(1).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17748
// Single run: m=3, n=8, k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17762
// Single run: m=3, n=8, k=4 with qmax(128).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17776
// Single run: m=3, n=8, k=4 with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__SSE, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__sse,
            xnn_init_f32_minmax_sse_params);
}
17790 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17791
17792
17793 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run: m=4, n=8, k=1.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_eq_1) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17806
// Single run: m=4, n=8, k=1 with cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17820
// Single run: m=4, n=8, k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
17834
// Sweep: n in 1..8, m in 1..4, k fixed at 1, with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_eq_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17852
// Sweep: m in 1..4 with n=8, k=1 and iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_dim).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17868
// Sweep: n in 1..8 with m=4, k=1 and iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17884
// Sweep: k in 2..9 with m=4, n=8.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_gt_1) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17899
// Sweep: k in 2..9 with m=4, n=8 and a_stride(11).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
17915
// Sweep: k in 2..9, n in 1..8, m in 1..4, with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, k_gt_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
17935
// Sweep: n in 9..15, k in {1, 3, 5}, with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17952
// Sweep: n in 9..15, k in {1, 3, 5}, m fixed at 4, cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17970
// Sweep: n in 9..15, k in {1, 3, 5}, m fixed at 4, a_stride(7).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
17988
// Sweep: n in 9..15, k in {1, 3, 5}, m in 1..4, with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 9; n_dim <= 15; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18008
// Sweep: n in {16, 24}, k in {1, 3, 5}, with m fixed at 4.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18025
// Sweep: n in {16, 24}, k in {1, 3, 5}, m fixed at 4, cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18043
// Sweep: n in {16, 24}, k in {1, 3, 5}, m fixed at 4, a_stride(7).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18061
// Sweep: n in {16, 24}, k in {1, 3, 5}, m in 1..4, with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18081
// Sweep: k in {1, 3, 5}, n in 1..8, m in 1..4, cm_stride(11), iterations(1).
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18102
// qmin: single m=4, n=8, k=1 run with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18116
// qmax: single m=4, n=8, k=1 run with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18130
// strided_cm: single m=4, n=8, k=1 run with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_4X8__SSE_LOAD1, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18144 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18145
18146
18147 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_4: single m=5, n=8, k=4 run with no extra strides or clamping set.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_eq_4) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18160
// strided_cn: single m=5, n=8, k=4 run with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18174
// k_eq_4_strided_a: single m=5, n=8, k=4 run with a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_eq_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18188
// k_eq_4_subtile: k=4 with every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_eq_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(m_val).n(n_val).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18206
// k_eq_4_subtile_m: n=8, k=4 with every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_eq_4_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(m_val).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18222
// k_eq_4_subtile_n: m=5, k=4 with every n in [1, 8],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_eq_4_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(n_val).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18238
// k_lt_4: m=5, n=8 with k in {1, 2, 3}.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_lt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18253
// k_lt_4_strided_a: m=5, n=8 with k in {1, 2, 3} and a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_lt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18269
// k_lt_4_subtile: k in {1, 2, 3}, every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_lt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18289
// k_gt_4: m=5, n=8 with k in {5, 6, 7}.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_gt_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18304
// k_gt_4_strided_a: m=5, n=8 with k in {5, 6, 7} and a_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18320
// k_gt_4_subtile: k in {5, 6, 7}, every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18340
// k_div_4: m=5, n=8 with k stepping by 4 from 8 through 40.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_div_4) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18355
// k_div_4_strided_a: m=5, n=8 with k stepping by 4 from 8 through 40
// and a_stride set to 43.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
18371
// k_div_4_subtile: k stepping by 4 from 8 through 40, every n in [1, 8] and
// every m in [1, 5], one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, k_div_4_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18391
// n_gt_8: every n in [9, 15] crossed with k in {1, 6, 11, 16} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18408
// n_gt_8_strided_cn: every n in [9, 15] crossed with k in {1, 6, 11, 16}
// at m=5, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18426
// n_gt_8_strided_a: every n in [9, 15] crossed with k in {1, 6, 11, 16}
// at m=5, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18444
// n_gt_8_subtile: every n in [9, 15], k in {1, 6, 11, 16} and every m in
// [1, 5], one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18464
// n_div_8: n in {16, 24} crossed with k in {1, 6, 11, 16} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18481
// n_div_8_strided_cn: n in {16, 24} crossed with k in {1, 6, 11, 16} at m=5,
// with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18499
// n_div_8_strided_a: n in {16, 24} crossed with k in {1, 6, 11, 16} at m=5,
// with a_stride set to 23.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18517
// n_div_8_subtile: n in {16, 24}, k in {1, 6, 11, 16} and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18537
// strided_cm_subtile: k in {1, 6, 11, 16}, every n in [1, 8] and every m in
// [1, 5], with cm_stride set to 11 and one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 1; k_val <= 20; k_val += 5) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18558
// qmin: single m=5, n=8, k=4 run with qmin set to 128.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18572
// qmax: single m=5, n=8, k=4 run with qmax set to 128.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18586
// strided_cm: single m=5, n=8, k=4 run with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_DUP, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_dup,
            xnn_init_f32_minmax_sse_params);
}
18600 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18601
18602
18603 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_1: single m=5, n=8, k=1 run with no extra strides or clamping set.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_eq_1) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18616
// strided_cn: single m=5, n=8, k=1 run with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, strided_cn) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18630
// k_eq_1_strided_a: single m=5, n=8, k=1 run with a_stride set to 3.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18644
// k_eq_1_subtile: k=1 with every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_eq_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(m_val).n(n_val).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18662
// k_eq_1_subtile_m: n=8, k=1 with every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(m_val).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
18678
// k_eq_1_subtile_n: m=5, k=1 with every n in [1, 8],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(n_val).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
18694
// k_gt_1: m=5, n=8 with every k in [2, 9].
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_gt_1) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 2; k_val < 10; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
18709
// k_gt_1_strided_a: m=5, n=8 with every k in [2, 9] and a_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 2; k_val < 10; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
              xnn_init_f32_minmax_sse_params);
  }
}
18725
// k_gt_1_subtile: every k in [2, 9], every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, k_gt_1_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 2; k_val < 10; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18745
// n_gt_8: every n in [9, 15] crossed with k in {1, 3, 5} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_gt_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18762
// n_gt_8_strided_cn: every n in [9, 15] crossed with k in {1, 3, 5} at m=5,
// with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18780
// n_gt_8_strided_a: every n in [9, 15] crossed with k in {1, 3, 5} at m=5,
// with a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18798
// n_gt_8_subtile: every n in [9, 15], k in {1, 3, 5} and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18818
// n_div_8: n in {16, 24} crossed with k in {1, 3, 5} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_div_8) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18835
// n_div_8_strided_cn: n in {16, 24} crossed with k in {1, 3, 5} at m=5,
// with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18853
// n_div_8_strided_a: n in {16, 24} crossed with k in {1, 3, 5} at m=5,
// with a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
18871
// n_div_8_subtile: n in {16, 24}, k in {1, 3, 5} and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18891
// strided_cm_subtile: k in {1, 3, 5}, every n in [1, 8] and every m in [1, 5],
// with cm_stride set to 11 and one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE;
  for (size_t k_val = 1; k_val <= 5; k_val += 2) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
18912
// qmin: single m=5, n=8, k=1 run with qmin set to 128.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, qmin) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18926
// qmax: single m=5, n=8, k=1 run with qmax set to 128.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, qmax) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18940
// strided_cm: single m=5, n=8, k=1 run with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE_LOAD1, strided_cm) {
  TEST_REQUIRES_X86_SSE;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse_load1,
            xnn_init_f32_minmax_sse_params);
}
18954 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18955
18956
18957 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_4: single m=5, n=8, k=4 run with no extra strides or clamping set.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_eq_4) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
18970
// strided_cn: single m=5, n=8, k=4 run with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
18984
// k_eq_4_strided_a: single m=5, n=8, k=4 run with a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_eq_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
18998
// k_eq_4_subtile: k=4 with every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_eq_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(m_val).n(n_val).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19016
// k_eq_4_subtile_m: n=8, k=4 with every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_eq_4_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(m_val).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19032
// k_eq_4_subtile_n: m=5, k=4 with every n in [1, 8],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_eq_4_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(n_val).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19048
// k_lt_4: m=5, n=8 with k in {1, 2, 3}.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_lt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19063
// k_lt_4_strided_a: m=5, n=8 with k in {1, 2, 3} and a_stride set to 7.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_lt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19079
// k_lt_4_subtile: k in {1, 2, 3}, every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_lt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19099
// k_gt_4: m=5, n=8 with k in {5, 6, 7}.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19114
// k_gt_4_strided_a: m=5, n=8 with k in {5, 6, 7} and a_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19130
// k_gt_4_subtile: k in {5, 6, 7}, every n in [1, 8] and every m in [1, 5],
// one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19150
// k_div_4: m=5, n=8 with k stepping by 4 from 8 through 40.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19165
// k_div_4_strided_a: m=5, n=8 with k stepping by 4 from 8 through 40
// and a_stride set to 43.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k_val)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
              xnn_init_f32_minmax_sse_params);
  }
}
19181
// k_div_4_subtile: k stepping by 4 from 8 through 40, every n in [1, 8] and
// every m in [1, 5], one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, k_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    for (uint32_t n_val = 1; n_val <= 8; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19201
// n_gt_8: every n in [9, 15] crossed with k in {1, 6, 11, 16} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19218
// n_gt_8_strided_cn: every n in [9, 15] crossed with k in {1, 6, 11, 16}
// at m=5, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19236
// n_gt_8_strided_a: every n in [9, 15] crossed with k in {1, 6, 11, 16}
// at m=5, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19254
// n_gt_8_subtile: every n in [9, 15], k in {1, 6, 11, 16} and every m in
// [1, 5], one iteration per configuration.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 9; n_val < 16; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 5; ++m_val) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m_val).n(n_val).k(k_val)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19274
// n_div_8: n in {16, 24} crossed with k in {1, 6, 11, 16} at m=5.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19291
// n_div_8_strided_cn: n in {16, 24} crossed with k in {1, 6, 11, 16} at m=5,
// with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 16; n_val <= 24; n_val += 8) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n_val).k(k_val)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19309
// 5x8 SSE2 dup GEMM: n in {16, 24}, k in {1, 6, 11, 16}, m=5, with a_stride(23).
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                xnn_init_f32_minmax_sse_params);
    }
  }
}
19327
// 5x8 SSE2 dup GEMM: n in {16, 24}, k in {1, 6, 11, 16}, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, n_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19347
// 5x8 SSE2 dup GEMM: k in {1, 6, 11, 16}, n in 1..8, m in 1..5, with cm_stride(11);
// one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
                  xnn_init_f32_minmax_sse_params);
      }
    }
  }
}
19368
// 5x8 SSE2 dup GEMM: single case m=5, n=8, k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
19382
// 5x8 SSE2 dup GEMM: single case m=5, n=8, k=4 with qmax(128).
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
19396
// 5x8 SSE2 dup GEMM: single case m=5, n=8, k=4 with cm_stride(11).
TEST(F32_GEMM_MINMAX_5X8__SSE2_DUP, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__sse2_dup,
            xnn_init_f32_minmax_sse_params);
}
19410 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19411
19412
19413 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19426
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1 with cn_stride(11).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19440
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19454
// 1x8 AVX broadcast GEMM: k=1, n in 1..8, m in 1..1; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19472
// 1x8 AVX broadcast GEMM: k=1, n=8, m in 1..1; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(mc).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19488
// 1x8 AVX broadcast GEMM: k=1, m=1, n in 1..8; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19504
// 1x8 AVX broadcast GEMM: m=1, n=8, k in 2..9.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19519
// 1x8 AVX broadcast GEMM: m=1, n=8, k in 2..9, with a_stride(11).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(8).k(kc)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19535
// 1x8 AVX broadcast GEMM: k in 2..9, n in 1..8, m in 1..1; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19555
// 1x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=1.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19572
// 1x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=1, with cn_stride(11).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19590
// 1x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=1, with a_stride(7).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19608
// 1x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m in 1..1; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19628
// 1x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=1.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19645
// 1x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=1, with cn_stride(11).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19663
// 1x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=1, with a_stride(7).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(1).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19681
// 1x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m in 1..1; one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19701
// 1x8 AVX broadcast GEMM: k in {1, 3, 5}, n in 1..8, m in 1..1, with cm_stride(11);
// one iteration per case.
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 5; kc += 2) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19722
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19736
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19750
// 1x8 AVX broadcast GEMM: single case m=1, n=8, k=1 with cm_stride(11).
TEST(F32_GEMM_MINMAX_1X8__AVX_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19764 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19765
19766
19767 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19780
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1 with cn_stride(19).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .cn_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19794
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
19808
// 3x16 AVX broadcast GEMM: k=1, n in 1..16, m in 1..3; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    for (uint32_t mc = 1; mc <= 3; ++mc) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19826
// 3x16 AVX broadcast GEMM: k=1, n=16, m in 1..3; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 3; ++mc) {
    GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(mc).n(16).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19842
// 3x16 AVX broadcast GEMM: k=1, m=3, n in 1..16; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(nc).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19858
// 3x16 AVX broadcast GEMM: m=3, n=16, k in 2..9.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(16).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19873
// 3x16 AVX broadcast GEMM: m=3, n=16, k in 2..9, with a_stride(11).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(16).k(kc)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
19889
// 3x16 AVX broadcast GEMM: k in 2..9, n in 1..16, m in 1..3; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(16).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19909
// 3x16 AVX broadcast GEMM: n in 17..31, k in {1, 3, 5}, m=3.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 17; nc <= 31; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19926
// 3x16 AVX broadcast GEMM: n in 17..31, k in {1, 3, 5}, m=3, with cn_stride(19).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 17; nc <= 31; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(19)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19944
// 3x16 AVX broadcast GEMM: n in 17..31, k in {1, 3, 5}, m=3, with a_stride(7).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 17; nc <= 31; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19962
// 3x16 AVX broadcast GEMM: n in 17..31, k in {1, 3, 5}, m in 1..3; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 17; nc <= 31; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(16).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
19982
// 3x16 AVX broadcast GEMM: n in {32, 48}, k in {1, 3, 5}, m=3.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
19999
// 3x16 AVX broadcast GEMM: n in {32, 48}, k in {1, 3, 5}, m=3, with cn_stride(19).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(19)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20017
// 3x16 AVX broadcast GEMM: n in {32, 48}, k in {1, 3, 5}, m=3, with a_stride(7).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20035
// 3x16 AVX broadcast GEMM: n in {32, 48}, k in {1, 3, 5}, m in 1..3; one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(16).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20055
// 3x16 AVX broadcast GEMM: k in {1, 3, 5}, n in 1..16, m in 1..3, with cm_stride(19);
// one iteration per case.
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 5; kc += 2) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(16).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(19)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20076
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20090
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20104
// 3x16 AVX broadcast GEMM: single case m=3, n=16, k=1 with cm_stride(19).
TEST(F32_GEMM_MINMAX_3X16__AVX_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(1)
      .cm_stride(19)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20118 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20119
20120
20121 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20134
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1 with cn_stride(11).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20148
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20162
// 5x8 AVX broadcast GEMM: k=1, n in 1..8, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 5; ++mc) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20180
// 5x8 AVX broadcast GEMM: k=1, n=8, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 5; ++mc) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(mc).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
20196
// 5x8 AVX broadcast GEMM: k=1, m=5, n in 1..8; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(nc).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
20212
// 5x8 AVX broadcast GEMM: m=5, n=8, k in 2..9.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(kc)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
20227
// 5x8 AVX broadcast GEMM: m=5, n=8, k in 2..9, with a_stride(11).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(kc)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
              xnn_init_f32_minmax_avx_params);
  }
}
20243
// 5x8 AVX broadcast GEMM: k in 2..9, n in 1..8, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 2; kc <= 9; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20263
// 5x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=5.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20280
// 5x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=5, with cn_stride(11).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20298
// 5x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m=5, with a_stride(7).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20316
// 5x8 AVX broadcast GEMM: n in 9..15, k in {1, 3, 5}, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 9; nc <= 15; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20336
// 5x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=5.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20353
// 5x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=5, with cn_stride(11).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20371
// 5x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m=5, with a_stride(7).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(nc).k(kc)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                xnn_init_f32_minmax_avx_params);
    }
  }
}
20389
// 5x8 AVX broadcast GEMM: n in {16, 24}, k in {1, 3, 5}, m in 1..5; one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20409
// 5x8 AVX broadcast GEMM: k in {1, 3, 5}, n in 1..8, m in 1..5, with cm_stride(11);
// one iteration per case.
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 5; kc += 2) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
                  xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20430
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20444
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20458
// 5x8 AVX broadcast GEMM: single case m=5, n=8, k=1 with cm_stride(11).
TEST(F32_GEMM_MINMAX_5X8__AVX_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast,
            xnn_init_f32_minmax_avx_params);
}
20472 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20473
20474
20475 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run with m = 6, n = 8 (matching mr x nr) and k = 1.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20488
// Single run with m = 6, n = 8, k = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1).cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20502
// Single run with m = 6, n = 8, k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20516
// Sweeps n over 1..8 and m over 1..6 at k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20534
// Sweeps m over 1..6 with n = 8 and k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(m_size).n(8).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20550
// Sweeps n over 1..8 with m = 6 and k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(n_size).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20566
// Sweeps k over 2..9 with m = 6 and n = 8.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(k_size);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20581
// Sweeps k over 2..9 with m = 6, n = 8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(k_size).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20597
// Sweeps k over 2..9, n over 1..8 and m over 1..6, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20617
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 6.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20634
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 6 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20652
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 6 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20670
// Sweeps n over 9..15, k over {1, 3, 5} and m over 1..6, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20690
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 6.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20707
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 6 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20725
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 6 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20743
// Sweeps n over {16, 24}, k over {1, 3, 5} and m over 1..6, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20763
// Sweeps k over {1, 3, 5}, n over 1..8 and m over 1..6 with cm_stride set to 11, using iterations(1).
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20784
// Single run with m = 6, n = 8, k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20798
// Single run with m = 6, n = 8, k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20812
// Single run with m = 6, n = 8, k = 1 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__AVX_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(1).cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast, xnn_init_f32_minmax_avx_params);
}
20826 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20827
20828
20829 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run with m = 1, n = 8 (matching mr x nr) and k = 1.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
20842
// Single run with m = 1, n = 8, k = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1).cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
20856
// Single run with m = 1, n = 8, k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
20870
// Sweeps n over 1..8 and m over 1..1 (a single value) at k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20888
// Sweeps m over 1..1 (a single value) with n = 8 and k = 1, using iterations(1).
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(m_size).n(8).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20904
// Sweeps n over 1..8 with m = 1 and k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(n_size).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20920
// Sweeps k over 2..9 with m = 1 and n = 8.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20935
// Sweeps k over 2..9 with m = 1, n = 8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
20951
// Sweeps k over 2..9, n over 1..8 and m over 1..1 (a single value), using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
20971
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
20988
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21006
// Sweeps n over 9..15 and k over {1, 3, 5} with m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21024
// Sweeps n over 9..15, k over {1, 3, 5} and m over 1..1 (a single value), using iterations(1).
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21044
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 1.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21061
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21079
// Sweeps n over {16, 24} and k over {1, 3, 5} with m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21097
// Sweeps n over {16, 24}, k over {1, 3, 5} and m over 1..1 (a single value), using iterations(1).
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21117
// Sweeps k over {1, 3, 5}, n over 1..8 and m over 1..1 with cm_stride set to 11, using iterations(1).
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21138
// Single run with m = 1, n = 8, k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, qmin) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21152
// Single run with m = 1, n = 8, k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, qmax) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21166
// Single run with m = 1, n = 8, k = 1 and cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__FMA3_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1);
  tester.m(1).n(8).k(1).cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x8__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21180 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21181
21182
21183 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run with m = 1, n = 16 (matching mr x nr) and k = 1.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21196
// Single run with m = 1, n = 16, k = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1).cn_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21210
// Single run with m = 1, n = 16, k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21224
// Sweeps n over 1..16 and m over 1..1 (a single value) at k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21242
// Sweeps m over 1..1 (a single value) with n = 16 and k = 1, using iterations(1).
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(m_size).n(16).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
21258
// Sweeps n over 1..16 with m = 1 and k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(n_size).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
21274
// Sweeps k over 2..9 with m = 1 and n = 16.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
21289
// Sweeps k over 2..9 with m = 1, n = 16 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
21305
// Sweeps k over 2..9, n over 1..16 and m over 1..1 (a single value), using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21325
// Sweeps n over 17..31 and k over {1, 3, 5} with m = 1.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21342
// Sweeps n over 17..31 and k over {1, 3, 5} with m = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21360
// Sweeps n over 17..31 and k over {1, 3, 5} with m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21378
// Sweeps n over 17..31, k over {1, 3, 5} and m over 1..1 (a single value), using iterations(1).
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21398
// Sweeps n over {32, 48} and k over {1, 3, 5} with m = 1.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21415
// Sweeps n over {32, 48} and k over {1, 3, 5} with m = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21433
// Sweeps n over {32, 48} and k over {1, 3, 5} with m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21451
// Sweeps n over {32, 48}, k over {1, 3, 5} and m over 1..1 (a single value), using iterations(1).
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21471
// Sweeps k over {1, 3, 5}, n over 1..16 and m over 1..1 with cm_stride set to 19, using iterations(1).
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).cm_stride(19).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21492
// Single run with m = 1, n = 16, k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, qmin) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21506
// Single run with m = 1, n = 16, k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, qmax) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21520
// Single run with m = 1, n = 16, k = 1 and cm_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__FMA3_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1).cm_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21534 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21535
21536
21537 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run with m = 3, n = 16 (matching mr x nr) and k = 1.
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(16).kr(1).sr(1);
  tester.m(3).n(16).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21550
// Single run with m = 3, n = 16, k = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(16).kr(1).sr(1);
  tester.m(3).n(16).k(1).cn_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21564
// Single run with m = 3, n = 16, k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(16).kr(1).sr(1);
  tester.m(3).n(16).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
}
21578
// Sweeps n over 1..16 and m over 1..3 at k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(16).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
    }
  }
}
21596
// Sweeps m over 1..3 with n = 16 and k = 1, using iterations(1) for each shape.
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_FMA3;
  for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(16).kr(1).sr(1);
    tester.m(m_size).n(16).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast, xnn_init_f32_minmax_avx_params);
  }
}
21612
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_FMA3;
  // n in [1, 16] with m fixed at 3 and k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 16; ++cols) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(cols).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21628
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (3), n == nr (16) for each k in [2, 9].
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(depth)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21643
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (3), n == nr (16) for each k in [2, 9], with a_stride set to 11.
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(depth)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21659
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 3] x [1, 16] for each k in [2, 9]; iterations(1) per shape.
  for (size_t depth = 2; depth <= 9; ++depth) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21679
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_FMA3;
  // n in [17, 31] with m = 3, for k in {1, 3, 5}.
  for (uint32_t cols = 17; cols <= 31; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21696
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in [17, 31] with m = 3, for k in {1, 3, 5}, with cn_stride set to 19.
  for (uint32_t cols = 17; cols <= 31; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .cn_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21714
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in [17, 31] with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 17; cols <= 31; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21732
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in [17, 31], k in {1, 3, 5}, m in [1, 3]; iterations(1) per shape.
  for (uint32_t cols = 17; cols <= 31; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21752
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_FMA3;
  // n in {32, 48} with m = 3, for k in {1, 3, 5}.
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21769
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in {32, 48} with m = 3, for k in {1, 3, 5}, with cn_stride set to 19.
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .cn_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21787
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in {32, 48} with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21805
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in {32, 48}, k in {1, 3, 5}, m in [1, 3]; iterations(1) per shape.
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21825
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 3] x [1, 16] for k in {1, 3, 5}, with cm_stride set to 19;
  // iterations(1) per shape.
  for (size_t depth = 1; depth <= 5; depth += 2) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
21846
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, qmin) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (3), n == nr (16), k = 1, with qmin set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(16).kr(1).sr(1)
    .m(3).n(16).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21860
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, qmax) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (3), n == nr (16), k = 1, with qmax set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(16).kr(1).sr(1)
    .m(3).n(16).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21874
TEST(F32_GEMM_MINMAX_3X16__FMA3_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (3), n == nr (16), k = 1, with cm_stride set to 19.
  GemmMicrokernelTester()
    .mr(3).nr(16).kr(1).sr(1)
    .m(3).n(16).k(1)
    .cm_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21888 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21889
21890
21891 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21904
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1, with cn_stride set to 11.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21918
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1, with a_stride set to 3.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
21932
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) with m in [1, 4] and n in [1, 8] at k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(rows).n(cols).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
21950
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_FMA3;
  // m in [1, 4] with n fixed at 8 and k = 1; iterations(1) per shape.
  for (uint32_t rows = 1; rows <= 4; ++rows) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(rows).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21966
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_FMA3;
  // n in [1, 8] with m fixed at 4 and k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(cols).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21982
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (4), n == nr (8) for each k in [2, 9].
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(depth)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
21997
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (4), n == nr (8) for each k in [2, 9], with a_stride set to 11.
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(depth)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
22013
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 4] x [1, 8] for each k in [2, 9]; iterations(1) per shape.
  for (size_t depth = 2; depth <= 9; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22033
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 4, for k in {1, 3, 5}.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22050
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 4, for k in {1, 3, 5}, with cn_stride set to 11.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22068
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 4, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22086
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15], k in {1, 3, 5}, m in [1, 4]; iterations(1) per shape.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22106
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 4, for k in {1, 3, 5}.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22123
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 4, for k in {1, 3, 5}, with cn_stride set to 11.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22141
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 4, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22159
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24}, k in {1, 3, 5}, m in [1, 4]; iterations(1) per shape.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22179
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 4] x [1, 8] for k in {1, 3, 5}, with cm_stride set to 11;
  // iterations(1) per shape.
  for (size_t depth = 1; depth <= 5; depth += 2) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22200
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, qmin) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1, with qmin set to 128.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22214
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, qmax) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1, with qmax set to 128.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22228
TEST(F32_GEMM_MINMAX_4X8__FMA3_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (4), n == nr (8), k = 1, with cm_stride set to 11.
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22242 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22243
22244
22245 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22258
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1, with cn_stride set to 11.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22272
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1, with a_stride set to 3.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22286
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) with m in [1, 5] and n in [1, 8] at k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 5; ++rows) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(rows).n(cols).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22304
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_FMA3;
  // m in [1, 5] with n fixed at 8 and k = 1; iterations(1) per shape.
  for (uint32_t rows = 1; rows <= 5; ++rows) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(rows).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
22320
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_FMA3;
  // n in [1, 8] with m fixed at 5 and k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(cols).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
22336
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (5), n == nr (8) for each k in [2, 9].
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(depth)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
22351
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // m == mr (5), n == nr (8) for each k in [2, 9], with a_stride set to 11.
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(depth)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
            xnn_init_f32_minmax_avx_params);
  }
}
22367
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 5] x [1, 8] for each k in [2, 9]; iterations(1) per shape.
  for (size_t depth = 2; depth <= 9; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 5; ++rows) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22387
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 5, for k in {1, 3, 5}.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22404
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 5, for k in {1, 3, 5}, with cn_stride set to 11.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22422
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15] with m = 5, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22440
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in [9, 15], k in {1, 3, 5}, m in [1, 5]; iterations(1) per shape.
  for (uint32_t cols = 9; cols <= 15; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 5; ++rows) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22460
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 5, for k in {1, 3, 5}.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22477
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 5, for k in {1, 3, 5}, with cn_stride set to 11.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22495
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_div_8_strided_a) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24} with m = 5, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cols).k(depth)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
              xnn_init_f32_minmax_avx_params);
    }
  }
}
22513
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // n in {16, 24}, k in {1, 3, 5}, m in [1, 5]; iterations(1) per shape.
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      for (uint32_t rows = 1; rows <= 5; ++rows) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22533
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_FMA3;
  // Every (m, n) in [1, 5] x [1, 8] for k in {1, 3, 5}, with cm_stride set to 11;
  // iterations(1) per shape.
  for (size_t depth = 1; depth <= 5; depth += 2) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 5; ++rows) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
                xnn_init_f32_minmax_avx_params);
      }
    }
  }
}
22554
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, qmin) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1, with qmin set to 128.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22568
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, qmax) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1, with qmax set to 128.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22582
TEST(F32_GEMM_MINMAX_5X8__FMA3_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_FMA3;
  // Single case: m == mr (5), n == nr (8), k = 1, with cm_stride set to 11.
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast,
          xnn_init_f32_minmax_avx_params);
}
22596 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22597
22598
22599 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX512F;
  // Single case: m == mr (1), n == nr (16), k = 1.
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
          xnn_init_f32_minmax_scalar_params);
}
22612
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  // Single case: m == mr (1), n == nr (16), k = 1, with cn_stride set to 19.
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(1)
    .cn_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
          xnn_init_f32_minmax_scalar_params);
}
22626
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  // Single case: m == mr (1), n == nr (16), k = 1, with a_stride set to 3.
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
          xnn_init_f32_minmax_scalar_params);
}
22640
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  // n in [1, 16] at k = 1; the m loop covers only m = 1 since mr is 1.
  // iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 16; ++cols) {
    for (uint32_t rows = 1; rows <= 1; ++rows) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(rows).n(cols).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
22658
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX512F;
  // The m loop covers only m = 1 (mr is 1), with n fixed at 16 and k = 1;
  // iterations(1) per shape.
  for (uint32_t rows = 1; rows <= 1; ++rows) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(rows).n(16).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
            xnn_init_f32_minmax_scalar_params);
  }
}
22674
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX512F;
  // n in [1, 16] with m fixed at 1 and k = 1; iterations(1) per shape.
  for (uint32_t cols = 1; cols <= 16; ++cols) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(cols).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
            xnn_init_f32_minmax_scalar_params);
  }
}
22690
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX512F;
  // m == mr (1), n == nr (16) for each k in [2, 9].
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(depth)
      .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
            xnn_init_f32_minmax_scalar_params);
  }
}
22705
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  // m == mr (1), n == nr (16) for each k in [2, 9], with a_stride set to 11.
  for (size_t depth = 2; depth <= 9; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(depth)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
            xnn_init_f32_minmax_scalar_params);
  }
}
22721
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  // n in [1, 16] for each k in [2, 9]; the m loop covers only m = 1 since mr is 1.
  // iterations(1) per shape.
  for (size_t depth = 2; depth <= 9; ++depth) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
22741
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX512F;
  // n in [17, 31] with m = 1, for k in {1, 3, 5}.
  for (uint32_t cols = 17; cols <= 31; ++cols) {
    for (size_t depth = 1; depth <= 5; depth += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(cols).k(depth)
        .Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
22758
// n swept over 17..31 with k in {1, 3, 5}, m = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
22776
// n swept over 17..31 with k in {1, 3, 5}, m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
22794
// Every (n, k, m) combination with n in 17..31, k in {1, 3, 5} and m in 1..1,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      // mr is 1, so the m loop has a single iteration.
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
22814
// n in {32, 48} (multiples of nr = 16) with k in {1, 3, 5} and m = 1.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
22831
// n in {32, 48} with k in {1, 3, 5}, m = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
22849
// n in {32, 48} with k in {1, 3, 5}, m = 1 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
22867
// Every (n, k, m) combination with n in {32, 48}, k in {1, 3, 5} and m in 1..1,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      // mr is 1, so the m loop has a single iteration.
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
22887
// Every (k, n, m) combination with k in {1, 3, 5}, n in 1..16 and m in 1..1,
// with cm_stride set to 19 and iterations(1).
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 1; k_val <= 5; k_val += 2) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      // mr is 1, so the m loop has a single iteration.
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.cm_stride(19);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
22908
// Full 1x16 tile with k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22922
// Full 1x16 tile with k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22936
// Full 1x16 tile with k = 1 and cm_stride set to 19.
TEST(F32_GEMM_MINMAX_1X16__AVX512F_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(1);
  tester.cm_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22950 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22951
22952
22953 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 6x16 tile (m = 6, n = 16) with k = 1; no stride overrides.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22966
// Full 6x16 tile with k = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.cn_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22980
// Full 6x16 tile with k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
22994
// Every (n, m) combination with n in 1..16 and m in 1..6 at k = 1,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(m_val).n(n_val).k(1);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23012
// m swept over 1..6 with n = 16 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(16).kr(1).sr(1);
    tester.m(m_val).n(16).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23028
// n swept over 1..16 with m = 6 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(16).kr(1).sr(1);
    tester.m(6).n(n_val).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23044
// Full 6x16 tile (m = 6, n = 16) with k swept over 2..9; no stride overrides.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    // A fresh tester is built for every k so no state carries over between runs.
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(16).kr(1).sr(1);
    tester.m(6).n(16).k(k_val);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23059
// Full 6x16 tile with k swept over 2..9 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(16).kr(1).sr(1);
    tester.m(6).n(16).k(k_val);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23075
// Every (k, n, m) combination with k in 2..9, n in 1..16 and m in 1..6,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23095
// n swept over 17..31 (above nr = 16) with k in {1, 3, 5} and m = 6.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23112
// n swept over 17..31 with k in {1, 3, 5}, m = 6 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23130
// n swept over 17..31 with k in {1, 3, 5}, m = 6 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23148
// Every (n, k, m) combination with n in 17..31, k in {1, 3, 5} and m in 1..6,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23168
// n in {32, 48} (multiples of nr = 16) with k in {1, 3, 5} and m = 6.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23185
// n in {32, 48} with k in {1, 3, 5}, m = 6 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23203
// n in {32, 48} with k in {1, 3, 5}, m = 6 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(16).kr(1).sr(1);
      tester.m(6).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23221
// Every (n, k, m) combination with n in {32, 48}, k in {1, 3, 5} and m in 1..6,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23241
// Every (k, n, m) combination with k in {1, 3, 5}, n in 1..16 and m in 1..6,
// with cm_stride set to 19 and iterations(1).
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 1; k_val <= 5; k_val += 2) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.cm_stride(19);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23262
// Full 6x16 tile with k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23276
// Full 6x16 tile with k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23290
// Full 6x16 tile with k = 1 and cm_stride set to 19.
TEST(F32_GEMM_MINMAX_6X16__AVX512F_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(6).nr(16).kr(1).sr(1);
  tester.m(6).n(16).k(1);
  tester.cm_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23304 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23305
23306
23307 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 7x16 tile (m = 7, n = 16) with k = 1; no stride overrides.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23320
// Full 7x16 tile with k = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.cn_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23334
// Full 7x16 tile with k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23348
// Every (n, m) combination with n in 1..16 and m in 1..7 at k = 1,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(m_val).n(n_val).k(1);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23366
// m swept over 1..7 with n = 16 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
    GemmMicrokernelTester tester{};
    tester.mr(7).nr(16).kr(1).sr(1);
    tester.m(m_val).n(16).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23382
// n swept over 1..16 with m = 7 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    GemmMicrokernelTester tester{};
    tester.mr(7).nr(16).kr(1).sr(1);
    tester.m(7).n(n_val).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23398
// Full 7x16 tile (m = 7, n = 16) with k swept over 2..9; no stride overrides.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    // A fresh tester is built for every k so no state carries over between runs.
    GemmMicrokernelTester tester{};
    tester.mr(7).nr(16).kr(1).sr(1);
    tester.m(7).n(16).k(k_val);
    tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23413
// Full 7x16 tile with k swept over 2..9 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    GemmMicrokernelTester tester{};
    tester.mr(7).nr(16).kr(1).sr(1);
    tester.m(7).n(16).k(k_val);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23429
// Every (k, n, m) combination with k in 2..9, n in 1..16 and m in 1..7,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(7).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23449
// n swept over 17..31 (above nr = 16) with k in {1, 3, 5} and m = 7.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23466
// n swept over 17..31 with k in {1, 3, 5}, m = 7 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23484
// n swept over 17..31 with k in {1, 3, 5}, m = 7 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23502
// Every (n, k, m) combination with n in 17..31, k in {1, 3, 5} and m in 1..7,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(7).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23522
// n in {32, 48} (multiples of nr = 16) with k in {1, 3, 5} and m = 7.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23539
// n in {32, 48} with k in {1, 3, 5}, m = 7 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23557
// n in {32, 48} with k in {1, 3, 5}, m = 7 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(7).nr(16).kr(1).sr(1);
      tester.m(7).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23575
// Every (n, k, m) combination with n in {32, 48}, k in {1, 3, 5} and m in 1..7,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(7).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23595
// Every (k, n, m) combination with k in {1, 3, 5}, n in 1..16 and m in 1..7,
// with cm_stride set to 19 and iterations(1).
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 1; k_val <= 5; k_val += 2) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 7; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(7).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.cm_stride(19);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23616
// Full 7x16 tile with k = 1 and qmin set to 128.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23630
// Full 7x16 tile with k = 1 and qmax set to 128.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23644
// Full 7x16 tile with k = 1 and cm_stride set to 19.
TEST(F32_GEMM_MINMAX_7X16__AVX512F_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(7).nr(16).kr(1).sr(1);
  tester.m(7).n(16).k(1);
  tester.cm_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23658 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23659
23660
23661 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 8x16 tile (m = 8, n = 16) with k = 1; no stride overrides.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_eq_1) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(16).kr(1).sr(1);
  tester.m(8).n(16).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23674
// Full 8x16 tile with k = 1 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(16).kr(1).sr(1);
  tester.m(8).n(16).k(1);
  tester.cn_stride(19);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23688
// Full 8x16 tile with k = 1 and a_stride set to 3.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester tester{};
  tester.mr(8).nr(16).kr(1).sr(1);
  tester.m(8).n(16).k(1);
  tester.a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23702
// Every (n, m) combination with n in 1..16 and m in 1..8 at k = 1,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 8; ++m_val) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(16).kr(1).sr(1);
      tester.m(m_val).n(n_val).k(1);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23720
// m swept over 1..8 with n = 16 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t m_val = 1; m_val <= 8; ++m_val) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(16).kr(1).sr(1);
    tester.m(m_val).n(16).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23736
// n swept over 1..16 with m = 8 and k = 1, each run with iterations(1).
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(16).kr(1).sr(1);
    tester.m(8).n(n_val).k(1);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23752
// Full 8x16 tile (m = 8, n = 16) with k swept over 2..9; no stride overrides.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    // A fresh tester is built for every k so no state carries over between runs.
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(16).kr(1).sr(1);
    tester.m(8).n(16).k(k_val);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23767
// Full 8x16 tile with k swept over 2..9 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    GemmMicrokernelTester tester{};
    tester.mr(8).nr(16).kr(1).sr(1);
    tester.m(8).n(16).k(k_val);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
  }
}
23783
// Every (k, n, m) combination with k in 2..9, n in 1..16 and m in 1..8,
// each run with iterations(1).
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k_val = 2; k_val <= 9; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 8; ++m_val) {
        GemmMicrokernelTester tester{};
        tester.mr(8).nr(16).kr(1).sr(1);
        tester.m(m_val).n(n_val).k(k_val);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23803
// n swept over 17..31 (above nr = 16) with k in {1, 3, 5} and m = 8.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(16).kr(1).sr(1);
      tester.m(8).n(n_val).k(k_val);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23820
// n swept over 17..31 with k in {1, 3, 5}, m = 8 and cn_stride set to 19.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(16).kr(1).sr(1);
      tester.m(8).n(n_val).k(k_val);
      tester.cn_stride(19);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23838
// n swept over 17..31 with k in {1, 3, 5}, m = 8 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n_val = 17; n_val <= 31; ++n_val) {
    for (size_t k_val = 1; k_val <= 5; k_val += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(8).nr(16).kr(1).sr(1);
      tester.m(8).n(n_val).k(k_val);
      tester.a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23856
// Sweeps n in [17, 31], k in {1, 3, 5} and m in [1, 8], with iterations set to 1.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23876
// Runs m = 8 with n in {32, 48} and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23893
// Runs m = 8 with n in {32, 48} and k in {1, 3, 5}, with cn_stride set to 19.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23911
// Runs m = 8 with n in {32, 48} and k in {1, 3, 5}, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
    }
  }
}
23929
// Sweeps n in {32, 48}, k in {1, 3, 5} and m in [1, 8], with iterations set to 1.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23949
// Sweeps k in {1, 3, 5}, n in [1, 16] and m in [1, 8], with cm_stride set to 19 and iterations set to 1.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512F;
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
23970
// Runs the full 8x16 tile at k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester()
    .mr(8).nr(16).kr(1).sr(1)
    .m(8).n(16).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23984
// Runs the full 8x16 tile at k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, qmax) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester()
    .mr(8).nr(16).kr(1).sr(1)
    .m(8).n(16).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
23998
// Runs the full 8x16 tile at k = 1 with cm_stride set to 19.
TEST(F32_GEMM_MINMAX_8X16__AVX512F_BROADCAST, strided_cm) {
  TEST_REQUIRES_X86_AVX512F;
  GemmMicrokernelTester()
    .mr(8).nr(16).kr(1).sr(1)
    .m(8).n(16).k(1)
    .cm_stride(19)
    .Test(xnn_f32_gemm_minmax_ukernel_8x16__avx512f_broadcast, xnn_init_f32_minmax_scalar_params);
}
24012 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24013
24014
24015 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Runs the full 1x8 tile at k = 4.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_eq_4) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24027
// Runs the full 1x8 tile at k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24040
// Runs the full 1x8 tile at k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24053
// Sweeps n in [1, 8] and m in [1, 1] at k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(m).n(n).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24070
// Sweeps m in [1, 1] at n = 8 and k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile_m) {
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(m).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24085
// Sweeps n in [1, 8] at m = 1 and k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(n).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24100
// Runs the full 1x8 tile for k in [1, 3].
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_lt_4) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24114
// Runs the full 1x8 tile for k in [1, 3] with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_lt_4_strided_a) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24129
// Sweeps k in [1, 3], n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_lt_4_subtile) {
  for (size_t k = 1; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24148
// Runs the full 1x8 tile for k in [5, 7].
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_gt_4) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24162
// Runs the full 1x8 tile for k in [5, 7] with a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_gt_4_strided_a) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24177
// Sweeps k in [5, 7], n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_gt_4_subtile) {
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24196
// Runs the full 1x8 tile for k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_div_4) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24210
// Runs the full 1x8 tile for k = 8, 12, ..., 40 with a_stride set to 43.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_div_4_strided_a) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24225
// Sweeps k = 8, 12, ..., 40, n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, k_div_4_subtile) {
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24244
// Runs m = 1 with n in [9, 15] and k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_gt_8) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24260
// Runs m = 1 with n in [9, 15] and k in {1, 6, 11, 16}, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_gt_8_strided_cn) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24277
// Runs m = 1 with n in [9, 15] and k in {1, 6, 11, 16}, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_gt_8_strided_a) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24294
// Sweeps n in [9, 15], k in {1, 6, 11, 16} and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_gt_8_subtile) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24313
// Runs m = 1 with n in {16, 24} and k in {1, 6, 11, 16}.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_div_8) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24329
// Runs m = 1 with n in {16, 24} and k in {1, 6, 11, 16}, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_div_8_strided_cn) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24346
// Runs m = 1 with n in {16, 24} and k in {1, 6, 11, 16}, with a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_div_8_strided_a) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24363
// Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, n_div_8_subtile) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24382
// Sweeps k in {1, 6, 11, 16}, n in [1, 8] and m in [1, 1], with cm_stride set to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, strided_cm_subtile) {
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24402
// Runs the full 1x8 tile at k = 4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24415
// Runs the full 1x8 tile at k = 4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24428
// Runs the full 1x8 tile at k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_ARM_SPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24441 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24442
24443
24444 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Runs the full 1x8 tile at k = 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24456
// Runs the full 1x8 tile at k = 1 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24469
// Runs the full 1x8 tile at k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24482
// Sweeps n in [1, 8] and m in [1, 1] at k = 1, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(m).n(n).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24499
// Sweeps m in [1, 1] at n = 8 and k = 1, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(m).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24514
// Sweeps n in [1, 8] at m = 1 and k = 1, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(n).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24529
// Runs the full 1x8 tile for k in [2, 9].
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_gt_1) {
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24543
// Runs the full 1x8 tile for k in [2, 9] with a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24558
// Sweeps k in [2, 9], n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24577
// Runs m = 1 with n in [9, 15] and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_gt_8) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24593
// Runs m = 1 with n in [9, 15] and k in {1, 3, 5}, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24610
// Runs m = 1 with n in [9, 15] and k in {1, 3, 5}, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24627
// Sweeps n in [9, 15], k in {1, 3, 5} and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24646
// Runs m = 1 with n in {16, 24} and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_div_8) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24662
// Runs m = 1 with n in {16, 24} and k in {1, 3, 5}, with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24679
// Runs m = 1 with n in {16, 24} and k in {1, 3, 5}, with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24696
// Sweeps n in {16, 24}, k in {1, 3, 5} and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24715
// Sweeps k in {1, 3, 5}, n in [1, 8] and m in [1, 1], with cm_stride set to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, strided_cm_subtile) {
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24735
// Runs the full 1x8 tile at k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24748
// Runs the full 1x8 tile at k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24761
// Runs the full 1x8 tile at k = 1 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
24774 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
24775
24776
24777 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Runs the full 1x8 tile at k = 4.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_eq_4) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24789
// Runs the full 1x8 tile at k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24802
// Runs the full 1x8 tile at k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
}
24815
// Sweeps n in [1, 8] and m in [1, 1] at k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(m).n(n).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
24832
// Sweeps m in [1, 1] at n = 8 and k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile_m) {
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(m).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24847
// Sweeps n in [1, 8] at m = 1 and k = 4, with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(n).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24862
// Runs the full 1x8 tile for k in [1, 3].
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_lt_4) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24876
// Runs the full 1x8 tile for k in [1, 3] with a_stride set to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_lt_4_strided_a) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24891
// Sweeps k in [1, 3], n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_lt_4_subtile) {
  for (size_t k = 1; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24910
// Runs the full 1x8 tile for k in [5, 7].
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_gt_4) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24924
// Runs the full 1x8 tile for k in [5, 7] with a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_gt_4_strided_a) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
24939
// Sweeps k in [5, 7], n in [1, 8] and m in [1, 1], with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_gt_4_subtile) {
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
24958
// Sweeps k = 8, 12, ..., 40 (step 4) with m = 1 and n = 8.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_div_4) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
24972
// Sweeps k = 8, 12, ..., 40 (step 4) with m = 1, n = 8 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_div_4_strided_a) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(43)
      .Test(
        xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
24987
// Sweeps k = 8, 12, ..., 40 against every n in 1..8 and m in 1..1, one iteration per shape.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, k_div_4_subtile) {
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25006
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_gt_8) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25022
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_gt_8_strided_cn) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25039
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 1 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_gt_8_strided_a) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(23)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25056
// Sweeps n = 9..15, k = 1, 6, 11, 16 (step 5) and m in 1..1, one iteration per shape.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_gt_8_subtile) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25075
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 1.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_div_8) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25091
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 1 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_div_8_strided_cn) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25108
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 1 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_div_8_strided_a) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(23)
        .Test(
          xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25125
// Sweeps n = 16, 24 (step 8), k = 1, 6, 11, 16 (step 5) and m in 1..1, one iteration per shape.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, n_div_8_subtile) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25144
// Sweeps k = 1, 6, 11, 16 (step 5), n in 1..8 and m in 1..1 with cm_stride set to 11,
// one iteration per shape.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, strided_cm_subtile) {
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25164
// Single shape m = 1, n = 8, k = 4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .qmin(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25177
// Single shape m = 1, n = 8, k = 4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .qmax(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25190
// Single shape m = 1, n = 8, k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMSIMD_X86_SPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(4)
    .cm_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25203 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25204
25205
25206 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single shape m = 3, n = 8, k = 1.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25218
// Single shape m = 3, n = 8, k = 1 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .cn_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25231
// Single shape m = 3, n = 8, k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .a_stride(3)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25244
// Fixes k = 1 and sweeps every n in 1..8 with every m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(m).n(n).k(1)
        .iterations(1)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25261
// Fixes n = 8 and k = 1 and sweeps m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(m).n(8).k(1)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25276
// Fixes m = 3 and k = 1 and sweeps n = 1..8, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(n).k(1)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25291
// Sweeps k = 2..9 with m = 3 and n = 8.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_gt_1) {
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25305
// Sweeps k = 2..9 with m = 3, n = 8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k)
      .a_stride(11)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25320
// Sweeps k = 2..9 against every n in 1..8 and m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25339
// Sweeps n = 9..15 against k = 1, 3, 5 (step 2) with m = 3.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_gt_8) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25355
// Sweeps n = 9..15 against k = 1, 3, 5 (step 2) with m = 3 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25372
// Sweeps n = 9..15 against k = 1, 3, 5 (step 2) with m = 3 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .a_stride(7)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25389
// Sweeps n = 9..15, k = 1, 3, 5 (step 2) and m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25408
// Sweeps n = 16, 24 (step 8) against k = 1, 3, 5 (step 2) with m = 3.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_div_8) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25424
// Sweeps n = 16, 24 (step 8) against k = 1, 3, 5 (step 2) with m = 3 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25441
// Sweeps n = 16, 24 (step 8) against k = 1, 3, 5 (step 2) with m = 3 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n).k(k)
        .a_stride(7)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25458
// Sweeps n = 16, 24 (step 8), k = 1, 3, 5 (step 2) and m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25477
// Sweeps k = 1, 3, 5 (step 2), n in 1..8 and m in 1..3 with cm_stride set to 11,
// one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, strided_cm_subtile) {
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25497
// Single shape m = 3, n = 8, k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .qmin(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25510
// Single shape m = 3, n = 8, k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .qmax(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25523
// Single shape m = 3, n = 8, k = 1 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMSIMD_X86_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .cm_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8__wasmsimd_x86_loadsplat,
      xnn_init_f32_minmax_wasmsimd_params);
}
25536 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25537
25538
25539 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single shape m = 3, n = 8, k = 4.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_eq_4) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25551
// Single shape m = 3, n = 8, k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .cn_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25564
// Single shape m = 3, n = 8, k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .a_stride(7)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25577
// Fixes k = 4 and sweeps every n in 1..8 with every m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_eq_4_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(m).n(n).k(4)
        .iterations(1)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25594
// Fixes n = 8 and k = 4 and sweeps m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_eq_4_subtile_m) {
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(m).n(8).k(4)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25609
// Fixes m = 3 and k = 4 and sweeps n = 1..8, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_eq_4_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(n).k(4)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25624
// Sweeps k = 1..3 with m = 3 and n = 8.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_lt_4) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25638
// Sweeps k = 1..3 with m = 3, n = 8 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_lt_4_strided_a) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .a_stride(7)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25653
// Sweeps k = 1..3 against every n in 1..8 and m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_lt_4_subtile) {
  for (size_t k = 1; k < 4; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25672
// Sweeps k = 5..7 with m = 3 and n = 8.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_gt_4) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25686
// Sweeps k = 5..7 with m = 3, n = 8 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_gt_4_strided_a) {
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .a_stride(11)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25701
// Sweeps k = 5..7 against every n in 1..8 and m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_gt_4_subtile) {
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25720
// Sweeps k = 8, 12, ..., 40 (step 4) with m = 3 and n = 8.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_div_4) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25734
// Sweeps k = 8, 12, ..., 40 (step 4) with m = 3, n = 8 and a_stride set to 43.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_div_4_strided_a) {
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .a_stride(43)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
25749
// Sweeps k = 8, 12, ..., 40 against every n in 1..8 and m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, k_div_4_subtile) {
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25768
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 3.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_gt_8) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25784
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 3 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_gt_8_strided_cn) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25801
// Sweeps n = 9..15 against k = 1, 6, 11, 16 (step 5) with m = 3 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_gt_8_strided_a) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .a_stride(23)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25818
// Sweeps n = 9..15, k = 1, 6, 11, 16 (step 5) and m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_gt_8_subtile) {
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25837
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 3.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_div_8) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25853
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 3 and cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_div_8_strided_cn) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .cn_stride(11)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25870
// Sweeps n = 16, 24 (step 8) against k = 1, 6, 11, 16 (step 5) with m = 3 and a_stride set to 23.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_div_8_strided_a) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n).k(k)
        .a_stride(23)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
25887
// Sweeps n = 16, 24 (step 8), k = 1, 6, 11, 16 (step 5) and m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, n_div_8_subtile) {
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25906
// Sweeps k = 1, 6, 11, 16 (step 5), n in 1..8 and m in 1..3 with cm_stride set to 11,
// one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, strided_cm_subtile) {
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(
            xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
25926
// Single shape m = 3, n = 8, k = 4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .qmin(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25939
// Single shape m = 3, n = 8, k = 4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .qmax(128)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25952
// Single shape m = 3, n = 8, k = 4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_ARM, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .cm_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_arm,
      xnn_init_f32_minmax_wasmsimd_params);
}
25965 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
25966
25967
25968 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single shape m = 3, n = 8, k = 4.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_eq_4) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
      xnn_init_f32_minmax_wasmsimd_params);
}
25980
// Single shape m = 3, n = 8, k = 4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .cn_stride(11)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
      xnn_init_f32_minmax_wasmsimd_params);
}
25993
// Single shape m = 3, n = 8, k = 4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .a_stride(7)
    .Test(
      xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
      xnn_init_f32_minmax_wasmsimd_params);
}
26006
// Fixes k = 4 and sweeps every n in 1..8 with every m in 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_eq_4_subtile) {
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(m).n(n).k(4)
        .iterations(1)
        .Test(
          xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
          xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26023
// Fixes n = 8 and k = 4 and sweeps m = 1..3, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_eq_4_subtile_m) {
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(m).n(8).k(4)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
26038
// Fixes m = 3 and k = 4 and sweeps n = 1..8, one iteration per shape.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_eq_4_subtile_n) {
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(n).k(4)
      .iterations(1)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
26053
// Sweeps k = 1..3 with m = 3 and n = 8.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_lt_4) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
26067
// Sweeps k = 1..3 with m = 3, n = 8 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_lt_4_strided_a) {
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k)
      .a_stride(7)
      .Test(
        xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86,
        xnn_init_f32_minmax_wasmsimd_params);
  }
}
26082
// Runs the 3x8s4 kernel for k = 1, 2, 3 over every m in [1, 3] and n in [1, 8], with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_lt_4_subtile) {
  for (size_t depth = 1; depth < 4; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 8; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26101
// Runs the 3x8s4 kernel on a full 3x8 tile for k = 5, 6, 7.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_gt_4) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(3).nr(8).kr(1).sr(4);
    tester.m(3).n(8).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26115
// Runs the 3x8s4 kernel on a full 3x8 tile for k = 5, 6, 7 with a_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_gt_4_strided_a) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(3).nr(8).kr(1).sr(4);
    tester.m(3).n(8).k(depth);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26130
// Runs the 3x8s4 kernel for k = 5, 6, 7 over every m in [1, 3] and n in [1, 8], with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_gt_4_subtile) {
  for (size_t depth = 5; depth < 8; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 8; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26149
// Runs the 3x8s4 kernel on a full 3x8 tile for k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_div_4) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(3).nr(8).kr(1).sr(4);
    tester.m(3).n(8).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26163
// Runs the 3x8s4 kernel on a full 3x8 tile for k = 8, 12, ..., 40 with a_stride(43).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_div_4_strided_a) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(3).nr(8).kr(1).sr(4);
    tester.m(3).n(8).k(depth);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26178
// Runs the 3x8s4 kernel for k = 8, 12, ..., 40 over every m in [1, 3] and n in [1, 8], with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, k_div_4_subtile) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    for (uint32_t tile_n = 1; tile_n <= 8; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26197
// Runs the 3x8s4 kernel with m=3 for n in [9, 15] and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_gt_8) {
  for (uint32_t tile_n = 9; tile_n < 16; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26213
// Runs the 3x8s4 kernel with m=3 for n in [9, 15] and k = 1, 6, 11, 16, using cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_gt_8_strided_cn) {
  for (uint32_t tile_n = 9; tile_n < 16; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26230
// Runs the 3x8s4 kernel with m=3 for n in [9, 15] and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_gt_8_strided_a) {
  for (uint32_t tile_n = 9; tile_n < 16; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26247
// Runs the 3x8s4 kernel for n in [9, 15], k = 1, 6, 11, 16 and every m in [1, 3], with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_gt_8_subtile) {
  for (uint32_t tile_n = 9; tile_n < 16; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26266
// Runs the 3x8s4 kernel with m=3 for n = 16, 24 and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_div_8) {
  for (uint32_t tile_n = 16; tile_n <= 24; tile_n += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26282
// Runs the 3x8s4 kernel with m=3 for n = 16, 24 and k = 1, 6, 11, 16, using cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_div_8_strided_cn) {
  for (uint32_t tile_n = 16; tile_n <= 24; tile_n += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26299
// Runs the 3x8s4 kernel with m=3 for n = 16, 24 and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_div_8_strided_a) {
  for (uint32_t tile_n = 16; tile_n <= 24; tile_n += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(3).nr(8).kr(1).sr(4);
      tester.m(3).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26316
// Runs the 3x8s4 kernel for n = 16, 24, k = 1, 6, 11, 16 and every m in [1, 3], with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, n_div_8_subtile) {
  for (uint32_t tile_n = 16; tile_n <= 24; tile_n += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26335
// Runs the 3x8s4 kernel for k = 1, 6, 11, 16 over every m in [1, 3] and n in [1, 8],
// using cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, strided_cm_subtile) {
  for (size_t depth = 1; depth <= 20; depth += 5) {
    for (uint32_t tile_n = 1; tile_n <= 8; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 3; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(3).nr(8).kr(1).sr(4);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26355
// Runs the 3x8s4 kernel on a full 3x8 tile at k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, qmin) {
  auto tester = GemmMicrokernelTester();
  tester.mr(3).nr(8).kr(1).sr(4);
  tester.m(3).n(8).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26368
// Runs the 3x8s4 kernel on a full 3x8 tile at k=4 with qmax(128).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, qmax) {
  auto tester = GemmMicrokernelTester();
  tester.mr(3).nr(8).kr(1).sr(4);
  tester.m(3).n(8).k(4);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26381
// Runs the 3x8s4 kernel on a full 3x8 tile at k=4 with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMSIMD_X86, strided_cm) {
  auto tester = GemmMicrokernelTester();
  tester.mr(3).nr(8).kr(1).sr(4);
  tester.m(3).n(8).k(4);
  tester.cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26394 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26395
26396
26397 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26409
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4 with cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cn) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.cn_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26422
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4 with a_stride(7).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_strided_a) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26435
// Runs the 4x2c4 wasmsimd_arm kernel at k=4 for every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile) {
  for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
    for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(tile_m).n(tile_n).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26452
// Runs the 4x2c4 wasmsimd_arm kernel at k=4, n=2 for every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_m) {
  for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(tile_m).n(2).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26467
// Runs the 4x2c4 wasmsimd_arm kernel at k=4, m=4 for every n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_eq_4_subtile_n) {
  for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(tile_n).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26482
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 1, 2, 3.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4) {
  for (size_t depth = 1; depth < 4; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26496
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 1, 2, 3 with a_stride(7).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4_strided_a) {
  for (size_t depth = 1; depth < 4; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26511
// Runs the 4x2c4 wasmsimd_arm kernel for k = 1, 2, 3 over every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_lt_4_subtile) {
  for (size_t depth = 1; depth < 4; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26530
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 5, 6, 7.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26544
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 5, 6, 7 with a_stride(11).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4_strided_a) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26559
// Runs the 4x2c4 wasmsimd_arm kernel for k = 5, 6, 7 over every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_gt_4_subtile) {
  for (size_t depth = 5; depth < 8; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26578
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26592
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile for k = 8, 12, ..., 40 with a_stride(43).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4_strided_a) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26607
// Runs the 4x2c4 wasmsimd_arm kernel for k = 8, 12, ..., 40 over every m in [1, 4] and n in [1, 2],
// with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, k_div_4_subtile) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26626
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n=3 (the only value in [3, 4)) and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26642
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n=3 and k = 1, 6, 11, 16, using cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_strided_cn) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26659
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n=3 and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_strided_a) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26676
// Runs the 4x2c4 wasmsimd_arm kernel for n=3, k = 1, 6, 11, 16 and every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_gt_2_subtile) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26695
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26711
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16, using cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_strided_cn) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26728
// Runs the 4x2c4 wasmsimd_arm kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_strided_a) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26745
// Runs the 4x2c4 wasmsimd_arm kernel for n = 4, 6, k = 1, 6, 11, 16 and every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, n_div_2_subtile) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26764
// Runs the 4x2c4 wasmsimd_arm kernel for k = 1, 6, 11, 16 over every m in [1, 4] and n in [1, 2],
// using cm_stride(5) and iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm_subtile) {
  for (size_t depth = 1; depth <= 20; depth += 5) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.cm_stride(5).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26784
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmin) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26797
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4 with qmax(128).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, qmax) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26810
// Runs the 4x2c4 wasmsimd_arm kernel on a full 4x2 tile at k=4 with cm_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_ARM, strided_cm) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.cm_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
26823 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
26824
26825
26826 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile at k=4.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26838
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile at k=4 with cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cn) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.cn_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26851
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile at k=4 with a_stride(7).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_strided_a) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
26864
// Runs the 4x2c4 wasmsimd_x86 kernel at k=4 for every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile) {
  for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
    for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(tile_m).n(tile_n).k(4);
      tester.iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
26881
// Runs the 4x2c4 wasmsimd_x86 kernel at k=4, n=2 for every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_m) {
  for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(tile_m).n(2).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26896
// Runs the 4x2c4 wasmsimd_x86 kernel at k=4, m=4 for every n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_eq_4_subtile_n) {
  for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(tile_n).k(4);
    tester.iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26911
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 1, 2, 3.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4) {
  for (size_t depth = 1; depth < 4; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26925
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 1, 2, 3 with a_stride(7).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4_strided_a) {
  for (size_t depth = 1; depth < 4; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26940
// Runs the 4x2c4 wasmsimd_x86 kernel for k = 1, 2, 3 over every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_lt_4_subtile) {
  for (size_t depth = 1; depth < 4; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
26959
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 5, 6, 7.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26973
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 5, 6, 7 with a_stride(11).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4_strided_a) {
  for (size_t depth = 5; depth < 8; ++depth) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
26988
// Runs the 4x2c4 wasmsimd_x86 kernel for k = 5, 6, 7 over every m in [1, 4] and n in [1, 2], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_gt_4_subtile) {
  for (size_t depth = 5; depth < 8; ++depth) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27007
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 8, 12, ..., 40.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27021
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile for k = 8, 12, ..., 40 with a_stride(43).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4_strided_a) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(2).kr(4).sr(1);
    tester.m(4).n(2).k(depth);
    tester.a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27036
// Runs the 4x2c4 wasmsimd_x86 kernel for k = 8, 12, ..., 40 over every m in [1, 4] and n in [1, 2],
// with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, k_div_4_subtile) {
  for (size_t depth = 8; depth <= 40; depth += 4) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27055
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n=3 (the only value in [3, 4)) and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27071
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n=3 and k = 1, 6, 11, 16, using cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_strided_cn) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27088
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n=3 and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_strided_a) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27105
// Runs the 4x2c4 wasmsimd_x86 kernel for n=3, k = 1, 6, 11, 16 and every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_gt_2_subtile) {
  for (uint32_t tile_n = 3; tile_n < 4; ++tile_n) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27124
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16.
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27140
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16, using cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_strided_cn) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.cn_stride(5);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27157
// Runs the 4x2c4 wasmsimd_x86 kernel with m=4 for n = 4, 6 and k = 1, 6, 11, 16, using a_stride(23).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_strided_a) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(2).kr(4).sr(1);
      tester.m(4).n(tile_n).k(depth);
      tester.a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27174
// Runs the 4x2c4 wasmsimd_x86 kernel for n = 4, 6, k = 1, 6, 11, 16 and every m in [1, 4], with iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, n_div_2_subtile) {
  for (uint32_t tile_n = 4; tile_n <= 6; tile_n += 2) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27193
// Runs the 4x2c4 wasmsimd_x86 kernel for k = 1, 6, 11, 16 over every m in [1, 4] and n in [1, 2],
// using cm_stride(5) and iterations(1).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm_subtile) {
  for (size_t depth = 1; depth <= 20; depth += 5) {
    for (uint32_t tile_n = 1; tile_n <= 2; ++tile_n) {
      for (uint32_t tile_m = 1; tile_m <= 4; ++tile_m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(2).kr(4).sr(1);
        tester.m(tile_m).n(tile_n).k(depth);
        tester.cm_stride(5).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27213
// Runs the 4x2c4 wasmsimd_x86 kernel on a full 4x2 tile at k=4 with qmin(128).
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, qmin) {
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4);
  tester.qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
27226
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, qmax) {
  // Single run at m = 4, n = 2, k = 4 with qmax(128).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
27239
TEST(F32_GEMM_MINMAX_4X2C4__WASMSIMD_X86, strided_cm) {
  // Single run at m = 4, n = 2, k = 4 with cm_stride(5).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(2).kr(4).sr(1);
  tester.m(4).n(2).k(4).cm_stride(5);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
27252 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27253
27254
27255 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_eq_1) {
  // Single run at m = 4, n = 8, k = 1.
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27267
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, strided_cn) {
  // Single run at m = 4, n = 8, k = 1 with cn_stride(11).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1).cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27280
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_strided_a) {
  // Single run at m = 4, n = 8, k = 1 with a_stride(3).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27293
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile) {
  // Covers every (n, m) pair with n in 1..8 and m in 1..4 at k = 1, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(mc).n(nc).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27310
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_m) {
  // Varies m over 1..4 at n = 8, k = 1, one iteration each.
  for (uint32_t mc = 1; mc <= 4; ++mc) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(mc).n(8).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27325
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_n) {
  // Varies n over 1..8 at m = 4, k = 1, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(nc).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27340
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_gt_1) {
  // Varies k over 2..9 at m = 4, n = 8.
  for (size_t kc = 2; kc < 10; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27354
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_strided_a) {
  // Varies k over 2..9 at m = 4, n = 8 with a_stride(11).
  for (size_t kc = 2; kc < 10; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(kc).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27369
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_subtile) {
  // Sweeps k over 2..9, n over 1..8 and m over 1..4, one iteration each.
  for (size_t kc = 2; kc < 10; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27388
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_gt_8) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 4.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27404
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_cn) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 4 with cn_stride(11).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27421
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_a) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 4 with a_stride(7).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27438
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_subtile) {
  // Sweeps n over 9..15, k over {1, 3, 5} and m over 1..4, one iteration each.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27457
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_div_8) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 4.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27473
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_cn) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 4 with cn_stride(11).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27490
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_a) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 4 with a_stride(7).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(nc).k(kc).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27507
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, n_div_8_subtile) {
  // Sweeps n over {16, 24}, k over {1, 3, 5} and m over 1..4, one iteration each.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27526
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, strided_cm_subtile) {
  // Sweeps k over {1, 3, 5}, n over 1..8 and m over 1..4 with cm_stride(11), one iteration each.
  for (size_t kc = 1; kc <= 5; kc += 2) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27546
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, qmin) {
  // Single run at m = 4, n = 8, k = 1 with qmin(128).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27559
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, qmax) {
  // Single run at m = 4, n = 8, k = 1 with qmax(128).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27572
TEST(F32_GEMM_MINMAX_4X8__WASMSIMD_X86_LOADSPLAT, strided_cm) {
  // Single run at m = 4, n = 8, k = 1 with cm_stride(11).
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(1).cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27585 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27586
27587
27588 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_eq_1) {
  // Single run at m = 5, n = 8, k = 1.
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27600
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, strided_cn) {
  // Single run at m = 5, n = 8, k = 1 with cn_stride(11).
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1).cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27613
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_eq_1_strided_a) {
  // Single run at m = 5, n = 8, k = 1 with a_stride(3).
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1).a_stride(3);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27626
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_eq_1_subtile) {
  // Covers every (n, m) pair with n in 1..8 and m in 1..5 at k = 1, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 5; ++mc) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(mc).n(nc).k(1).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27643
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_eq_1_subtile_m) {
  // Varies m over 1..5 at n = 8, k = 1, one iteration each.
  for (uint32_t mc = 1; mc <= 5; ++mc) {
    GemmMicrokernelTester tester;
    tester.mr(5).nr(8).kr(1).sr(1);
    tester.m(mc).n(8).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27658
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_eq_1_subtile_n) {
  // Varies n over 1..8 at m = 5, k = 1, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester tester;
    tester.mr(5).nr(8).kr(1).sr(1);
    tester.m(5).n(nc).k(1).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27673
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_gt_1) {
  // Varies k over 2..9 at m = 5, n = 8.
  for (size_t kc = 2; kc < 10; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(5).nr(8).kr(1).sr(1);
    tester.m(5).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27687
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_gt_1_strided_a) {
  // Varies k over 2..9 at m = 5, n = 8 with a_stride(11).
  for (size_t kc = 2; kc < 10; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(5).nr(8).kr(1).sr(1);
    tester.m(5).n(8).k(kc).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27702
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, k_gt_1_subtile) {
  // Sweeps k over 2..9, n over 1..8 and m over 1..5, one iteration each.
  for (size_t kc = 2; kc < 10; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(5).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27721
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_gt_8) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 5.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27737
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_gt_8_strided_cn) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 5 with cn_stride(11).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27754
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_gt_8_strided_a) {
  // Sweeps n over 9..15 and k over {1, 3, 5} at m = 5 with a_stride(7).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27771
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_gt_8_subtile) {
  // Sweeps n over 9..15, k over {1, 3, 5} and m over 1..5, one iteration each.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(5).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27790
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_div_8) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 5.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27806
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_div_8_strided_cn) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 5 with cn_stride(11).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27823
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_div_8_strided_a) {
  // Sweeps n over {16, 24} and k over {1, 3, 5} at m = 5 with a_stride(7).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      GemmMicrokernelTester tester;
      tester.mr(5).nr(8).kr(1).sr(1);
      tester.m(5).n(nc).k(kc).a_stride(7);
      tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27840
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, n_div_8_subtile) {
  // Sweeps n over {16, 24}, k over {1, 3, 5} and m over 1..5, one iteration each.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 5; kc += 2) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(5).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27859
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, strided_cm_subtile) {
  // Sweeps k over {1, 3, 5}, n over 1..8 and m over 1..5 with cm_stride(11), one iteration each.
  for (size_t kc = 1; kc <= 5; kc += 2) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 5; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(5).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
27879
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, qmin) {
  // Single run at m = 5, n = 8, k = 1 with qmin(128).
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27892
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, qmax) {
  // Single run at m = 5, n = 8, k = 1 with qmax(128).
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1).qmax(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27905
TEST(F32_GEMM_MINMAX_5X8__WASMSIMD_ARM_LOADSPLAT, strided_cm) {
  // Single run at m = 5, n = 8, k = 1 with cm_stride(11).
  GemmMicrokernelTester tester;
  tester.mr(5).nr(8).kr(1).sr(1);
  tester.m(5).n(8).k(1).cm_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
27918 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
27919
27920
27921 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_eq_4) {
  // Single run at m = 6, n = 8, k = 4.
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(4);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
27933
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, strided_cn) {
  // Single run at m = 6, n = 8, k = 4 with cn_stride(11).
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(4).cn_stride(11);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
27946
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_eq_4_strided_a) {
  // Single run at m = 6, n = 8, k = 4 with a_stride(7).
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(4).a_stride(7);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
27959
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile) {
  // Covers every (n, m) pair with n in 1..8 and m in 1..6 at k = 4, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    for (uint32_t mc = 1; mc <= 6; ++mc) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(mc).n(nc).k(4).iterations(1);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
27976
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile_m) {
  // Varies m over 1..6 at n = 8, k = 4, one iteration each.
  for (uint32_t mc = 1; mc <= 6; ++mc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(mc).n(8).k(4).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
27991
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_eq_4_subtile_n) {
  // Varies n over 1..8 at m = 6, k = 4, one iteration each.
  for (uint32_t nc = 1; nc <= 8; ++nc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(nc).k(4).iterations(1);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28006
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_lt_4) {
  // Varies k over 1..3 at m = 6, n = 8.
  for (size_t kc = 1; kc < 4; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28020
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_lt_4_strided_a) {
  // Varies k over 1..3 at m = 6, n = 8 with a_stride(7).
  for (size_t kc = 1; kc < 4; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc).a_stride(7);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28035
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_lt_4_subtile) {
  // Sweeps k over 1..3, n over 1..8 and m over 1..6, one iteration each.
  for (size_t kc = 1; kc < 4; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28054
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_gt_4) {
  // Varies k over 5..7 at m = 6, n = 8.
  for (size_t kc = 5; kc < 8; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28068
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_gt_4_strided_a) {
  // Varies k over 5..7 at m = 6, n = 8 with a_stride(11).
  for (size_t kc = 5; kc < 8; ++kc) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc).a_stride(11);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28083
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_gt_4_subtile) {
  // Sweeps k over 5..7, n over 1..8 and m over 1..6, one iteration each.
  for (size_t kc = 5; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28102
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_div_4) {
  // Varies k over 8, 12, ..., 40 at m = 6, n = 8.
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28116
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_div_4_strided_a) {
  // Varies k over 8, 12, ..., 40 at m = 6, n = 8 with a_stride(43).
  for (size_t kc = 8; kc <= 40; kc += 4) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(1).sr(1);
    tester.m(6).n(8).k(kc).a_stride(43);
    tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
28131
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, k_div_4_subtile) {
  // Sweeps k over 8, 12, ..., 40, n over 1..8 and m over 1..6, one iteration each.
  for (size_t kc = 8; kc <= 40; kc += 4) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28150
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_gt_8) {
  // Sweeps n over 9..15 and k over {1, 6, 11, 16} at m = 6.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28166
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_gt_8_strided_cn) {
  // Sweeps n over 9..15 and k over {1, 6, 11, 16} at m = 6 with cn_stride(11).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28183
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_gt_8_strided_a) {
  // Sweeps n over 9..15 and k over {1, 6, 11, 16} at m = 6 with a_stride(23).
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc).a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28200
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_gt_8_subtile) {
  // Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..6, one iteration each.
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28219
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_div_8) {
  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} at m = 6.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28235
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_div_8_strided_cn) {
  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} at m = 6 with cn_stride(11).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc).cn_stride(11);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28252
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_div_8_strided_a) {
  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} at m = 6 with a_stride(23).
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(1).sr(1);
      tester.m(6).n(nc).k(kc).a_stride(23);
      tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28269
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, n_div_8_subtile) {
  // Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..6, one iteration each.
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 20; kc += 5) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28288
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, strided_cm_subtile) {
  // Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..6 with cm_stride(11), one iteration each.
  for (size_t kc = 1; kc <= 20; kc += 5) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 6; ++mc) {
        GemmMicrokernelTester tester;
        tester.mr(6).nr(8).kr(1).sr(1);
        tester.m(mc).n(nc).k(kc).cm_stride(11).iterations(1);
        tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28308
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, qmin) {
  // Single run at m = 6, n = 8, k = 4 with qmin(128).
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(1).sr(1);
  tester.m(6).n(8).k(4).qmin(128);
  tester.Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat, xnn_init_f32_minmax_wasmsimd_params);
}
28321
// Single run at m=6, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, qmax) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28334
// Single run at m=6, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_ARM_SPLAT, strided_cm) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_arm_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28347 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
28348
28349
28350 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run at m=6, n=8, k=1 with no other tester options set.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28362
// Single run at m=6, n=8, k=1 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28375
// Single run at m=6, n=8, k=1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28388
// Sweeps n in [1, 8] and m in [1, 6] at k=1 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28405
// Sweeps m in [1, 6] at n=8, k=1 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28420
// Sweeps n in [1, 8] at m=6, k=1 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_size).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28435
// Sweeps k in [2, 10) at m=6, n=8.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_gt_1) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28449
// Sweeps k in [2, 10) at m=6, n=8 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28464
// Sweeps k in [2, 10), n in [1, 8] and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28483
// Sweeps n in [9, 16) and k = 1, 3, 5 at m=6.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_gt_8) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28499
// Sweeps n in [9, 16) and k = 1, 3, 5 at m=6 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28516
// Sweeps n in [9, 16) and k = 1, 3, 5 at m=6 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28533
// Sweeps n in [9, 16), k = 1, 3, 5 and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28552
// Sweeps n = 16, 24 and k = 1, 3, 5 at m=6.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_div_8) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28568
// Sweeps n = 16, 24 and k = 1, 3, 5 at m=6 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28585
// Sweeps n = 16, 24 and k = 1, 3, 5 at m=6 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28602
// Sweeps n = 16, 24, k = 1, 3, 5 and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28621
// Sweeps k = 1, 3, 5, n in [1, 8] and m in [1, 6] with cm_stride set to 11 and iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28641
// Single run at m=6, n=8, k=1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28654
// Single run at m=6, n=8, k=1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28667
// Single run at m=6, n=8, k=1 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28680 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
28681
28682
28683 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run at m=6, n=8, k=4 with no other tester options set.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_eq_4) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28695
// Single run at m=6, n=8, k=4 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, strided_cn) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28708
// Single run at m=6, n=8, k=4 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_eq_4_strided_a) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
28721
// Sweeps n in [1, 8] and m in [1, 6] at k=4 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28738
// Sweeps m in [1, 6] at n=8, k=4 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28753
// Sweeps n in [1, 8] at m=6, k=4 with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_eq_4_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28768
// Sweeps k in [1, 4) at m=6, n=8.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_lt_4) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28782
// Sweeps k in [1, 4) at m=6, n=8 with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_lt_4_strided_a) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28797
// Sweeps k in [1, 4), n in [1, 8] and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_lt_4_subtile) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28816
// Sweeps k in [5, 8) at m=6, n=8.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_gt_4) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28830
// Sweeps k in [5, 8) at m=6, n=8 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_gt_4_strided_a) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28845
// Sweeps k in [5, 8), n in [1, 8] and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_gt_4_subtile) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28864
// Sweeps k = 8, 12, ..., 40 at m=6, n=8.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_div_4) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28878
// Sweeps k = 8, 12, ..., 40 at m=6, n=8 with a_stride set to 43.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_div_4_strided_a) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
28893
// Sweeps k = 8, 12, ..., 40, n in [1, 8] and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, k_div_4_subtile) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28912
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_gt_8) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28928
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_gt_8_strided_cn) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28945
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_gt_8_strided_a) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28962
// Sweeps n in [9, 16), k = 1, 6, 11, 16 and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_gt_8_subtile) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
28981
// Sweeps n = 16, 24 and k = 1, 6, 11, 16 at m=6.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_div_8) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
28997
// Sweeps n = 16, 24 and k = 1, 6, 11, 16 at m=6 with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_div_8_strided_cn) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29014
// Sweeps n = 16, 24 and k = 1, 6, 11, 16 at m=6 with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_div_8_strided_a) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29031
// Sweeps n = 16, 24, k = 1, 6, 11, 16 and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, n_div_8_subtile) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29050
// Sweeps k = 1, 6, 11, 16, n in [1, 8] and m in [1, 6] with cm_stride set to 11 and iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29070
// Single run at m=6, n=8, k=4 with qmin set to 128.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, qmin) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
29083
// Single run at m=6, n=8, k=4 with qmax set to 128.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, qmax) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
29096
// Single run at m=6, n=8, k=4 with cm_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8__WASMSIMD_X86_SPLAT, strided_cm) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmsimd_x86_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
29109 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
29110
29111
29112 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run at m=6, n=8, k=4 (sr=4) with no other tester options set.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_eq_4) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
}
29124
// Single run at m=6, n=8, k=4 (sr=4) with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, strided_cn) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
}
29137
// Single run at m=6, n=8, k=4 (sr=4) with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_eq_4_strided_a) {
  GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)
      .m(6).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
            xnn_init_f32_minmax_wasmsimd_params);
}
29150
// Sweeps n in [1, 8] and m in [1, 6] at k=4 (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_eq_4_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29167
// Sweeps m in [1, 6] at n=8, k=4 (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_eq_4_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29182
// Sweeps n in [1, 8] at m=6, k=4 (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_eq_4_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29197
// Sweeps k in [1, 4) at m=6, n=8 (sr=4).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_lt_4) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29211
// Sweeps k in [1, 4) at m=6, n=8 (sr=4) with a_stride set to 7.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_lt_4_strided_a) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29226
// Sweeps k in [1, 4), n in [1, 8] and m in [1, 6] (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_lt_4_subtile) {
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29245
// Sweeps k in [5, 8) at m=6, n=8 (sr=4).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_gt_4) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29259
// Sweeps k in [5, 8) at m=6, n=8 (sr=4) with a_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_gt_4_strided_a) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29274
// Sweeps k in [5, 8), n in [1, 8] and m in [1, 6] (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_gt_4_subtile) {
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29293
// Sweeps k = 8, 12, ..., 40 at m=6, n=8 (sr=4).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_div_4) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29307
// Sweeps k = 8, 12, ..., 40 at m=6, n=8 (sr=4) with a_stride set to 43.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_div_4_strided_a) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)
        .m(6).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
29322
// Sweeps k = 8, 12, ..., 40, n in [1, 8] and m in [1, 6] (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, k_div_4_subtile) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29341
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6 (sr=4).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_gt_8) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29357
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6 (sr=4) with cn_stride set to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_gt_8_strided_cn) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29374
// Sweeps n in [9, 16) and k = 1, 6, 11, 16 at m=6 (sr=4) with a_stride set to 23.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_gt_8_strided_a) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29391
// Sweeps n in [9, 16), k = 1, 6, 11, 16 and m in [1, 6] (sr=4) with iterations(1).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_gt_8_subtile) {
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 6; ++m_size) {
        GemmMicrokernelTester()
            .mr(6).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29410
// Sweeps n = 16, 24 and k = 1, 6, 11, 16 at m=6 (sr=4).
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_div_8) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)
          .m(6).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29426
// 6x8s4 wasmsimd_arm: n in {16, 24} against k in {1, 6, 11, 16} at m = 6,
// with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_div_8_strided_cn) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29443
// 6x8s4 wasmsimd_arm: n in {16, 24} against k in {1, 6, 11, 16} at m = 6,
// with a_stride overridden to 23.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_div_8_strided_a) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29460
// 6x8s4 wasmsimd_arm: for n in {16, 24} and k in {1, 6, 11, 16}, runs every
// m = 1..6 with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, n_div_8_subtile) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29479
// 6x8s4 wasmsimd_arm: for k in {1, 6, 11, 16}, runs every (n, m) pair in
// 1..8 x 1..6 with cm_stride overridden to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29499
// 6x8s4 wasmsimd_arm: one full-tile run (m = 6, n = 8, k = 4) with qmin
// set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, qmin) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
29512
// 6x8s4 wasmsimd_arm: one full-tile run (m = 6, n = 8, k = 4) with qmax
// set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, qmax) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
29525
// 6x8s4 wasmsimd_arm: one full-tile run (m = 6, n = 8, k = 4) with
// cm_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_ARM, strided_cm) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm, xnn_init_f32_minmax_wasmsimd_params);
}
29538 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
29539
29540
29541 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with no
// stride, qmin or qmax overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_eq_4) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29553
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with
// cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, strided_cn) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29566
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with
// a_stride overridden to 7.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29579
// 6x8s4 wasmsimd_x86: at k = 4, runs every (n, m) pair in 1..8 x 1..6 with
// iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_eq_4_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(m_dim).n(n_dim).k(4)    // problem size for this pass
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29596
// 6x8s4 wasmsimd_x86: at n = 8 and k = 4, runs every m = 1..6 with
// iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_eq_4_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(m_dim).n(8).k(4)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29611
// 6x8s4 wasmsimd_x86: at m = 6 and k = 4, runs every n = 1..8 with
// iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_eq_4_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(n_dim).k(4)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29626
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 1..3, with no
// stride overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_lt_4) {
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29640
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 1..3, with a_stride
// overridden to 7.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_lt_4_strided_a) {
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29655
// 6x8s4 wasmsimd_x86: for k = 1..3, runs every (n, m) pair in 1..8 x 1..6
// with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_lt_4_subtile) {
  for (size_t k_dim = 1; k_dim < 4; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29674
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 5..7, with no
// stride overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_gt_4) {
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29688
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 5..7, with a_stride
// overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_gt_4_strided_a) {
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29703
// 6x8s4 wasmsimd_x86: for k = 5..7, runs every (n, m) pair in 1..8 x 1..6
// with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_gt_4_subtile) {
  for (size_t k_dim = 5; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29722
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 8, 12, ..., 40
// (multiples of 4), with no stride overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_div_4) {
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29736
// 6x8s4 wasmsimd_x86: full tile (m = 6, n = 8) for k = 8, 12, ..., 40,
// with a_stride overridden to 43.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_div_4_strided_a) {
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(4)   // tile parameters
      .m(6).n(8).k(k_dim)        // problem size for this pass
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
  }
}
29751
// 6x8s4 wasmsimd_x86: for k = 8, 12, ..., 40, runs every (n, m) pair in
// 1..8 x 1..6 with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, k_div_4_subtile) {
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29770
// 6x8s4 wasmsimd_x86: sweeps n = 9..15 (past nr = 8) against k in
// {1, 6, 11, 16} at m = 6, with no stride overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_gt_8) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29786
// 6x8s4 wasmsimd_x86: sweeps n = 9..15 against k in {1, 6, 11, 16} at
// m = 6, with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_gt_8_strided_cn) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29803
// 6x8s4 wasmsimd_x86: sweeps n = 9..15 against k in {1, 6, 11, 16} at
// m = 6, with a_stride overridden to 23.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_gt_8_strided_a) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29820
// 6x8s4 wasmsimd_x86: for n = 9..15 and k in {1, 6, 11, 16}, runs every
// m = 1..6 with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_gt_8_subtile) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29839
// 6x8s4 wasmsimd_x86: n in {16, 24} (multiples of nr = 8) against k in
// {1, 6, 11, 16} at m = 6, with no stride overrides.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_div_8) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29855
// 6x8s4 wasmsimd_x86: n in {16, 24} against k in {1, 6, 11, 16} at m = 6,
// with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_div_8_strided_cn) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29872
// 6x8s4 wasmsimd_x86: n in {16, 24} against k in {1, 6, 11, 16} at m = 6,
// with a_stride overridden to 23.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_div_8_strided_a) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(4)   // tile parameters
        .m(6).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
29889
// 6x8s4 wasmsimd_x86: for n in {16, 24} and k in {1, 6, 11, 16}, runs
// every m = 1..6 with iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, n_div_8_subtile) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29908
// 6x8s4 wasmsimd_x86: for k in {1, 6, 11, 16}, runs every (n, m) pair in
// 1..8 x 1..6 with cm_stride overridden to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(4)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
29928
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with qmin
// set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, qmin) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29941
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with qmax
// set to 128.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, qmax) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29954
// 6x8s4 wasmsimd_x86: one full-tile run (m = 6, n = 8, k = 4) with
// cm_stride overridden to 11.
TEST(F32_GEMM_MINMAX_6X8S4__WASMSIMD_X86, strided_cm) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(4)   // tile parameters
    .m(6).n(8).k(4)            // problem size
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86, xnn_init_f32_minmax_wasmsimd_params);
}
29967 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
29968
29969
29970 #if XNN_ARCH_WASMRELAXEDSIMD
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with no stride, qmin or qmax overrides.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
29982
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
29995
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with a_stride overridden to 3.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30008
// 1x8 wasmrelaxedsimd_loadsplat: at k = 1, runs every n = 1..8 with
// iterations set to 1. The m loop covers only m = 1 because its bound is 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(m_dim).n(n_dim).k(1)    // problem size for this pass
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30025
// 1x8 wasmrelaxedsimd_loadsplat: at n = 8 and k = 1, iterates m over 1..1
// (a single pass) with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)   // tile parameters
      .m(m_dim).n(8).k(1)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30040
// 1x8 wasmrelaxedsimd_loadsplat: at m = 1 and k = 1, runs every n = 1..8
// with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)   // tile parameters
      .m(1).n(n_dim).k(1)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30055
// 1x8 wasmrelaxedsimd_loadsplat: full tile (m = 1, n = 8) for k = 2..9,
// with no stride overrides.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)   // tile parameters
      .m(1).n(8).k(k_dim)        // problem size for this pass
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30069
// 1x8 wasmrelaxedsimd_loadsplat: full tile (m = 1, n = 8) for k = 2..9,
// with a_stride overridden to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)   // tile parameters
      .m(1).n(8).k(k_dim)        // problem size for this pass
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30084
// 1x8 wasmrelaxedsimd_loadsplat: for k = 2..9, runs every n = 1..8 with
// iterations set to 1. The m loop covers only m = 1 because its bound is 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30103
// 1x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 (past nr = 8) against
// k in {1, 3, 5} at m = 1, with no stride overrides.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30119
// 1x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 against k in {1, 3, 5}
// at m = 1, with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30136
// 1x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 against k in {1, 3, 5}
// at m = 1, with a_stride overridden to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30153
// 1x8 wasmrelaxedsimd_loadsplat: for n = 9..15 and k in {1, 3, 5}, iterates
// m over 1..1 (a single pass) with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30172
// 1x8 wasmrelaxedsimd_loadsplat: n in {16, 24} (multiples of nr = 8)
// against k in {1, 3, 5} at m = 1, with no stride overrides.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30188
// 1x8 wasmrelaxedsimd_loadsplat: n in {16, 24} against k in {1, 3, 5} at
// m = 1, with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30205
// 1x8 wasmrelaxedsimd_loadsplat: n in {16, 24} against k in {1, 3, 5} at
// m = 1, with a_stride overridden to 7.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)   // tile parameters
        .m(1).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30222
// 1x8 wasmrelaxedsimd_loadsplat: for n in {16, 24} and k in {1, 3, 5},
// iterates m over 1..1 (a single pass) with iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30241
// 1x8 wasmrelaxedsimd_loadsplat: for k in {1, 3, 5} and n = 1..8, iterates
// m over 1..1 with cm_stride overridden to 11 and iterations set to 1.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30261
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30274
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30287
// 1x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 1, n = 8, k = 1)
// with cm_stride overridden to 11.
TEST(F32_GEMM_MINMAX_1X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)   // tile parameters
    .m(1).n(8).k(1)            // problem size
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30300 #endif // XNN_ARCH_WASMRELAXEDSIMD
30301
30302
30303 #if XNN_ARCH_WASMRELAXEDSIMD
// 3x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 3, n = 8, k = 1)
// with no stride, qmin or qmax overrides.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)   // tile parameters
    .m(3).n(8).k(1)            // problem size
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30315
// 3x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 3, n = 8, k = 1)
// with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)   // tile parameters
    .m(3).n(8).k(1)            // problem size
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30328
// 3x8 wasmrelaxedsimd_loadsplat: one full-tile run (m = 3, n = 8, k = 1)
// with a_stride overridden to 3.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)   // tile parameters
    .m(3).n(8).k(1)            // problem size
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
}
30341
// 3x8 wasmrelaxedsimd_loadsplat: at k = 1, runs every (n, m) pair in
// 1..8 x 1..3 with iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(m_dim).n(n_dim).k(1)    // problem size for this pass
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30358
// 3x8 wasmrelaxedsimd_loadsplat: at n = 8 and k = 1, runs every m = 1..3
// with iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)   // tile parameters
      .m(m_dim).n(8).k(1)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30373
// 3x8 wasmrelaxedsimd_loadsplat: at m = 3 and k = 1, runs every n = 1..8
// with iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)   // tile parameters
      .m(3).n(n_dim).k(1)        // problem size for this pass
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30388
// 3x8 wasmrelaxedsimd_loadsplat: full tile (m = 3, n = 8) for k = 2..9,
// with no stride overrides.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)   // tile parameters
      .m(3).n(8).k(k_dim)        // problem size for this pass
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30402
// 3x8 wasmrelaxedsimd_loadsplat: full tile (m = 3, n = 8) for k = 2..9,
// with a_stride overridden to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)   // tile parameters
      .m(3).n(8).k(k_dim)        // problem size for this pass
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
  }
}
30417
// 3x8 wasmrelaxedsimd_loadsplat: for k = 2..9, runs every (n, m) pair in
// 1..8 x 1..3 with iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30436
// 3x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 (past nr = 8) against
// k in {1, 3, 5} at m = 3, with no stride overrides.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(3).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30452
// 3x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 against k in {1, 3, 5}
// at m = 3, with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(3).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30469
// 3x8 wasmrelaxedsimd_loadsplat: sweeps n = 9..15 against k in {1, 3, 5}
// at m = 3, with a_stride overridden to 7.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(3).n(n_dim).k(k_dim)    // problem size for this pass
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30486
// 3x8 wasmrelaxedsimd_loadsplat: for n = 9..15 and k in {1, 3, 5}, runs
// every m = 1..3 with iterations set to 1.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)       // tile parameters
          .m(m_dim).n(n_dim).k(k_dim)    // problem size for this pass
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30505
// 3x8 wasmrelaxedsimd_loadsplat: n in {16, 24} (multiples of nr = 8)
// against k in {1, 3, 5} at m = 3, with no stride overrides.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(3).n(n_dim).k(k_dim)    // problem size for this pass
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30521
// 3x8 wasmrelaxedsimd_loadsplat: n in {16, 24} against k in {1, 3, 5} at
// m = 3, with cn_stride overridden to 11.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)   // tile parameters
        .m(3).n(n_dim).k(k_dim)    // problem size for this pass
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat, xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30538
// Sweeps n in {16, 24} and k in {1, 3, 5} at m = 3 with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30555
// Sweeps n in {16, 24}, k in {1, 3, 5} and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30574
// Sweeps k in {1, 3, 5}, n = 1..8 and m = 1..3 with cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30594
// Single m = 3, n = 8, k = 1 run with qmin(128).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30607
// Single m = 3, n = 8, k = 1 run with qmax(128).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30620
// Single m = 3, n = 8, k = 1 run with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30633 #endif // XNN_ARCH_WASMRELAXEDSIMD
30634
30635
30636 #if XNN_ARCH_WASMRELAXEDSIMD
// Single m = 3, n = 8, k = 4 run with no explicit strides set.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_eq_4) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30648
// Single m = 3, n = 8, k = 4 run with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30661
// Single m = 3, n = 8, k = 4 run with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
30674
// Sweeps n = 1..8 and m = 1..3 at k = 4, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30691
// Sweeps m = 1..3 at n = 8, k = 4, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30706
// Sweeps n = 1..8 at m = 3, k = 4, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(n_size).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30721
// Sweeps k = 1..3 at m = 3, n = 8.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_lt_4) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30735
// Sweeps k = 1..3 at m = 3, n = 8 with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_lt_4_strided_a) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30750
// Sweeps k = 1..3, n = 1..8 and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_lt_4_subtile) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30769
// Sweeps k = 5..7 at m = 3, n = 8.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_gt_4) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30783
// Sweeps k = 5..7 at m = 3, n = 8 with a_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_gt_4_strided_a) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30798
// Sweeps k = 5..7, n = 1..8 and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_gt_4_subtile) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30817
// Sweeps k = 8, 12, ..., 40 (step 4) at m = 3, n = 8.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_div_4) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30831
// Sweeps k = 8, 12, ..., 40 (step 4) at m = 3, n = 8 with a_stride(43).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_div_4_strided_a) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(1)
      .m(3).n(8).k(k_size)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
30846
// Sweeps k = 8, 12, ..., 40, n = 1..8 and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, k_div_4_subtile) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30865
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_gt_8) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30881
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3 with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_strided_cn) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30898
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3 with a_stride(23).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_strided_a) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30915
// Sweeps n = 9..15, k in {1, 6, 11, 16} and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_subtile) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
30934
// Sweeps n in {16, 24} (multiples of nr = 8) and k in {1, 6, 11, 16} at m = 3.
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_div_8) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30950
// Sweeps n in {16, 24} and k in {1, 6, 11, 16} at m = 3 with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_div_8_strided_cn) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30967
// Sweeps n in {16, 24} and k in {1, 6, 11, 16} at m = 3 with a_stride(23).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_div_8_strided_a) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(1)
        .m(3).n(n_size).k(k_size)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
30984
// Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m = 1..3, each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, n_div_8_subtile) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31003
// Sweeps k in {1, 6, 11, 16}, n = 1..8 and m = 1..3 with cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31023
// Single m = 3, n = 8, k = 4 run with qmin(128).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31036
// Single m = 3, n = 8, k = 4 run with qmax(128).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31049
// Single m = 3, n = 8, k = 4 run with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8__WASMRELAXEDSIMD_SPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(1)
    .m(3).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8__wasmrelaxedsimd_splat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31062 #endif // XNN_ARCH_WASMRELAXEDSIMD
31063
31064
31065 #if XNN_ARCH_WASMRELAXEDSIMD
// Single m = 3, n = 8, k = 4 run (sr = 4) with no explicit strides set.
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_eq_4) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31077
// Single m = 3, n = 8, k = 4 run (sr = 4) with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31090
// Single m = 3, n = 8, k = 4 run (sr = 4) with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .a_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31103
// Sweeps n = 1..8 and m = 1..3 at k = 4 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(m_size).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31120
// Sweeps m = 1..3 at n = 8, k = 4 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(m_size).n(8).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31135
// Sweeps n = 1..8 at m = 3, k = 4 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(n_size).k(4)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31150
// Sweeps k = 1..3 at m = 3, n = 8 (sr = 4).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_lt_4) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31164
// Sweeps k = 1..3 at m = 3, n = 8 (sr = 4) with a_stride(7).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_lt_4_strided_a) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31179
// Sweeps k = 1..3, n = 1..8 and m = 1..3 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_lt_4_subtile) {
  for (size_t k_size = 1; k_size <= 3; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31198
// Sweeps k = 5..7 at m = 3, n = 8 (sr = 4).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_gt_4) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31212
// Sweeps k = 5..7 at m = 3, n = 8 (sr = 4) with a_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_gt_4_strided_a) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31227
// Sweeps k = 5..7, n = 1..8 and m = 1..3 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_gt_4_subtile) {
  for (size_t k_size = 5; k_size <= 7; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31246
// Sweeps k = 8, 12, ..., 40 (step 4) at m = 3, n = 8 (sr = 4).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_div_4) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31260
// Sweeps k = 8, 12, ..., 40 (step 4) at m = 3, n = 8 (sr = 4) with a_stride(43).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_div_4_strided_a) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
      .mr(3).nr(8).kr(1).sr(4)
      .m(3).n(8).k(k_size)
      .a_stride(43)
      .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31275
// Sweeps k = 8, 12, ..., 40, n = 1..8 and m = 1..3 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, k_div_4_subtile) {
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31294
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3 (sr = 4).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_gt_8) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31310
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3 (sr = 4) with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_strided_cn) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31327
// Sweeps n = 9..15 and k in {1, 6, 11, 16} at m = 3 (sr = 4) with a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_strided_a) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31344
// Sweeps n = 9..15, k in {1, 6, 11, 16} and m = 1..3 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_subtile) {
  for (uint32_t n_size = 9; n_size <= 15; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31363
// Sweeps n in {16, 24} (multiples of nr = 8) and k in {1, 6, 11, 16} at m = 3 (sr = 4).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_div_8) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31379
// Sweeps n in {16, 24} and k in {1, 6, 11, 16} at m = 3 (sr = 4) with cn_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_div_8_strided_cn) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31396
// Sweeps n in {16, 24} and k in {1, 6, 11, 16} at m = 3 (sr = 4) with a_stride(23).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_div_8_strided_a) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
        .mr(3).nr(8).kr(1).sr(4)
        .m(3).n(n_size).k(k_size)
        .a_stride(23)
        .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31413
// Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m = 1..3 (sr = 4), each with iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, n_div_8_subtile) {
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31432
// Sweeps k in {1, 6, 11, 16}, n = 1..8 and m = 1..3 (sr = 4) with cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
        GemmMicrokernelTester()
          .mr(3).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31452
// Single m = 3, n = 8, k = 4 run (sr = 4) with qmin(128).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31465
// Single m = 3, n = 8, k = 4 run (sr = 4) with qmax(128).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31478
// Single m = 3, n = 8, k = 4 run (sr = 4) with cm_stride(11).
TEST(F32_GEMM_MINMAX_3X8S4__WASMRELAXEDSIMD_FMA, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(8).kr(1).sr(4)
    .m(3).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_3x8s4__wasmrelaxedsimd_fma,
          xnn_init_f32_minmax_wasmsimd_params);
}
31491 #endif // XNN_ARCH_WASMRELAXEDSIMD
31492
31493
31494 #if XNN_ARCH_WASMRELAXEDSIMD
// Single m = 4, n = 8, k = 1 run with no explicit strides set.
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31506
// Single m = 4, n = 8, k = 1 run with cn_stride(11).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31519
// Single m = 4, n = 8, k = 1 run with a_stride(3).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
31532
// Sweeps n = 1..8 and m = 1..4 at k = 1, each with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31549
// Sweeps m = 1..4 at n = 8, k = 1, each with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31564
// Sweeps n = 1..8 at m = 4, k = 1, each with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(n_size).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31579
// Sweeps k = 2..9 at m = 4, n = 8.
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1) {
  for (size_t k_size = 2; k_size <= 9; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31593
// Sweeps k = 2..9 at m = 4, n = 8 with a_stride(11).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size <= 9; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
31608
// Sweeps k = 2..9, n = 1..8 and m = 1..4, each with iterations(1).
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size <= 9; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31627
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8) {
  // Sweeps n in 9..15 and k in {1, 3, 5} with m fixed at 4.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31643
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_cn) {
  // Sweeps n in 9..15 and k in {1, 3, 5} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31660
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_a) {
  // Sweeps n in 9..15 and k in {1, 3, 5} with m fixed at 4 and a_stride set to 7.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31677
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_subtile) {
  // Sweeps n in 9..15, k in {1, 3, 5} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31696
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8) {
  // Sweeps n in {16, 24} and k in {1, 3, 5} with m fixed at 4.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31712
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_cn) {
  // Sweeps n in {16, 24} and k in {1, 3, 5} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31729
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_a) {
  // Sweeps n in {16, 24} and k in {1, 3, 5} with m fixed at 4 and a_stride set to 7.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31746
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_subtile) {
  // Sweeps n in {16, 24}, k in {1, 3, 5} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31765
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm_subtile) {
  // Sweeps k in {1, 3, 5}, n in 1..8 and m in 1..4 with cm_stride set to 11;
  // iterations(1) per case.
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31785
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, qmin) {
  // Single case: m = 4, n = 8, k = 1 with qmin set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31798
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, qmax) {
  // Single case: m = 4, n = 8, k = 1 with qmax set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31811
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm) {
  // Single case: m = 4, n = 8, k = 1 with cm_stride set to 11.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(1)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31824 #endif // XNN_ARCH_WASMRELAXEDSIMD
31825
31826
31827 #if XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_eq_4) {
  // Single case: m = 4, n = 8, k = 4.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31839
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, strided_cn) {
  // Single case: m = 4, n = 8, k = 4 with cn_stride set to 11.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31852
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_strided_a) {
  // Single case: m = 4, n = 8, k = 4 with a_stride set to 7.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
31865
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile) {
  // Sweeps n in 1..8 and m in 1..4 with k fixed at 4; iterations(1) per case.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
31882
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile_m) {
  // Varies m over 1..4 while n stays at 8 and k at 4; iterations(1) per case.
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31897
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_eq_4_subtile_n) {
  // Varies n over 1..8 while m stays at 4 and k at 4; iterations(1) per case.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31912
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_lt_4) {
  // Runs m = 4, n = 8 for every k in 1..3.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31926
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_lt_4_strided_a) {
  // Runs m = 4, n = 8 for every k in 1..3 with a_stride set to 7.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31941
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_lt_4_subtile) {
  // Sweeps k in 1..3, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
31960
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_gt_4) {
  // Runs m = 4, n = 8 for every k in 5..7.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31974
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_gt_4_strided_a) {
  // Runs m = 4, n = 8 for every k in 5..7 with a_stride set to 11.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
31989
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_gt_4_subtile) {
  // Sweeps k in 5..7, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32008
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_div_4) {
  // Runs m = 4, n = 8 for k = 8, 12, ..., 40.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32022
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_div_4_strided_a) {
  // Runs m = 4, n = 8 for k = 8, 12, ..., 40 with a_stride set to 43.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32037
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, k_div_4_subtile) {
  // Sweeps k = 8, 12, ..., 40, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32056
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_gt_8) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32072
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_strided_cn) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32089
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_strided_a) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4 and a_stride set to 23.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32106
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_gt_8_subtile) {
  // Sweeps n in 9..15, k in {1, 6, 11, 16} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32125
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_div_8) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32141
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_div_8_strided_cn) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32158
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_div_8_strided_a) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4 and a_stride set to 23.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32175
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, n_div_8_subtile) {
  // Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32194
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, strided_cm_subtile) {
  // Sweeps k in {1, 6, 11, 16}, n in 1..8 and m in 1..4 with cm_stride set to 11;
  // iterations(1) per case.
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32214
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, qmin) {
  // Single case: m = 4, n = 8, k = 4 with qmin set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
32227
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, qmax) {
  // Single case: m = 4, n = 8, k = 4 with qmax set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
32240
TEST(F32_GEMM_MINMAX_4X8__WASMRELAXEDSIMD_SPLAT, strided_cm) {
  // Single case: m = 4, n = 8, k = 4 with cm_stride set to 11.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_splat,
            xnn_init_f32_minmax_wasmsimd_params);
}
32253 #endif // XNN_ARCH_WASMRELAXEDSIMD
32254
32255
32256 #if XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_eq_4) {
  // Single case: m = 4, n = 8, k = 4.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32268
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, strided_cn) {
  // Single case: m = 4, n = 8, k = 4 with cn_stride set to 11.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32281
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_strided_a) {
  // Single case: m = 4, n = 8, k = 4 with a_stride set to 7.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32294
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile) {
  // Sweeps n in 1..8 and m in 1..4 with k fixed at 4; iterations(1) per case.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32311
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile_m) {
  // Varies m over 1..4 while n stays at 8 and k at 4; iterations(1) per case.
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32326
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_eq_4_subtile_n) {
  // Varies n over 1..8 while m stays at 4 and k at 4; iterations(1) per case.
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32341
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_lt_4) {
  // Runs m = 4, n = 8 for every k in 1..3.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32355
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_lt_4_strided_a) {
  // Runs m = 4, n = 8 for every k in 1..3 with a_stride set to 7.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32370
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_lt_4_subtile) {
  // Sweeps k in 1..3, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32389
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_gt_4) {
  // Runs m = 4, n = 8 for every k in 5..7.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32403
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_gt_4_strided_a) {
  // Runs m = 4, n = 8 for every k in 5..7 with a_stride set to 11.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32418
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_gt_4_subtile) {
  // Sweeps k in 5..7, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32437
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_div_4) {
  // Runs m = 4, n = 8 for k = 8, 12, ..., 40.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32451
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_div_4_strided_a) {
  // Runs m = 4, n = 8 for k = 8, 12, ..., 40 with a_stride set to 43.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(4)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
              xnn_init_f32_minmax_wasmsimd_params);
  }
}
32466
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, k_div_4_subtile) {
  // Sweeps k = 8, 12, ..., 40, n in 1..8 and m in 1..4; iterations(1) per case.
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32485
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_gt_8) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32501
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_strided_cn) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32518
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_strided_a) {
  // Sweeps n in 9..15 and k in {1, 6, 11, 16} with m fixed at 4 and a_stride set to 23.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32535
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_gt_8_subtile) {
  // Sweeps n in 9..15, k in {1, 6, 11, 16} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32554
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_div_8) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32570
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_div_8_strided_cn) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4 and cn_stride set to 11.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32587
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_div_8_strided_a) {
  // Sweeps n in {16, 24} and k in {1, 6, 11, 16} with m fixed at 4 and a_stride set to 23.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(4)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32604
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, n_div_8_subtile) {
  // Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m in 1..4; iterations(1) per case.
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32623
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, strided_cm_subtile) {
  // Sweeps k in {1, 6, 11, 16}, n in 1..8 and m in 1..4 with cm_stride set to 11;
  // iterations(1) per case.
  for (size_t k_size = 1; k_size <= 20; k_size += 5) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
                  xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32643
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, qmin) {
  // Single case: m = 4, n = 8, k = 4 with qmin set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32656
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, qmax) {
  // Single case: m = 4, n = 8, k = 4 with qmax set to 128.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32669
TEST(F32_GEMM_MINMAX_4X8S4__WASMRELAXEDSIMD_FMA, strided_cm) {
  // Single case: m = 4, n = 8, k = 4 with cm_stride set to 11.
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(4)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x8s4__wasmrelaxedsimd_fma,
            xnn_init_f32_minmax_wasmsimd_params);
}
32682 #endif // XNN_ARCH_WASMRELAXEDSIMD
32683
32684
32685 #if XNN_ARCH_WASMRELAXEDSIMD
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1) {
  // Single case: m = 5, n = 8, k = 1.
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
32697
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cn) {
  // Single case: m = 5, n = 8, k = 1 with cn_stride set to 11.
  GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(1)
      .cn_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
}
32710
// Full tile (m=5, n=8) at k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
32723
// k=1; visits every n in [1, 8] and m in [1, 5], each with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(cur_m).n(cur_n).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32740
// k=1, n=8; varies m over [1, 5], each with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(cur_m).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
32755
// k=1, m=5; varies n over [1, 8], each with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(cur_n).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
32770
// Full tile (m=5, n=8) for each k in [2, 9].
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(cur_k)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
32784
// Full tile (m=5, n=8) for each k in [2, 9] with a_stride(11).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(5).nr(8).kr(1).sr(1)
      .m(5).n(8).k(cur_k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
32799
// For each k in [2, 9], visits every n in [1, 8] and m in [1, 5] with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_subtile) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32818
// m=5 with n in [9, 15] and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32834
// m=5 with n in [9, 15] and k in {1, 3, 5}, using cn_stride(11).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32851
// m=5 with n in [9, 15] and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32868
// n in [9, 15], k in {1, 3, 5}, m in [1, 5]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32887
// m=5 with n in {16, 24} and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32903
// m=5 with n in {16, 24} and k in {1, 3, 5}, using cn_stride(11).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32920
// m=5 with n in {16, 24} and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
32937
// n in {16, 24}, k in {1, 3, 5}, m in [1, 5]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32956
// k in {1, 3, 5}, n in [1, 8], m in [1, 5]; each with cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm_subtile) {
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 5; ++cur_m) {
        GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
32976
// Full tile (m=5, n=8) at k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
32989
// Full tile (m=5, n=8) at k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33002
// Full tile (m=5, n=8) at k=1 with cm_stride(11).
TEST(F32_GEMM_MINMAX_5X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(5).nr(8).kr(1).sr(1)
    .m(5).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33015 #endif // XNN_ARCH_WASMRELAXEDSIMD
33016
33017
33018 #if XNN_ARCH_WASMRELAXEDSIMD
// Full tile (m=6, n=8) at k=1.
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33030
// Full tile (m=6, n=8) at k=1 with cn_stride(11).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cn) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .cn_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33043
// Full tile (m=6, n=8) at k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33056
// k=1; visits every n in [1, 8] and m in [1, 6], each with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(cur_m).n(cur_n).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33073
// k=1, n=8; varies m over [1, 6], each with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(cur_m).n(8).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
33088
// k=1, m=6; varies n over [1, 8], each with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_eq_1_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(cur_n).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
33103
// Full tile (m=6, n=8) for each k in [2, 9].
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
33117
// Full tile (m=6, n=8) for each k in [2, 9] with a_stride(11).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_strided_a) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
            xnn_init_f32_minmax_wasmsimd_params);
  }
}
33132
// For each k in [2, 9], visits every n in [1, 8] and m in [1, 6] with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, k_gt_1_subtile) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
33151
// m=6 with n in [9, 15] and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33167
// m=6 with n in [9, 15] and k in {1, 3, 5}, using cn_stride(11).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_cn) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33184
// m=6 with n in [9, 15] and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_strided_a) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33201
// n in [9, 15], k in {1, 3, 5}, m in [1, 6]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_gt_8_subtile) {
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
33220
// m=6 with n in {16, 24} and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33236
// m=6 with n in {16, 24} and k in {1, 3, 5}, using cn_stride(11).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_cn) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33253
// m=6 with n in {16, 24} and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_strided_a) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
              xnn_init_f32_minmax_wasmsimd_params);
    }
  }
}
33270
// n in {16, 24}, k in {1, 3, 5}, m in [1, 6]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, n_div_8_subtile) {
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
33289
// k in {1, 3, 5}, n in [1, 8], m in [1, 6]; each with cm_stride(11) and iterations(1).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm_subtile) {
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
                xnn_init_f32_minmax_wasmsimd_params);
      }
    }
  }
}
33309
// Full tile (m=6, n=8) at k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, qmin) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33322
// Full tile (m=6, n=8) at k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, qmax) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33335
// Full tile (m=6, n=8) at k=1 with cm_stride(11).
TEST(F32_GEMM_MINMAX_6X8__WASMRELAXEDSIMD_LOADSPLAT, strided_cm) {
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(1)
    .cm_stride(11)
    .Test(xnn_f32_gemm_minmax_ukernel_6x8__wasmrelaxedsimd_loadsplat,
          xnn_init_f32_minmax_wasmsimd_params);
}
33348 #endif // XNN_ARCH_WASMRELAXEDSIMD
33349
33350
33351 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Full tile (m=2, n=4) at k=1.
TEST(F32_GEMM_MINMAX_2X4__WASM, k_eq_1) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33363
// Full tile (m=2, n=4) at k=1 with cn_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, strided_cn) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .cn_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33376
// Full tile (m=2, n=4) at k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33389
// k=1; visits every n in [1, 4] and m in [1, 2], each with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_eq_1_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(cur_m).n(cur_n).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33406
// k=1, n=4; varies m over [1, 2], each with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_eq_1_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(cur_m).n(4).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33421
// k=1, m=2; varies n over [1, 4], each with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_eq_1_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(cur_n).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33436
// Full tile (m=2, n=4) for each k in [2, 9].
TEST(F32_GEMM_MINMAX_2X4__WASM, k_gt_1) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(cur_k)
      .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33450
// Full tile (m=2, n=4) for each k in [2, 9] with a_stride(11).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_gt_1_strided_a) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(cur_k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33465
// For each k in [2, 9], visits every n in [1, 4] and m in [1, 2] with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, k_gt_1_subtile) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33484
// m=2 with n in [5, 7] and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_2X4__WASM, n_gt_4) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33500
// m=2 with n in [5, 7] and k in {1, 3, 5}, using cn_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_gt_4_strided_cn) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .cn_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33517
// m=2 with n in [5, 7] and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_gt_4_strided_a) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33534
// n in [5, 7], k in {1, 3, 5}, m in [1, 2]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_gt_4_subtile) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33553
// m=2 with n in {8, 12} and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_2X4__WASM, n_div_4) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33569
// m=2 with n in {8, 12} and k in {1, 3, 5}, using cn_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_div_4_strided_cn) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .cn_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33586
// m=2 with n in {8, 12} and k in {1, 3, 5}, using a_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_div_4_strided_a) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(cur_n).k(cur_k)
        .a_stride(7)
        .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33603
// n in {8, 12}, k in {1, 3, 5}, m in [1, 2]; each combination with iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, n_div_4_subtile) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33622
// k in {1, 3, 5}, n in [1, 4], m in [1, 2]; each with cm_stride(7) and iterations(1).
TEST(F32_GEMM_MINMAX_2X4__WASM, strided_cm_subtile) {
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33642
// Full tile (m=2, n=4) at k=1 with qmin(128).
TEST(F32_GEMM_MINMAX_2X4__WASM, qmin) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .qmin(128)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33655
// Full tile (m=2, n=4) at k=1 with qmax(128).
TEST(F32_GEMM_MINMAX_2X4__WASM, qmax) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .qmax(128)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33668
// Full tile (m=2, n=4) at k=1 with cm_stride(7).
TEST(F32_GEMM_MINMAX_2X4__WASM, strided_cm) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .cm_stride(7)
    .Test(xnn_f32_gemm_minmax_ukernel_2x4__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33681 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
33682
33683
33684 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Full tile (m=4, n=2) at k=1.
TEST(F32_GEMM_MINMAX_4X2__WASM, k_eq_1) {
  GemmMicrokernelTester()
    .mr(4).nr(2).kr(1).sr(1)
    .m(4).n(2).k(1)
    .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33696
// Full tile (m=4, n=2) at k=1 with cn_stride(5).
TEST(F32_GEMM_MINMAX_4X2__WASM, strided_cn) {
  GemmMicrokernelTester()
    .mr(4).nr(2).kr(1).sr(1)
    .m(4).n(2).k(1)
    .cn_stride(5)
    .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33709
// Full tile (m=4, n=2) at k=1 with a_stride(3).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(4).nr(2).kr(1).sr(1)
    .m(4).n(2).k(1)
    .a_stride(3)
    .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
          xnn_init_f32_minmax_scalar_params);
}
33722
// k=1; visits every n in [1, 2] and m in [1, 4], each with iterations(1).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_eq_1_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
      GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(cur_m).n(cur_n).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33739
// k=1, n=2; varies m over [1, 4], each with iterations(1).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_eq_1_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
    GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(cur_m).n(2).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33754
// k=1, m=4; varies n over [1, 2], each with iterations(1).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_eq_1_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
    GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(cur_n).k(1)
      .iterations(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33769
// Full tile (m=4, n=2) for each k in [2, 9].
TEST(F32_GEMM_MINMAX_4X2__WASM, k_gt_1) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(cur_k)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33783
// Full tile (m=4, n=2) for each k in [2, 9] with a_stride(11).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_gt_1_strided_a) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(cur_k)
      .a_stride(11)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
            xnn_init_f32_minmax_scalar_params);
  }
}
33798
// For each k in [2, 9], visits every n in [1, 2] and m in [1, 4] with iterations(1).
TEST(F32_GEMM_MINMAX_4X2__WASM, k_gt_1_subtile) {
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 2; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33817
// m=4 with n=3 (the only value in [3, 4)) and k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_gt_2) {
  for (uint32_t cur_n = 3; cur_n < 4; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(cur_n).k(cur_k)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
33833
// m = 4, cn_stride = 5: n = 3 (the only value in 3 <= n < 4) crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
33850
// m = 4, a_stride = 7: n = 3 (the only value in 3 <= n < 4) crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
33867
// n = 3 (the only value in 3 <= n < 4), k in {1, 3, 5}, m in 1..4; iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33886
// m = 4: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_div_2) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
33902
// m = 4, cn_stride = 5: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
33919
// m = 4, a_stride = 7: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
33936
// n in {4, 6}, k in {1, 3, 5}, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X2__WASM, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33955
// cm_stride = 5: k in {1, 3, 5}, n in 1..2, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X2__WASM, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(5)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
33975
// Single case: m = 4, n = 2, k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X2__WASM, qmin) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
}
33988
// Single case: m = 4, n = 2, k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X2__WASM, qmax) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
}
34001
// Single case: m = 4, n = 2, k = 1 with cm_stride set to 5.
TEST(F32_GEMM_MINMAX_4X2__WASM, strided_cm) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .cm_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__wasm, xnn_init_f32_minmax_scalar_params);
}
34014 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34015
34016
34017 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single case: m = 4, n = 4, k = 1, no other tester options set.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_eq_1) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34029
// Single case: m = 4, n = 4, k = 1 with cn_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__WASM, strided_cn) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34042
// Single case: m = 4, n = 4, k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34055
// k = 1: every (n, m) pair with n in 1..4 and m in 1..4; iterations(1) set for each pair.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34072
// k = 1, n = 4: m is swept over 1..4, with iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(m_dim).n(4).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
  }
}
34087
// k = 1, m = 4: n is swept over 1..4, with iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
  }
}
34102
// m = 4, n = 4: k is swept over 2..9.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_gt_1) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(4).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
  }
}
34116
// m = 4, n = 4: k is swept over 2..9 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
  }
}
34131
// Every combination of k in 2..9, n in 1..4 and m in 1..4, with iterations(1) set for each.
TEST(F32_GEMM_MINMAX_4X4__WASM, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34150
// m = 4: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34166
// m = 4, cn_stride = 7: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34183
// m = 4, a_stride = 7: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34200
// n in 5..7, k in {1, 3, 5}, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34219
// m = 4: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34235
// m = 4, cn_stride = 7: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34252
// m = 4, a_stride = 7: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34269
// n in {8, 12}, k in {1, 3, 5}, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X4__WASM, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34288
// cm_stride = 7: k in {1, 3, 5}, n in 1..4, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X4__WASM, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34308
// Single case: m = 4, n = 4, k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X4__WASM, qmin) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34321
// Single case: m = 4, n = 4, k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X4__WASM, qmax) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34334
// Single case: m = 4, n = 4, k = 1 with cm_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__WASM, strided_cm) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__wasm, xnn_init_f32_minmax_scalar_params);
}
34347 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34348
34349
// Single case: m = 1, n = 4, k = 1, no other tester options set.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_eq_1) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34361
// Single case: m = 1, n = 4, k = 1 with cn_stride set to 7.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, strided_cn) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34374
// Single case: m = 1, n = 4, k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34387
// k = 1: n in 1..4 crossed with m = 1 (the m loop has a single value); iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34404
// k = 1, n = 4: m = 1 (the m loop has a single value), with iterations(1) set.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(m_dim).n(4).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34419
// k = 1, m = 1: n is swept over 1..4, with iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34434
// m = 1, n = 4: k is swept over 2..9.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_gt_1) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34448
// m = 1, n = 4: k is swept over 2..9 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34463
// k in 2..9, n in 1..4, m = 1 (the m loop has a single value); iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34482
// m = 1: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34498
// m = 1, cn_stride = 7: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34515
// m = 1, a_stride = 7: n in 5..7 crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34532
// n in 5..7, k in {1, 3, 5}, m = 1 (the m loop has a single value); iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34551
// m = 1: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34567
// m = 1, cn_stride = 7: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34584
// m = 1, a_stride = 7: n in {8, 12} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34601
// n in {8, 12}, k in {1, 3, 5}, m = 1 (the m loop has a single value); iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34620
// cm_stride = 7: k in {1, 3, 5}, n in 1..4, m = 1 (single-value loop); iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34640
// Single case: m = 1, n = 4, k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, qmin) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34653
// Single case: m = 1, n = 4, k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, qmax) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34666
// Single case: m = 1, n = 4, k = 1 with cm_stride set to 7.
TEST(F32_GEMM_MINMAX_1X4__SCALAR, strided_cm) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_1x4__scalar, xnn_init_f32_minmax_scalar_params);
}
34679
34680
// Single case: m = 4, n = 2, k = 1, no other tester options set.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_eq_1) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
34692
// Single case: m = 4, n = 2, k = 1 with cn_stride set to 5.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, strided_cn) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .cn_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
34705
// Single case: m = 4, n = 2, k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
34718
// k = 1: every (n, m) pair with n in 1..2 and m in 1..4; iterations(1) set for each pair.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34735
// k = 1, n = 2: m is swept over 1..4, with iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(m_dim).n(2).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34750
// k = 1, m = 4: n is swept over 1..2, with iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34765
// m = 4, n = 2: k is swept over 2..9.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_gt_1) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(k_dim)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34779
// m = 4, n = 2: k is swept over 2..9 with a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(k_dim)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
34794
// Every combination of k in 2..9, n in 1..2 and m in 1..4, with iterations(1) set for each.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim < 10; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34813
// m = 4: n = 3 (the only value in 3 <= n < 4) crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34829
// m = 4, cn_stride = 5: n = 3 (the only value in 3 <= n < 4) crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34846
// m = 4, a_stride = 7: n = 3 (the only value in 3 <= n < 4) crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34863
// n = 3 (the only value in 3 <= n < 4), k in {1, 3, 5}, m in 1..4; iterations(1) set for each case.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim < 4; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34882
// m = 4: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_div_2) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34898
// m = 4, cn_stride = 5: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34915
// m = 4, a_stride = 7: n in {4, 6} crossed with k in {1, 3, 5}.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
34932
// n in {4, 6}, k in {1, 3, 5}, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34951
// cm_stride = 5: k in {1, 3, 5}, n in 1..2, m in 1..4; iterations(1) set for each combination.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(5)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
34971
// Single case: m = 4, n = 2, k = 1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, qmin) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
34984
// Single case: m = 4, n = 2, k = 1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, qmax) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
34997
// Single case: m = 4, n = 2, k = 1 with cm_stride set to 5.
TEST(F32_GEMM_MINMAX_4X2__SCALAR, strided_cm) {
  GemmMicrokernelTester()
      .mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .cm_stride(5)
      .Test(xnn_f32_gemm_minmax_ukernel_4x2__scalar, xnn_init_f32_minmax_scalar_params);
}
35010
35011
// Single case: m = 4, n = 4, k = 1, no other tester options set.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_eq_1) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35023
// Single case: m = 4, n = 4, k = 1 with cn_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, strided_cn) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35036
// Single case: m = 4, n = 4, k = 1 with a_stride set to 3.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .a_stride(3)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35049
// k = 1: every (n, m) pair with n in 1..4 and m in 1..4; iterations(1) set for each pair.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35066
// 4x4 scalar kernel: m swept over 1..4 with n=4 and k=1, one iteration each.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(4).kr(1).sr(1)
        .m(m_size).n(4).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
35081
// 4x4 scalar kernel: n swept over 1..4 with m=4 and k=1, one iteration each.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_size).k(1)
        .iterations(1)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
35096
// 4x4 scalar kernel: k swept over 2..9 with m=4 and n=4.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_gt_1) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(4).kr(1).sr(1)
        .m(4).n(4).k(k_size)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
35110
// 4x4 scalar kernel: k swept over 2..9 with m=4, n=4 and a_stride set to 11.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(4).kr(1).sr(1)
        .m(4).n(4).k(k_size)
        .a_stride(11)
        .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
  }
}
35125
// 4x4 scalar kernel: k in 2..9, n in 1..4, m in 1..4, one iteration per combination.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(4).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35144
// 4x4 scalar kernel: n in 5..7 and k in {1, 3, 5} with m=4.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35160
// 4x4 scalar kernel: n in 5..7 and k in {1, 3, 5} with m=4 and cn_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35177
// 4x4 scalar kernel: n in 5..7 and k in {1, 3, 5} with m=4 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35194
// 4x4 scalar kernel: n in 5..7, k in {1, 3, 5}, m in 1..4, one iteration per combination.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(4).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35213
// 4x4 scalar kernel: n in {8, 12} and k in {1, 3, 5} with m=4.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_div_4) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35229
// 4x4 scalar kernel: n in {8, 12} and k in {1, 3, 5} with m=4 and cn_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35246
// 4x4 scalar kernel: n in {8, 12} and k in {1, 3, 5} with m=4 and a_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_div_4_strided_a) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(4).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(7)
          .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35263
// 4x4 scalar kernel: n in {8, 12}, k in {1, 3, 5}, m in 1..4, one iteration per combination.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, n_div_4_subtile) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(4).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35282
// 4x4 scalar kernel: k in {1, 3, 5}, n in 1..4, m in 1..4 with cm_stride set to 7, one iteration each.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(4).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35302
// 4x4 scalar kernel: m=4, n=4, k=1 with qmin set to 128.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, qmin) {
  GemmMicrokernelTester tester;
  tester.mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .qmin(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35315
// 4x4 scalar kernel: m=4, n=4, k=1 with qmax set to 128.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, qmax) {
  GemmMicrokernelTester tester;
  tester.mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .qmax(128)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35328
// 4x4 scalar kernel: m=4, n=4, k=1 with cm_stride set to 7.
TEST(F32_GEMM_MINMAX_4X4__SCALAR, strided_cm) {
  GemmMicrokernelTester tester;
  tester.mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_f32_gemm_minmax_ukernel_4x4__scalar, xnn_init_f32_minmax_scalar_params);
}
35341
35342
35343 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with no extra tester options.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35356
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with cn_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .cn_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35370
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with a_stride set to 5.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .a_stride(5)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35384
// Cortex-A7 generator, 4x8: n in 1..8 and m in 1..4 at k=2, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(2)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35402
// Cortex-A7 generator, 4x8: m swept over 1..4 with n=8 and k=2, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(2)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35418
// Cortex-A7 generator, 4x8: n swept over 1..8 with m=4 and k=2, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(2)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35434
// Cortex-A7 generator, 4x8: k from 1 up to (not including) 2 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_lt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35449
// Cortex-A7 generator, 4x8: k from 1 up to (not including) 2 with m=4, n=8 and a_stride set to 5.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(5)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35465
// Cortex-A7 generator, 4x8: k from 1 up to (not including) 2, n in 1..8, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35485
// Cortex-A7 generator, 4x8: k from 3 up to (not including) 4 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35500
// Cortex-A7 generator, 4x8: k from 3 up to (not including) 4 with m=4, n=8 and a_stride set to 7.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(7)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35516
// Cortex-A7 generator, 4x8: k from 3 up to (not including) 4, n in 1..8, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35536
// Cortex-A7 generator, 4x8: k from 4 to 20 in steps of 2 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35551
// Cortex-A7 generator, 4x8: k from 4 to 20 in steps of 2 with m=4, n=8 and a_stride set to 23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(23)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
  }
}
35567
// Cortex-A7 generator, 4x8: k from 4 to 20 in steps of 2, n in 1..8, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35587
// Cortex-A7 generator, 4x8: n in 9..15 and k in {1, 4, 7, 10} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35604
// Cortex-A7 generator, 4x8: n in 9..15 and k in {1, 4, 7, 10} with m=4 and cn_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35622
// Cortex-A7 generator, 4x8: n in 9..15 and k in {1, 4, 7, 10} with m=4 and a_stride set to 13.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(13)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35640
// Cortex-A7 generator, 4x8: n in 9..15, k in {1, 4, 7, 10}, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35660
// Cortex-A7 generator, 4x8: n in {16, 24} and k in {1, 4, 7, 10} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35677
// Cortex-A7 generator, 4x8: n in {16, 24} and k in {1, 4, 7, 10} with m=4 and cn_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35695
// Cortex-A7 generator, 4x8: n in {16, 24} and k in {1, 4, 7, 10} with m=4 and a_stride set to 13.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(13)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35713
// Cortex-A7 generator, 4x8: n in {16, 24}, k in {1, 4, 7, 10}, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35733
// Cortex-A7 generator, 4x8: k in {1, 4, 7, 10}, n in 1..8, m in 1..4 with cm_stride set to 11, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 10; k_size += 3) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35754
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with qmin set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .qmin(128)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35768
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with qmax set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .qmax(128)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35782
// Cortex-A7 generator, 4x8: m=4, n=8, k=2 with cm_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A7, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(2)
      .cm_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a7, xnn_init_f32_minmax_scalar_params);
}
35796 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
35797
35798
35799 #if XNN_ARCH_ARM && XNN_PLATFORM_JIT
// Cortex-A55 generator, 4x8: m=4, n=8, k=4 with no extra tester options.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
35812
// Cortex-A55 generator, 4x8: m=4, n=8, k=4 with cn_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
35826
// Cortex-A55 generator, 4x8: m=4, n=8, k=4 with a_stride set to 7.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
35840
// Cortex-A55 generator, 4x8: n in 1..8 and m in 1..4 at k=4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(4)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35858
// Cortex-A55 generator, 4x8: m swept over 1..4 with n=8 and k=4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(4)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
35874
// Cortex-A55 generator, 4x8: n swept over 1..8 with m=4 and k=4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(4)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
35890
// Cortex-A55 generator, 4x8: m=4, n=8, k=8 with no extra tester options.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
35903
// Cortex-A55 generator, 4x8: m=4, n=8, k=8 with a_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester tester;
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
}
35917
// Cortex-A55 generator, 4x8: n in 1..8 and m in 1..4 at k=8, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
35935
// Cortex-A55 generator, 4x8: k swept over 1..7 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
35950
// Cortex-A55 generator, 4x8: k swept over 1..7 with m=4, n=8 and a_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
35966
// Cortex-A55 generator, 4x8: k in 1..7, n in 1..8, m in 1..4, one iteration per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
35986
// Cortex-A55 generator, 4x8: k swept over 9..15 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
36001
// Cortex-A55 generator, 4x8: k swept over 9..15 with m=4, n=8 and a_stride set to 19.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(19)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
36017
// Cortex-A55 generator, 4x8: k in 9..15, n in 1..8, m in 1..4, one iteration per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36037
// Cortex-A55 generator, 4x8: k from 12 to 40 in steps of 4 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
36052
// Cortex-A55 generator, 4x8: k from 12 to 40 in steps of 4 with m=4, n=8 and a_stride set to 43.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(43)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
  }
}
36068
// Cortex-A55 generator, 4x8: k from 12 to 40 in steps of 4, n in 1..8, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 12; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36088
// Cortex-A55 generator, 4x8: n in 9..15 and k in {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
36105
// Cortex-A55 generator, 4x8: n in 9..15 and k in {1, 6, 11, 16} with m=4 and cn_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
36123
// Cortex-A55 generator, 4x8: n in 9..15 and k in {1, 6, 11, 16} with m=4 and a_stride set to 23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(23)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
    }
  }
}
36141
// Cortex-A55 generator, 4x8: n in 9..15, k in {1, 6, 11, 16}, m in 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36161
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36178
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(11)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36196
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and a_stride=23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(23)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36214
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36234
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..4 with
// cm_stride=11 and iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth <= 20; depth += 5) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .cm_stride(11)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36255
// Single run with m=4, n=8, k=4 and qmin set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
36269
// Single run with m=4, n=8, k=4 and qmax set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
36283
// Single run with m=4, n=8, k=4 and cm_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
          xnn_init_f32_minmax_scalar_params);
}
36297 #endif // XNN_ARCH_ARM && XNN_PLATFORM_JIT
36298
36299
36300 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
// Single run with m=4, n=8 and k=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36313
// Single run with m=4, n=8, k=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36327
// Single run with m=4, n=8, k=4 and a_stride=7.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36341
// Sweeps n over 1..8 and m over 1..4 at k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(4)
          .iterations(1)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36359
// Sweeps m over 1..4 at n=8, k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t rows = 1; rows <= 4; ++rows) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(rows).n(8).k(4)
        .iterations(1)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36375
// Sweeps n over 1..8 at m=4, k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(4)
        .iterations(1)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36391
// Single run with m=4, n=8 and k=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36404
// Single run with m=4, n=8, k=8 and a_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36418
// Sweeps n over 1..8 and m over 1..4 at k=8, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(8)
          .iterations(1)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36436
// Sweeps k over 1..7 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36451
// Sweeps k over 1..7 with m=4, n=8 and a_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(11)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36467
// Sweeps k over 1..7, n over 1..8 and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36487
// Sweeps k over 9..15 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36502
// Sweeps k over 9..15 with m=4, n=8 and a_stride=19.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(19)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36518
// Sweeps k over 9..15, n over 1..8 and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36538
// Sweeps k over the multiples of 4 from 12 to 40 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36553
// Sweeps k over the multiples of 4 from 12 to 40 with m=4, n=8 and
// a_stride=43.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(43)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36569
// Sweeps k over the multiples of 4 from 12 to 40, n over 1..8 and m over
// 1..4, with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36589
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36606
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(11)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36624
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and a_stride=23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(23)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36642
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36662
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36679
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(11)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36697
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and a_stride=23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(23)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36715
// Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36735
// Sweeps k over {1, 6, 11, 16}, n over 1..8 and m over 1..4 with
// cm_stride=11 and iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth <= 20; depth += 5) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .cm_stride(11)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36756
// Single run with m=4, n=8, k=4 and qmin set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmin(128)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36770
// Single run with m=4, n=8, k=4 and qmax set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .qmax(128)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36784
// Single run with m=4, n=8, k=4 and cm_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cm_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36798 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
36799
36800
36801 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
// Single run with m=4, n=8 and k=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36814
// Single run with m=4, n=8, k=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .cn_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36828
// Single run with m=4, n=8, k=4 and a_stride=7.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(4)
      .a_stride(7)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36842
// Sweeps n over 1..8 and m over 1..4 at k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(4)
          .iterations(1)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36860
// Sweeps m over 1..4 at n=8, k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t rows = 1; rows <= 4; ++rows) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(rows).n(8).k(4)
        .iterations(1)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36876
// Sweeps n over 1..8 at m=4, k=4, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(cols).k(4)
        .iterations(1)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36892
// Single run with m=4, n=8 and k=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36905
// Single run with m=4, n=8, k=8 and a_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(
          xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
36919
// Sweeps n over 1..8 and m over 1..4 at k=8, with iterations(1) each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 1; cols <= 8; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(rows).n(cols).k(8)
          .iterations(1)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
36937
// Sweeps k over 1..7 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36952
// Sweeps k over 1..7 with m=4, n=8 and a_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(11)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
36968
// Sweeps k over 1..7, n over 1..8 and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
36988
// Sweeps k over 9..15 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37003
// Sweeps k over 9..15 with m=4, n=8 and a_stride=19.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(19)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37019
// Sweeps k over 9..15, n over 1..8 and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37039
// Sweeps k over the multiples of 4 from 12 to 40 with m=4 and n=8.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37054
// Sweeps k over the multiples of 4 from 12 to 40 with m=4, n=8 and
// a_stride=43.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(depth)
        .a_stride(43)
        .Test(
            xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37070
// Sweeps k over the multiples of 4 from 12 to 40, n over 1..8 and m over
// 1..4, with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t depth = 12; depth <= 40; depth += 4) {
    for (uint32_t cols = 1; cols <= 8; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37090
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37107
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(11)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37125
// Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and a_stride=23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(23)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37143
// Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..4,
// with iterations(1) per combination.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 9; cols < 16; ++cols) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37163
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37180
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(11)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37198
// Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and a_stride=23.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t cols = 16; cols <= 24; cols += 8) {
    for (size_t depth = 1; depth <= 20; depth += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(23)
          .Test(
              xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37216
// Sweeps n = 16, 24, k = 1, 6, 11, 16 and every m = 1..4, one iteration each.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37236
// Sweeps k = 1, 6, 11, 16, n = 1..8 and m = 1..4 with cm_stride set to 11,
// one iteration per configuration.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37257
// Single 4x8 case at k = 4 with qmin set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmin(128)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37271
// Single 4x8 case at k = 4 with qmax set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .qmax(128)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37285
// Single 4x8 case at k = 4 with cm_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH32_NEON_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37299 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY && XNN_PLATFORM_JIT
37300
37301
37302 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Single full-tile case: m = 6, n = 8, k = 8.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37315
// Full-tile case (m = 6, n = 8, k = 8) with cn_stride set to 11.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37329
// Full-tile case (m = 6, n = 8, k = 8) with a_stride set to 11.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .a_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37343
// Fixed k = 8; sweeps every n = 1..8 and m = 1..6, one iteration each.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37361
// Fixed n = 8 and k = 8; sweeps m = 1..6, one iteration each.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(m_dim).n(8).k(8)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37377
// Fixed m = 6 and k = 8; sweeps n = 1..8, one iteration each.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37393
// Single full-tile case: m = 6, n = 8, k = 16.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37406
// Full-tile case (m = 6, n = 8, k = 16) with a_stride set to 19.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(16)
    .a_stride(19)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37420
// Fixed k = 16; sweeps every n = 1..8 and m = 1..6, one iteration each.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(16)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37438
// Sweeps every k = 1..15 on the full m = 6, n = 8 tile.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37453
// Sweeps every k = 1..15 on the full m = 6, n = 8 tile with a_stride set to 19.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(19)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37469
// Sweeps k = 1..15, n = 1..8 and m = 1..6, one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37489
// Sweeps every k = 17..31 on the full m = 6, n = 8 tile.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37504
// Sweeps every k = 17..31 on the full m = 6, n = 8 tile with a_stride set to 37.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(37)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37520
// Sweeps k = 17..31, n = 1..8 and m = 1..6, one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37540
// Sweeps k = 24, 32, ..., 80 (step 8) on the full m = 6, n = 8 tile.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37555
// Sweeps k = 24, 32, ..., 80 (step 8) on the full tile with a_stride set to 83.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k_dim)
      .a_stride(83)
      .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37571
// Sweeps k = 24, 32, ..., 80, n = 1..8 and m = 1..6, one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37591
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 6.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37608
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 6, cn_stride set to 11.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37626
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 6, a_stride set to 43.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37644
// Sweeps n = 9..15, k = 1, 10, 19, 28, 37 and m = 1..6, one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37664
// Covers n = 16 and n = 24 with k = 1, 10, 19, 28, 37 at m = 6.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37681
// Covers n = 16 and n = 24 with k = 1, 10, 19, 28, 37 at m = 6, cn_stride set to 11.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37699
// Covers n = 16 and n = 24 with k = 1, 10, 19, 28, 37 at m = 6, a_stride set to 43.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37717
// Sweeps n = 16, 24, k = 1, 10, 19, 28, 37 and m = 1..6, one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37737
// Sweeps k = 1, 10, 19, 28, 37, n = 1..8 and m = 1..6 with cm_stride set to 11,
// one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
37758
// Full-tile case (m = 6, n = 8, k = 8) with qmin set to 128.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmin(128)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37772
// Full-tile case (m = 6, n = 8, k = 8) with qmax set to 128.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .qmax(128)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37786
// Full-tile case (m = 6, n = 8, k = 8) with cm_stride set to 11.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37800
// Varies the configured mr from 1 to 6 and, for each, every m from 1 up to that mr,
// at fixed n = 8 and k = 8 with one iteration per configuration.
TEST(GENERATE_F32_GEMM_UPTO6X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m_upto_mr) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t mr_limit = 1; mr_limit <= 6; ++mr_limit) {
    for (uint32_t m_dim = 1; m_dim <= mr_limit; ++m_dim) {
      GemmMicrokernelTester()
        .mr(mr_limit).nr(8).kr(1).sr(1)
        .m(m_dim).n(8).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37818 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
37819
37820
37821 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Single full-tile case: m = 1, n = 8, k = 8.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37834
// Full-tile case (m = 1, n = 8, k = 8) with cn_stride set to 11.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37848
// Full-tile case (m = 1, n = 8, k = 8) with a_stride set to 11.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .a_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37862
// Fixed k = 8; sweeps n = 1..8 (the m loop covers only m = 1), one iteration each.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37880
// Fixed n = 8 and k = 8; the m loop covers only m = 1, run for one iteration.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(m_dim).n(8).k(8)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37896
// Fixed m = 1 and k = 8; sweeps n = 1..8, one iteration each.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37912
// Single full-tile case: m = 1, n = 8, k = 16.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(16)
    .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37925
// Full-tile case (m = 1, n = 8, k = 16) with a_stride set to 19.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(16)
    .a_stride(19)
    .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
          xnn_init_f32_minmax_scalar_params);
}
37939
// Fixed k = 16; sweeps n = 1..8 (the m loop covers only m = 1), one iteration each.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(16)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
37957
// Sweeps every k = 1..15 on the full m = 1, n = 8 tile.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37972
// Sweeps every k = 1..15 on the full m = 1, n = 8 tile with a_stride set to 19.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(19)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
37988
// Sweeps k = 1..15 and n = 1..8 (the m loop covers only m = 1), one iteration each.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 1; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38008
// Sweeps every k = 17..31 on the full m = 1, n = 8 tile.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
38023
// Sweeps every k = 17..31 on the full m = 1, n = 8 tile with a_stride set to 37.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(37)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
38039
// Sweeps k = 17..31 and n = 1..8 (the m loop covers only m = 1), one iteration each.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 17; k_dim < 32; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38059
// Sweeps k = 24, 32, ..., 80 (step 8) on the full m = 1, n = 8 tile.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
38074
// Sweeps k = 24, 32, ..., 80 (step 8) on the full tile with a_stride set to 83.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(83)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
            xnn_init_f32_minmax_scalar_params);
  }
}
38090
// Sweeps k = 24, 32, ..., 80 and n = 1..8 (the m loop covers only m = 1),
// one iteration per configuration.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k_dim = 24; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38110
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 1.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
38127
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 1, cn_stride set to 11.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
38145
// Sweeps n = 9..15 with k = 1, 10, 19, 28, 37 at m = 1, a_stride set to 43.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
38163
// Sweeps n = 9..15 and k = 1, 10, 19, 28, 37 (the m loop covers only m = 1),
// one iteration per configuration.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
                xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38183
// Covers n = 16 and n = 24 with k = 1, 10, 19, 28, 37 at m = 1.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
38200
// Covers n = 16 and n = 24 with k = 1, 10, 19, 28, 37 at m = 1, cn_stride set to 11.
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75,
              xnn_init_f32_minmax_scalar_params);
    }
  }
}
38218
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 1x8 PRFM
// Cortex-A75 generator with m = 1 and a_stride(43).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(1).n(n).k(k)
          .a_stride(43)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38236
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 1x8 PRFM
// Cortex-A75 generator with m = 1 and iterations(1).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      // m ranges over the single value 1 for this kernel.
      const uint32_t m = 1;
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38256
// For k in {1, 10, 19, 28, 37} and n in 1..8, runs the 1x8 PRFM Cortex-A75
// generator with m = 1, cm_stride(11) and iterations(1).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; ++n) {
      // m ranges over the single value 1 for this kernel.
      const uint32_t m = 1;
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38277
// Runs the 1x8 PRFM Cortex-A75 generator with m = 1, n = 8, k = 8 and
// qmin(128).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmin(128)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38291
// Runs the 1x8 PRFM Cortex-A75 generator with m = 1, n = 8, k = 8 and
// qmax(128).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .qmax(128)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38305
// Runs the 1x8 PRFM Cortex-A75 generator with m = 1, n = 8, k = 8 and
// cm_stride(11).
TEST(GENERATE_F32_GEMM_1X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(8)
      .cm_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38319 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
38320
38321
38322 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Runs the 4x8 Cortex-A75 generator through GemmMicrokernelTester with
// m = 4, n = 8, k = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38335
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 8 and
// cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .cn_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38349
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 8 and
// a_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38363
// For n in 1..8 and m in 1..4, runs the 4x8 Cortex-A75 generator with
// k = 8 and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(8)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38381
// For m in 1..4, runs the 4x8 Cortex-A75 generator with n = 8, k = 8 and
// iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m = 1; m <= 4; ++m) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(m).n(8).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38397
// For n in 1..8, runs the 4x8 Cortex-A75 generator with m = 4, k = 8 and
// iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38413
// Runs the 4x8 Cortex-A75 generator through GemmMicrokernelTester with
// m = 4, n = 8, k = 16.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(16)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38426
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 16 and
// a_stride(19).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(16)
      .a_stride(19)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38440
// For n in 1..8 and m in 1..4, runs the 4x8 Cortex-A75 generator with
// k = 16 and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(16)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38458
// For k in 1..15, runs the 4x8 Cortex-A75 generator with m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38473
// For k in 1..15, runs the 4x8 Cortex-A75 generator with m = 4, n = 8 and
// a_stride(19).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(19)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38489
// For k in 1..15, n in 1..8 and m in 1..4, runs the 4x8 Cortex-A75
// generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38509
// For k in 17..31, runs the 4x8 Cortex-A75 generator with m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38524
// For k in 17..31, runs the 4x8 Cortex-A75 generator with m = 4, n = 8 and
// a_stride(37).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(37)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38540
// For k in 17..31, n in 1..8 and m in 1..4, runs the 4x8 Cortex-A75
// generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38560
// For k in 24, 32, ..., 80, runs the 4x8 Cortex-A75 generator with
// m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38575
// For k in 24, 32, ..., 80, runs the 4x8 Cortex-A75 generator with
// m = 4, n = 8 and a_stride(83).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(83)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38591
// For k in 24, 32, ..., 80, n in 1..8 and m in 1..4, runs the 4x8
// Cortex-A75 generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38611
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38628
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4 and cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38646
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4 and a_stride(43).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38664
// For n in 9..15, k in {1, 10, 19, 28, 37} and m in 1..4, runs the 4x8
// Cortex-A75 generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38684
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38701
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4 and cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38719
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 Cortex-A75
// generator with m = 4 and a_stride(43).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38737
// For n in {16, 24}, k in {1, 10, 19, 28, 37} and m in 1..4, runs the 4x8
// Cortex-A75 generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38757
// For k in {1, 10, 19, 28, 37}, n in 1..8 and m in 1..4, runs the 4x8
// Cortex-A75 generator with cm_stride(11) and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
38778
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 8 and qmin(128).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .qmin(128)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38792
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 8 and qmax(128).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .qmax(128)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38806
// Runs the 4x8 Cortex-A75 generator with m = 4, n = 8, k = 8 and
// cm_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .cm_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38820 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
38821
38822
38823 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Runs the 4x8 PRFM Cortex-A75 generator through GemmMicrokernelTester with
// m = 4, n = 8, k = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38836
// Runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8, k = 8 and
// cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .cn_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38850
// Runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8, k = 8 and
// a_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38864
// For n in 1..8 and m in 1..4, runs the 4x8 PRFM Cortex-A75 generator with
// k = 8 and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(8)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38882
// For m in 1..4, runs the 4x8 PRFM Cortex-A75 generator with n = 8, k = 8
// and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m = 1; m <= 4; ++m) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(m).n(8).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38898
// For n in 1..8, runs the 4x8 PRFM Cortex-A75 generator with m = 4, k = 8
// and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(8)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38914
// Runs the 4x8 PRFM Cortex-A75 generator through GemmMicrokernelTester with
// m = 4, n = 8, k = 16.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(16)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38927
// Runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8, k = 16 and
// a_stride(19).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(16)
      .a_stride(19)
      .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
38941
// For n in 1..8 and m in 1..4, runs the 4x8 PRFM Cortex-A75 generator with
// k = 16 and iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(16)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
38959
// For k in 1..15, runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38974
// For k in 1..15, runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8
// and a_stride(19).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(19)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
38990
// For k in 1..15, n in 1..8 and m in 1..4, runs the 4x8 PRFM Cortex-A75
// generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k <= 15; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39010
// For k in 17..31, runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
39025
// For k in 17..31, runs the 4x8 PRFM Cortex-A75 generator with m = 4, n = 8
// and a_stride(37).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(37)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
39041
// For k in 17..31, n in 1..8 and m in 1..4, runs the 4x8 PRFM Cortex-A75
// generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 17; k <= 31; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39061
// For k in 24, 32, ..., 80, runs the 4x8 PRFM Cortex-A75 generator with
// m = 4, n = 8.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
39076
// For k in 24, 32, ..., 80, runs the 4x8 PRFM Cortex-A75 generator with
// m = 4, n = 8 and a_stride(83).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k)
        .a_stride(83)
        .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
  }
}
39092
// For k in 24, 32, ..., 80, n in 1..8 and m in 1..4, runs the 4x8 PRFM
// Cortex-A75 generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 24; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39112
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM Cortex-A75
// generator with m = 4.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39129
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM Cortex-A75
// generator with m = 4 and cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39147
// For n in 9..15 and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM Cortex-A75
// generator with m = 4 and a_stride(43).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39165
// For n in 9..15, k in {1, 10, 19, 28, 37} and m in 1..4, runs the 4x8 PRFM
// Cortex-A75 generator with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39185
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM
// Cortex-A75 generator with m = 4.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39202
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM
// Cortex-A75 generator with m = 4 and cn_stride(11).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39220
// For n in {16, 24} and k in {1, 10, 19, 28, 37}, runs the 4x8 PRFM
// Cortex-A75 generator with m = 4 and a_stride(43).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39238
// Sweeps n in {16, 24}, k in {1, 10, 19, 28, 37} and m over 1..4 with
// mr=4, nr=8; each combination is run with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39258
// Sweeps k in {1, 10, 19, 28, 37}, n over 1..8 and m over 1..4 with
// cm_stride set to 11; each combination is run with iterations(1).
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 4; ++cur_m) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39279
// Single fixed-shape run (m=4, n=8, k=8) with qmin set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmin(128)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
39293
// Single fixed-shape run (m=4, n=8, k=8) with qmax set to 128.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmax(128)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
39307
// Single fixed-shape run (m=4, n=8, k=8) with cm_stride set to 11.
TEST(GENERATE_F32_GEMM_4X8__AARCH64_NEONFMA_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75, xnn_init_f32_minmax_scalar_params);
}
39321 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
39322
39323
39324 #if XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
// Single fixed-shape run with m=6, n=8, k=4 (mr=6, nr=8).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39337
// Single fixed-shape run (m=6, n=8, k=4) with cn_stride set to 11.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .cn_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39351
// Single fixed-shape run (m=6, n=8, k=4) with a_stride set to 7.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .a_stride(7)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39365
// With k fixed at 4, sweeps n over 1..8 and m over 1..6; each combination is
// run with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(cur_m).n(cur_n).k(4)
        .iterations(1)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39383
// With n=8 and k=4 fixed, sweeps m over 1..6; each case is run with
// iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(cur_m).n(8).k(4)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39399
// With m=6 and k=4 fixed, sweeps n over 1..8; each case is run with
// iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(cur_n).k(4)
      .iterations(1)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39415
// Runs k = 1, 2, 3 with m=6 and n=8 fixed.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 1; cur_k <= 3; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39430
// Runs k = 1, 2, 3 with m=6 and n=8 fixed and a_stride set to 7.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 1; cur_k <= 3; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .a_stride(7)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39446
// Sweeps k over 1..3, n over 1..8 and m over 1..6; each combination is run
// with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 1; cur_k <= 3; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39466
// Runs k = 5, 6, 7 with m=6 and n=8 fixed.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 5; cur_k <= 7; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39481
// Runs k = 5, 6, 7 with m=6 and n=8 fixed and a_stride set to 11.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 5; cur_k <= 7; ++cur_k) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .a_stride(11)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39497
// Sweeps k over 5..7, n over 1..8 and m over 1..6; each combination is run
// with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 5; cur_k <= 7; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39517
// Runs k = 8, 12, ..., 40 (step 4) with m=6 and n=8 fixed.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 8; cur_k <= 40; cur_k += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39532
// Runs k = 8, 12, ..., 40 (step 4) with m=6 and n=8 fixed and a_stride set
// to 43.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 8; cur_k <= 40; cur_k += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(cur_k)
      .a_stride(43)
      .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
  }
}
39548
// Sweeps k = 8, 12, ..., 40 (step 4), n over 1..8 and m over 1..6; each
// combination is run with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 8; cur_k <= 40; cur_k += 4) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39568
// Sweeps n over 9..15 against k in {1, 6, 11, 16} with m fixed at 6
// (mr=6, nr=8).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 9; cur_n <= 15; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39585
// Same sweep as n_gt_8 (n over 9..15, k in {1, 6, 11, 16}, m=6), with
// cn_stride set to 11.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 9; cur_n <= 15; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39603
// Same sweep as n_gt_8 (n over 9..15, k in {1, 6, 11, 16}, m=6), with
// a_stride set to 23.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 9; cur_n <= 15; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .a_stride(23)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39621
// Sweeps n over 9..15, k in {1, 6, 11, 16} and m over 1..6; each combination
// is run with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 9; cur_n <= 15; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39641
// Runs n = 16 and n = 24 against k in {1, 6, 11, 16} with m fixed at 6
// (mr=6, nr=8).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39658
// Same sweep as n_div_8 (n in {16, 24}, k in {1, 6, 11, 16}, m=6), with
// cn_stride set to 11.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .cn_stride(11)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39676
// Same sweep as n_div_8 (n in {16, 24}, k in {1, 6, 11, 16}, m=6), with
// a_stride set to 23.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(cur_n).k(cur_k)
        .a_stride(23)
        .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
    }
  }
}
39694
// Sweeps n in {16, 24}, k in {1, 6, 11, 16} and m over 1..6; each combination
// is run with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39714
// Sweeps k in {1, 6, 11, 16}, n over 1..8 and m over 1..6 with cm_stride set
// to 11; each combination is run with iterations(1).
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t cur_k = 1; cur_k <= 20; cur_k += 5) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 6; ++cur_m) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(cur_m).n(cur_n).k(cur_k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
      }
    }
  }
}
39735
// Single fixed-shape run (m=6, n=8, k=4) with qmin set to 128.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmin(128)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39749
// Single fixed-shape run (m=6, n=8, k=4) with qmax set to 128.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .qmax(128)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39763
// Single fixed-shape run (m=6, n=8, k=4) with cm_stride set to 11.
TEST(GENERATE_F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON_FMA;
  auto tester = GemmMicrokernelTester();
  tester
    .mr(6).nr(8).kr(1).sr(1)
    .m(6).n(8).k(4)
    .cm_stride(11)
    .Test(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128, xnn_init_f32_minmax_scalar_params);
}
39777 #endif // XNN_ARCH_ARM64 && XNN_PLATFORM_JIT
39778