1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/qu8-gemm-minmax-rndnu.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/allocator.h> 17 #include <xnnpack/common.h> 18 #include <xnnpack/isa-checks.h> 19 #include <xnnpack/microparams-init.h> 20 21 #include <xnnpack/gemm.h> 22 #include <xnnpack/igemm.h> 23 #include <xnnpack/ppmm.h> 24 #include "gemm-microkernel-tester.h" 25 26 27 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8)28 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8) { 29 TEST_REQUIRES_ARM_NEON; 30 GemmMicrokernelTester() 31 .mr(1) 32 .nr(8) 33 .kr(1) 34 .sr(1) 35 .m(1) 36 .n(8) 37 .k(8) 38 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 39 } 40 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cn)41 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cn) { 42 TEST_REQUIRES_ARM_NEON; 43 GemmMicrokernelTester() 44 .mr(1) 45 .nr(8) 46 .kr(1) 47 .sr(1) 48 .m(1) 49 .n(8) 50 .k(8) 51 .cn_stride(11) 52 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 53 } 54 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_strided_a)55 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_strided_a) { 56 TEST_REQUIRES_ARM_NEON; 57 GemmMicrokernelTester() 58 .mr(1) 59 .nr(8) 60 .kr(1) 61 .sr(1) 62 .m(1) 63 .n(8) 64 .k(8) 65 .a_stride(11) 66 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 67 } 68 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile)69 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile) { 70 TEST_REQUIRES_ARM_NEON; 71 for (uint32_t n = 1; n <= 8; n++) { 72 for (uint32_t m = 1; m <= 1; m++) { 73 GemmMicrokernelTester() 74 .mr(1) 75 .nr(8) 76 .kr(1) 77 .sr(1) 78 .m(m) 79 .n(n) 80 .k(8) 81 .iterations(1) 82 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 83 } 84 } 85 } 86 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile_m)87 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile_m) { 88 TEST_REQUIRES_ARM_NEON; 89 for (uint32_t m = 1; m <= 1; m++) { 90 GemmMicrokernelTester() 91 .mr(1) 92 .nr(8) 93 .kr(1) 94 .sr(1) 95 .m(m) 96 .n(8) 97 .k(8) 98 .iterations(1) 99 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 100 } 101 } 102 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile_n)103 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile_n) { 104 TEST_REQUIRES_ARM_NEON; 105 for (uint32_t n = 1; n <= 8; n++) { 106 GemmMicrokernelTester() 107 .mr(1) 108 .nr(8) 109 .kr(1) 110 .sr(1) 111 .m(1) 112 .n(n) 113 .k(8) 114 .iterations(1) 115 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 116 } 117 } 118 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_lt_8)119 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8) { 120 TEST_REQUIRES_ARM_NEON; 121 for (size_t k = 1; k < 8; k++) { 122 
GemmMicrokernelTester() 123 .mr(1) 124 .nr(8) 125 .kr(1) 126 .sr(1) 127 .m(1) 128 .n(8) 129 .k(k) 130 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 131 } 132 } 133 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_lt_8_strided_a)134 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8_strided_a) { 135 TEST_REQUIRES_ARM_NEON; 136 for (size_t k = 1; k < 8; k++) { 137 GemmMicrokernelTester() 138 .mr(1) 139 .nr(8) 140 .kr(1) 141 .sr(1) 142 .m(1) 143 .n(8) 144 .k(k) 145 .a_stride(11) 146 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 147 } 148 } 149 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_lt_8_subtile)150 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8_subtile) { 151 TEST_REQUIRES_ARM_NEON; 152 for (size_t k = 1; k < 8; k++) { 153 for (uint32_t n = 1; n <= 8; n++) { 154 for (uint32_t m = 1; m <= 1; m++) { 155 GemmMicrokernelTester() 156 .mr(1) 157 .nr(8) 158 .kr(1) 159 .sr(1) 160 .m(m) 161 .n(n) 162 .k(k) 163 .iterations(1) 164 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 165 } 166 } 167 } 168 } 169 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_gt_8)170 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8) { 171 TEST_REQUIRES_ARM_NEON; 172 for (size_t k = 9; k < 16; k++) { 173 GemmMicrokernelTester() 174 .mr(1) 175 .nr(8) 176 .kr(1) 177 .sr(1) 178 .m(1) 179 .n(8) 180 .k(k) 181 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 182 } 183 } 184 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_gt_8_strided_a)185 
// Full tile (m=1, n=8) with k swept over 9..15 and a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k swept over 9..15 for every (m, n) with 1 <= m <= 1 and 1 <= n <= 8.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full tile with k taking the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep (16..80 step 8) with a_stride = 83 (larger than the largest k).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in 16..80 step 8 for every (m, n) with 1 <= m <= 1 and 1 <= n <= 8.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n swept over 9..15 (greater than nr = 8) with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_8, with cn_stride = 11.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_strided_a) { 308 TEST_REQUIRES_ARM_NEON; 309 for (uint32_t n = 9; n < 16; n++) { 310 for (size_t k = 1; k <= 40; k += 9) { 311 GemmMicrokernelTester() 312 .mr(1) 313 .nr(8) 314 .kr(1) 315 .sr(1) 316 .m(1) 317 .n(n) 318 .k(k) 319 .a_stride(43) 320 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 321 } 322 } 323 } 324 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_gt_8_subtile)325 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_subtile) { 326 TEST_REQUIRES_ARM_NEON; 327 for (uint32_t n = 9; n < 16; n++) { 328 for (size_t k = 1; k <= 40; k += 9) { 329 for (uint32_t m = 1; m <= 1; m++) { 330 GemmMicrokernelTester() 331 .mr(1) 332 .nr(8) 333 .kr(1) 334 .sr(1) 335 .m(m) 336 .n(n) 337 .k(k) 338 .iterations(1) 339 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 340 } 341 } 342 } 343 } 344 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8)345 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8) { 346 TEST_REQUIRES_ARM_NEON; 347 for (uint32_t n = 16; n <= 24; n += 8) { 348 for (size_t k = 1; k <= 40; k += 9) { 349 GemmMicrokernelTester() 350 .mr(1) 351 .nr(8) 352 .kr(1) 353 .sr(1) 354 .m(1) 355 .n(n) 356 .k(k) 357 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 358 } 359 } 360 } 361 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_strided_cn)362 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_strided_cn) { 363 TEST_REQUIRES_ARM_NEON; 364 for (uint32_t n = 16; n <= 24; n += 8) { 365 for (size_t k = 1; k <= 40; k += 9) { 366 GemmMicrokernelTester() 367 .mr(1) 368 
.nr(8) 369 .kr(1) 370 .sr(1) 371 .m(1) 372 .n(n) 373 .k(k) 374 .cn_stride(11) 375 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 376 } 377 } 378 } 379 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_strided_a)380 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_strided_a) { 381 TEST_REQUIRES_ARM_NEON; 382 for (uint32_t n = 16; n <= 24; n += 8) { 383 for (size_t k = 1; k <= 40; k += 9) { 384 GemmMicrokernelTester() 385 .mr(1) 386 .nr(8) 387 .kr(1) 388 .sr(1) 389 .m(1) 390 .n(n) 391 .k(k) 392 .a_stride(43) 393 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 394 } 395 } 396 } 397 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_subtile)398 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_subtile) { 399 TEST_REQUIRES_ARM_NEON; 400 for (uint32_t n = 16; n <= 24; n += 8) { 401 for (size_t k = 1; k <= 40; k += 9) { 402 for (uint32_t m = 1; m <= 1; m++) { 403 GemmMicrokernelTester() 404 .mr(1) 405 .nr(8) 406 .kr(1) 407 .sr(1) 408 .m(m) 409 .n(n) 410 .k(k) 411 .iterations(1) 412 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 413 } 414 } 415 } 416 } 417 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cm_subtile)418 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cm_subtile) { 419 TEST_REQUIRES_ARM_NEON; 420 for (size_t k = 1; k <= 40; k += 9) { 421 for (uint32_t n = 1; n <= 8; n++) { 422 for (uint32_t m = 1; m <= 1; m++) { 423 GemmMicrokernelTester() 424 .mr(1) 425 .nr(8) 426 .kr(1) 427 .sr(1) 428 .m(m) 429 .n(n) 430 .k(k) 431 .cm_stride(11) 432 .iterations(1) 433 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 434 } 435 } 436 } 437 } 438 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,qmin)439 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, qmin) { 440 TEST_REQUIRES_ARM_NEON; 441 GemmMicrokernelTester() 442 .mr(1) 443 .nr(8) 444 .kr(1) 445 .sr(1) 446 .m(1) 447 .n(8) 448 .k(8) 449 .qmin(128) 450 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 451 } 452 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,qmax)453 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, qmax) { 454 TEST_REQUIRES_ARM_NEON; 455 GemmMicrokernelTester() 456 .mr(1) 457 .nr(8) 458 .kr(1) 459 .sr(1) 460 .m(1) 461 .n(8) 462 .k(8) 463 .qmax(128) 464 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 465 } 466 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cm)467 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cm) { 468 TEST_REQUIRES_ARM_NEON; 469 GemmMicrokernelTester() 470 .mr(1) 471 .nr(8) 472 .kr(1) 473 .sr(1) 474 .m(1) 475 .n(8) 476 .k(8) 477 .cm_stride(11) 478 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 479 } 480 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_a_zero_point)481 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_a_zero_point) { 482 TEST_REQUIRES_ARM_NEON; 483 for (size_t k = 1; k <= 40; k += 9) { 484 GemmMicrokernelTester() 485 .mr(1) 486 .nr(8) 487 .kr(1) 488 .sr(1) 489 .m(1) 490 .n(8) 491 .k(k) 492 .a_zero_point(0) 493 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 494 } 495 } 496 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_b_zero_point)497 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_b_zero_point) { 498 TEST_REQUIRES_ARM_NEON; 499 for (size_t k = 1; k <= 40; k += 9) { 500 GemmMicrokernelTester() 501 .mr(1) 502 .nr(8) 503 .kr(1) 504 .sr(1) 505 .m(1) 506 .n(8) 507 .k(k) 508 .b_zero_point(0) 509 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 510 } 511 } 512 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_zero_point)513 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_zero_point) { 514 TEST_REQUIRES_ARM_NEON; 515 for (size_t k = 1; k <= 40; k += 9) { 516 GemmMicrokernelTester() 517 .mr(1) 518 .nr(8) 519 .kr(1) 520 .sr(1) 521 .m(1) 522 .n(8) 523 .k(k) 524 .a_zero_point(0) 525 .b_zero_point(0) 526 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 527 } 528 } 529 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 530 531 532 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8)533 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8) { 534 TEST_REQUIRES_ARM_NEON; 535 GemmMicrokernelTester() 536 .mr(4) 537 .nr(8) 538 .kr(1) 539 .sr(1) 540 .m(4) 541 .n(8) 542 .k(8) 543 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 544 } 545 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cn)546 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cn) { 547 TEST_REQUIRES_ARM_NEON; 548 
GemmMicrokernelTester() 549 .mr(4) 550 .nr(8) 551 .kr(1) 552 .sr(1) 553 .m(4) 554 .n(8) 555 .k(8) 556 .cn_stride(11) 557 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 558 } 559 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_strided_a)560 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_strided_a) { 561 TEST_REQUIRES_ARM_NEON; 562 GemmMicrokernelTester() 563 .mr(4) 564 .nr(8) 565 .kr(1) 566 .sr(1) 567 .m(4) 568 .n(8) 569 .k(8) 570 .a_stride(11) 571 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 572 } 573 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile)574 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile) { 575 TEST_REQUIRES_ARM_NEON; 576 for (uint32_t n = 1; n <= 8; n++) { 577 for (uint32_t m = 1; m <= 4; m++) { 578 GemmMicrokernelTester() 579 .mr(4) 580 .nr(8) 581 .kr(1) 582 .sr(1) 583 .m(m) 584 .n(n) 585 .k(8) 586 .iterations(1) 587 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 588 } 589 } 590 } 591 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile_m)592 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile_m) { 593 TEST_REQUIRES_ARM_NEON; 594 for (uint32_t m = 1; m <= 4; m++) { 595 GemmMicrokernelTester() 596 .mr(4) 597 .nr(8) 598 .kr(1) 599 .sr(1) 600 .m(m) 601 .n(8) 602 .k(8) 603 .iterations(1) 604 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 605 } 606 } 607 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,k_eq_8_subtile_n)608 
// k = 8, m = 4, sweeping n over 1..8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full tile (m=4, n=8) with k swept over 1..7.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full tile with k swept over 1..7 and a_stride = 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k swept over 1..7 for every (m, n) with 1 <= m <= 4 and 1 <= n <= 8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full tile with k swept over 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full tile with k swept over 9..15 and a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k swept over 9..15 for every (m, n) with 1 <= m <= 4 and 1 <= n <= 8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full tile with k taking the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep (16..80 step 8) with a_stride = 83 (larger than the largest k).
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in 16..80 step 8 for every (m, n) with 1 <= m <= 4 and 1 <= n <= 8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n swept over 9..15 (greater than nr = 8) with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_gt_8_strided_cn)794 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_strided_cn) { 795 TEST_REQUIRES_ARM_NEON; 796 for (uint32_t n = 9; n < 16; n++) { 797 for (size_t k = 1; k <= 40; k += 9) { 798 GemmMicrokernelTester() 799 .mr(4) 800 .nr(8) 801 .kr(1) 802 .sr(1) 803 .m(4) 804 .n(n) 805 .k(k) 806 .cn_stride(11) 807 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 808 } 809 } 810 } 811 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_gt_8_strided_a)812 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_strided_a) { 813 TEST_REQUIRES_ARM_NEON; 814 for (uint32_t n = 9; n < 16; n++) { 815 for (size_t k = 1; k <= 40; k += 9) { 816 GemmMicrokernelTester() 817 .mr(4) 818 .nr(8) 819 .kr(1) 820 .sr(1) 821 .m(4) 822 .n(n) 823 .k(k) 824 .a_stride(43) 825 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 826 } 827 } 828 } 829 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_gt_8_subtile)830 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_gt_8_subtile) { 831 TEST_REQUIRES_ARM_NEON; 832 for (uint32_t n = 9; n < 16; n++) { 833 for (size_t k = 1; k <= 40; k += 9) { 834 for (uint32_t m = 1; m <= 4; m++) { 835 GemmMicrokernelTester() 836 .mr(4) 837 .nr(8) 838 .kr(1) 839 .sr(1) 840 .m(m) 841 .n(n) 842 .k(k) 843 .iterations(1) 844 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 845 } 846 } 847 } 848 } 849 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8)850 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8) { 851 TEST_REQUIRES_ARM_NEON; 852 for (uint32_t n = 16; n <= 24; n 
+= 8) { 853 for (size_t k = 1; k <= 40; k += 9) { 854 GemmMicrokernelTester() 855 .mr(4) 856 .nr(8) 857 .kr(1) 858 .sr(1) 859 .m(4) 860 .n(n) 861 .k(k) 862 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 863 } 864 } 865 } 866 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_strided_cn)867 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_strided_cn) { 868 TEST_REQUIRES_ARM_NEON; 869 for (uint32_t n = 16; n <= 24; n += 8) { 870 for (size_t k = 1; k <= 40; k += 9) { 871 GemmMicrokernelTester() 872 .mr(4) 873 .nr(8) 874 .kr(1) 875 .sr(1) 876 .m(4) 877 .n(n) 878 .k(k) 879 .cn_stride(11) 880 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 881 } 882 } 883 } 884 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_strided_a)885 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_strided_a) { 886 TEST_REQUIRES_ARM_NEON; 887 for (uint32_t n = 16; n <= 24; n += 8) { 888 for (size_t k = 1; k <= 40; k += 9) { 889 GemmMicrokernelTester() 890 .mr(4) 891 .nr(8) 892 .kr(1) 893 .sr(1) 894 .m(4) 895 .n(n) 896 .k(k) 897 .a_stride(43) 898 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 899 } 900 } 901 } 902 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,n_div_8_subtile)903 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, n_div_8_subtile) { 904 TEST_REQUIRES_ARM_NEON; 905 for (uint32_t n = 16; n <= 24; n += 8) { 906 for (size_t k = 1; k <= 40; k += 9) { 907 for (uint32_t m = 1; m <= 4; m++) { 908 GemmMicrokernelTester() 909 .mr(4) 910 .nr(8) 911 .kr(1) 912 .sr(1) 913 .m(m) 914 .n(n) 915 .k(k) 916 .iterations(1) 917 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 918 } 919 } 920 } 921 } 922 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cm_subtile)923 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cm_subtile) { 924 TEST_REQUIRES_ARM_NEON; 925 for (size_t k = 1; k <= 40; k += 9) { 926 for (uint32_t n = 1; n <= 8; n++) { 927 for (uint32_t m = 1; m <= 4; m++) { 928 GemmMicrokernelTester() 929 .mr(4) 930 .nr(8) 931 .kr(1) 932 .sr(1) 933 .m(m) 934 .n(n) 935 .k(k) 936 .cm_stride(11) 937 .iterations(1) 938 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 939 } 940 } 941 } 942 } 943 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,qmin)944 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, qmin) { 945 TEST_REQUIRES_ARM_NEON; 946 GemmMicrokernelTester() 947 .mr(4) 948 .nr(8) 949 .kr(1) 950 .sr(1) 951 .m(4) 952 .n(8) 953 .k(8) 954 .qmin(128) 955 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 956 } 957 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,qmax)958 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, qmax) { 959 TEST_REQUIRES_ARM_NEON; 960 GemmMicrokernelTester() 961 .mr(4) 962 .nr(8) 963 .kr(1) 964 .sr(1) 965 .m(4) 966 .n(8) 967 .k(8) 968 .qmax(128) 969 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 970 } 971 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,strided_cm)972 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, strided_cm) { 973 TEST_REQUIRES_ARM_NEON; 974 GemmMicrokernelTester() 975 .mr(4) 976 .nr(8) 977 .kr(1) 978 .sr(1) 
979 .m(4) 980 .n(8) 981 .k(8) 982 .cm_stride(11) 983 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 984 } 985 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_a_zero_point)986 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_a_zero_point) { 987 TEST_REQUIRES_ARM_NEON; 988 for (size_t k = 1; k <= 40; k += 9) { 989 GemmMicrokernelTester() 990 .mr(4) 991 .nr(8) 992 .kr(1) 993 .sr(1) 994 .m(4) 995 .n(8) 996 .k(k) 997 .a_zero_point(0) 998 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 999 } 1000 } 1001 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_b_zero_point)1002 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_b_zero_point) { 1003 TEST_REQUIRES_ARM_NEON; 1004 for (size_t k = 1; k <= 40; k += 9) { 1005 GemmMicrokernelTester() 1006 .mr(4) 1007 .nr(8) 1008 .kr(1) 1009 .sr(1) 1010 .m(4) 1011 .n(8) 1012 .k(k) 1013 .b_zero_point(0) 1014 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1015 } 1016 } 1017 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7,no_zero_point)1018 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A7, no_zero_point) { 1019 TEST_REQUIRES_ARM_NEON; 1020 for (size_t k = 1; k <= 40; k += 9) { 1021 GemmMicrokernelTester() 1022 .mr(4) 1023 .nr(8) 1024 .kr(1) 1025 .sr(1) 1026 .m(4) 1027 .n(8) 1028 .k(k) 1029 .a_zero_point(0) 1030 .b_zero_point(0) 1031 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1032 } 1033 } 1034 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 1035 1036 1037 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 
// Single run with m=4, n=8, k=8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// Single run with m=4, n=8, k=8 and cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cn_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// Single run with m=4, n=8, k=8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// k=8; sweeps n over 1..8 (outer) and m over 1..4 (inner), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(n_size).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

// n=8, k=8; sweeps m over 1..4, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(m_size).n(8).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, k=8; sweeps n over 1..8, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(n_size).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, n=8; sweeps k over 1..7.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, n=8; sweeps k over 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over 1..7, n over 1..8 and m over 1..4 (nested in that order), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4, n=8; sweeps k over 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, n=8; sweeps k over 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over 9..15, n over 1..8 and m over 1..4 (nested in that order), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4, n=8; sweeps k over 16..80 in steps of 8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, n=8; sweeps k over 16..80 in steps of 8 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over 16..80 (step 8), n over 1..8 and m over 1..4 (nested in that order), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4; sweeps n over 9..15 (outer) and k over 1..40 in steps of 9 (inner).
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_8, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_8, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

// Sweeps n over 9..15, k over 1..40 (step 9) and m over 1..4 (nested in that order), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4; sweeps n over 16..24 in steps of 8 (outer) and k over 1..40 in steps of 9 (inner).
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_div_8, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(11)
        .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_strided_a) { 1391 TEST_REQUIRES_ARM_NEON; 1392 for (uint32_t n = 16; n <= 24; n += 8) { 1393 for (size_t k = 1; k <= 40; k += 9) { 1394 GemmMicrokernelTester() 1395 .mr(4) 1396 .nr(8) 1397 .kr(1) 1398 .sr(1) 1399 .m(4) 1400 .n(n) 1401 .k(k) 1402 .a_stride(43) 1403 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1404 } 1405 } 1406 } 1407 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,n_div_8_subtile)1408 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, n_div_8_subtile) { 1409 TEST_REQUIRES_ARM_NEON; 1410 for (uint32_t n = 16; n <= 24; n += 8) { 1411 for (size_t k = 1; k <= 40; k += 9) { 1412 for (uint32_t m = 1; m <= 4; m++) { 1413 GemmMicrokernelTester() 1414 .mr(4) 1415 .nr(8) 1416 .kr(1) 1417 .sr(1) 1418 .m(m) 1419 .n(n) 1420 .k(k) 1421 .iterations(1) 1422 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1423 } 1424 } 1425 } 1426 } 1427 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,strided_cm_subtile)1428 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cm_subtile) { 1429 TEST_REQUIRES_ARM_NEON; 1430 for (size_t k = 1; k <= 40; k += 9) { 1431 for (uint32_t n = 1; n <= 8; n++) { 1432 for (uint32_t m = 1; m <= 4; m++) { 1433 GemmMicrokernelTester() 1434 .mr(4) 1435 .nr(8) 1436 .kr(1) 1437 .sr(1) 1438 .m(m) 1439 .n(n) 1440 .k(k) 1441 .cm_stride(11) 1442 .iterations(1) 1443 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1444 } 1445 } 1446 } 1447 } 1448 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,qmin)1449 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, qmin) { 1450 
TEST_REQUIRES_ARM_NEON; 1451 GemmMicrokernelTester() 1452 .mr(4) 1453 .nr(8) 1454 .kr(1) 1455 .sr(1) 1456 .m(4) 1457 .n(8) 1458 .k(8) 1459 .qmin(128) 1460 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1461 } 1462 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,qmax)1463 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, qmax) { 1464 TEST_REQUIRES_ARM_NEON; 1465 GemmMicrokernelTester() 1466 .mr(4) 1467 .nr(8) 1468 .kr(1) 1469 .sr(1) 1470 .m(4) 1471 .n(8) 1472 .k(8) 1473 .qmax(128) 1474 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1475 } 1476 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,strided_cm)1477 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, strided_cm) { 1478 TEST_REQUIRES_ARM_NEON; 1479 GemmMicrokernelTester() 1480 .mr(4) 1481 .nr(8) 1482 .kr(1) 1483 .sr(1) 1484 .m(4) 1485 .n(8) 1486 .k(8) 1487 .cm_stride(11) 1488 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1489 } 1490 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,no_a_zero_point)1491 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, no_a_zero_point) { 1492 TEST_REQUIRES_ARM_NEON; 1493 for (size_t k = 1; k <= 40; k += 9) { 1494 GemmMicrokernelTester() 1495 .mr(4) 1496 .nr(8) 1497 .kr(1) 1498 .sr(1) 1499 .m(4) 1500 .n(8) 1501 .k(k) 1502 .a_zero_point(0) 1503 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1504 } 1505 } 1506 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,no_b_zero_point)1507 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, 
no_b_zero_point) { 1508 TEST_REQUIRES_ARM_NEON; 1509 for (size_t k = 1; k <= 40; k += 9) { 1510 GemmMicrokernelTester() 1511 .mr(4) 1512 .nr(8) 1513 .kr(1) 1514 .sr(1) 1515 .m(4) 1516 .n(8) 1517 .k(k) 1518 .b_zero_point(0) 1519 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1520 } 1521 } 1522 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53,no_zero_point)1523 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_CORTEX_A53, no_zero_point) { 1524 TEST_REQUIRES_ARM_NEON; 1525 for (size_t k = 1; k <= 40; k += 9) { 1526 GemmMicrokernelTester() 1527 .mr(4) 1528 .nr(8) 1529 .kr(1) 1530 .sr(1) 1531 .m(4) 1532 .n(8) 1533 .k(k) 1534 .a_zero_point(0) 1535 .b_zero_point(0) 1536 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1537 } 1538 } 1539 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 1540 1541 1542 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_eq_8)1543 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8) { 1544 TEST_REQUIRES_ARM_NEON; 1545 GemmMicrokernelTester() 1546 .mr(4) 1547 .nr(8) 1548 .kr(1) 1549 .sr(1) 1550 .m(4) 1551 .n(8) 1552 .k(8) 1553 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1554 } 1555 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,strided_cn)1556 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cn) { 1557 TEST_REQUIRES_ARM_NEON; 1558 GemmMicrokernelTester() 1559 .mr(4) 1560 .nr(8) 1561 .kr(1) 1562 .sr(1) 1563 .m(4) 1564 .n(8) 1565 .k(8) 1566 .cn_stride(11) 1567 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1568 } 1569 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_eq_8_strided_a)1570 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_strided_a) { 1571 TEST_REQUIRES_ARM_NEON; 1572 GemmMicrokernelTester() 1573 .mr(4) 1574 .nr(8) 1575 .kr(1) 1576 .sr(1) 1577 .m(4) 1578 .n(8) 1579 .k(8) 1580 .a_stride(11) 1581 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1582 } 1583 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_eq_8_subtile)1584 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile) { 1585 TEST_REQUIRES_ARM_NEON; 1586 for (uint32_t n = 1; n <= 8; n++) { 1587 for (uint32_t m = 1; m <= 4; m++) { 1588 GemmMicrokernelTester() 1589 .mr(4) 1590 .nr(8) 1591 .kr(1) 1592 .sr(1) 1593 .m(m) 1594 .n(n) 1595 .k(8) 1596 .iterations(1) 1597 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1598 } 1599 } 1600 } 1601 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_eq_8_subtile_m)1602 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile_m) { 1603 TEST_REQUIRES_ARM_NEON; 1604 for (uint32_t m = 1; m <= 4; m++) { 1605 GemmMicrokernelTester() 1606 .mr(4) 1607 .nr(8) 1608 .kr(1) 1609 .sr(1) 1610 .m(m) 1611 .n(8) 1612 .k(8) 1613 .iterations(1) 1614 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1615 } 1616 } 1617 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_eq_8_subtile_n)1618 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_eq_8_subtile_n) { 1619 TEST_REQUIRES_ARM_NEON; 1620 for (uint32_t n = 
1; n <= 8; n++) { 1621 GemmMicrokernelTester() 1622 .mr(4) 1623 .nr(8) 1624 .kr(1) 1625 .sr(1) 1626 .m(4) 1627 .n(n) 1628 .k(8) 1629 .iterations(1) 1630 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1631 } 1632 } 1633 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_lt_8)1634 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8) { 1635 TEST_REQUIRES_ARM_NEON; 1636 for (size_t k = 1; k < 8; k++) { 1637 GemmMicrokernelTester() 1638 .mr(4) 1639 .nr(8) 1640 .kr(1) 1641 .sr(1) 1642 .m(4) 1643 .n(8) 1644 .k(k) 1645 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1646 } 1647 } 1648 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_lt_8_strided_a)1649 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8_strided_a) { 1650 TEST_REQUIRES_ARM_NEON; 1651 for (size_t k = 1; k < 8; k++) { 1652 GemmMicrokernelTester() 1653 .mr(4) 1654 .nr(8) 1655 .kr(1) 1656 .sr(1) 1657 .m(4) 1658 .n(8) 1659 .k(k) 1660 .a_stride(11) 1661 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1662 } 1663 } 1664 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_lt_8_subtile)1665 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_lt_8_subtile) { 1666 TEST_REQUIRES_ARM_NEON; 1667 for (size_t k = 1; k < 8; k++) { 1668 for (uint32_t n = 1; n <= 8; n++) { 1669 for (uint32_t m = 1; m <= 4; m++) { 1670 GemmMicrokernelTester() 1671 .mr(4) 1672 .nr(8) 1673 .kr(1) 1674 .sr(1) 1675 .m(m) 1676 .n(n) 1677 .k(k) 1678 .iterations(1) 1679 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 1680 } 1681 } 1682 } 1683 } 1684 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_gt_8)1685 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8) { 1686 TEST_REQUIRES_ARM_NEON; 1687 for (size_t k = 9; k < 16; k++) { 1688 GemmMicrokernelTester() 1689 .mr(4) 1690 .nr(8) 1691 .kr(1) 1692 .sr(1) 1693 .m(4) 1694 .n(8) 1695 .k(k) 1696 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1697 } 1698 } 1699 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_gt_8_strided_a)1700 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8_strided_a) { 1701 TEST_REQUIRES_ARM_NEON; 1702 for (size_t k = 9; k < 16; k++) { 1703 GemmMicrokernelTester() 1704 .mr(4) 1705 .nr(8) 1706 .kr(1) 1707 .sr(1) 1708 .m(4) 1709 .n(8) 1710 .k(k) 1711 .a_stride(19) 1712 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1713 } 1714 } 1715 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_gt_8_subtile)1716 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_gt_8_subtile) { 1717 TEST_REQUIRES_ARM_NEON; 1718 for (size_t k = 9; k < 16; k++) { 1719 for (uint32_t n = 1; n <= 8; n++) { 1720 for (uint32_t m = 1; m <= 4; m++) { 1721 GemmMicrokernelTester() 1722 .mr(4) 1723 .nr(8) 1724 .kr(1) 1725 .sr(1) 1726 .m(m) 1727 .n(n) 1728 .k(k) 1729 .iterations(1) 1730 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1731 } 1732 } 1733 } 1734 } 1735 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_div_8)1736 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8) { 1737 TEST_REQUIRES_ARM_NEON; 1738 for 
(size_t k = 16; k <= 80; k += 8) { 1739 GemmMicrokernelTester() 1740 .mr(4) 1741 .nr(8) 1742 .kr(1) 1743 .sr(1) 1744 .m(4) 1745 .n(8) 1746 .k(k) 1747 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1748 } 1749 } 1750 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_div_8_strided_a)1751 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8_strided_a) { 1752 TEST_REQUIRES_ARM_NEON; 1753 for (size_t k = 16; k <= 80; k += 8) { 1754 GemmMicrokernelTester() 1755 .mr(4) 1756 .nr(8) 1757 .kr(1) 1758 .sr(1) 1759 .m(4) 1760 .n(8) 1761 .k(k) 1762 .a_stride(83) 1763 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1764 } 1765 } 1766 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,k_div_8_subtile)1767 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, k_div_8_subtile) { 1768 TEST_REQUIRES_ARM_NEON; 1769 for (size_t k = 16; k <= 80; k += 8) { 1770 for (uint32_t n = 1; n <= 8; n++) { 1771 for (uint32_t m = 1; m <= 4; m++) { 1772 GemmMicrokernelTester() 1773 .mr(4) 1774 .nr(8) 1775 .kr(1) 1776 .sr(1) 1777 .m(m) 1778 .n(n) 1779 .k(k) 1780 .iterations(1) 1781 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1782 } 1783 } 1784 } 1785 } 1786 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_gt_8)1787 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8) { 1788 TEST_REQUIRES_ARM_NEON; 1789 for (uint32_t n = 9; n < 16; n++) { 1790 for (size_t k = 1; k <= 40; k += 9) { 1791 GemmMicrokernelTester() 1792 .mr(4) 1793 .nr(8) 1794 .kr(1) 1795 .sr(1) 1796 .m(4) 1797 .n(n) 1798 .k(k) 1799 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1800 } 1801 } 1802 } 1803 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_gt_8_strided_cn)1804 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_strided_cn) { 1805 TEST_REQUIRES_ARM_NEON; 1806 for (uint32_t n = 9; n < 16; n++) { 1807 for (size_t k = 1; k <= 40; k += 9) { 1808 GemmMicrokernelTester() 1809 .mr(4) 1810 .nr(8) 1811 .kr(1) 1812 .sr(1) 1813 .m(4) 1814 .n(n) 1815 .k(k) 1816 .cn_stride(11) 1817 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1818 } 1819 } 1820 } 1821 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_gt_8_strided_a)1822 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_strided_a) { 1823 TEST_REQUIRES_ARM_NEON; 1824 for (uint32_t n = 9; n < 16; n++) { 1825 for (size_t k = 1; k <= 40; k += 9) { 1826 GemmMicrokernelTester() 1827 .mr(4) 1828 .nr(8) 1829 .kr(1) 1830 .sr(1) 1831 .m(4) 1832 .n(n) 1833 .k(k) 1834 .a_stride(43) 1835 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1836 } 1837 } 1838 } 1839 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_gt_8_subtile)1840 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_gt_8_subtile) { 1841 TEST_REQUIRES_ARM_NEON; 1842 for (uint32_t n = 9; n < 16; n++) { 1843 for (size_t k = 1; k <= 40; k += 9) { 1844 for (uint32_t m = 1; m <= 4; m++) { 1845 GemmMicrokernelTester() 1846 .mr(4) 1847 .nr(8) 1848 .kr(1) 1849 .sr(1) 1850 .m(m) 1851 .n(n) 1852 .k(k) 1853 .iterations(1) 1854 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1855 } 1856 } 1857 } 1858 } 1859 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_div_8)1860 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8) { 1861 TEST_REQUIRES_ARM_NEON; 1862 for (uint32_t n = 16; n <= 24; n += 8) { 1863 for (size_t k = 1; k <= 40; k += 9) { 1864 GemmMicrokernelTester() 1865 .mr(4) 1866 .nr(8) 1867 .kr(1) 1868 .sr(1) 1869 .m(4) 1870 .n(n) 1871 .k(k) 1872 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1873 } 1874 } 1875 } 1876 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_div_8_strided_cn)1877 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_strided_cn) { 1878 TEST_REQUIRES_ARM_NEON; 1879 for (uint32_t n = 16; n <= 24; n += 8) { 1880 for (size_t k = 1; k <= 40; k += 9) { 1881 GemmMicrokernelTester() 1882 .mr(4) 1883 .nr(8) 1884 .kr(1) 1885 .sr(1) 1886 .m(4) 1887 .n(n) 1888 .k(k) 1889 .cn_stride(11) 1890 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1891 } 1892 } 1893 } 1894 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,n_div_8_strided_a)1895 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_strided_a) { 1896 TEST_REQUIRES_ARM_NEON; 1897 for (uint32_t n = 16; n <= 24; n += 8) { 1898 for (size_t k = 1; k <= 40; k += 9) { 1899 GemmMicrokernelTester() 1900 .mr(4) 1901 .nr(8) 1902 .kr(1) 1903 .sr(1) 1904 .m(4) 1905 .n(n) 1906 .k(k) 1907 .a_stride(43) 1908 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1909 } 1910 } 1911 } 1912 
// Sweeps n over 16..24 (step 8), k over 1..40 (step 9) and m over 1..4 (nested in that order), one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Sweeps k over 1..40 (step 9), n over 1..8 and m over 1..4 (nested in that order) with cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(11)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Single run with m=4, n=8, k=8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// Single run with m=4, n=8, k=8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// Single run with m=4, n=8, k=8 and cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cm_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      xnn_qu8_requantize_rndnu);
}

// m=4, n=8; sweeps k over 1..40 in steps of 9 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

// m=4, n=8; sweeps k over 1..40 in steps of 9 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k_size)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
        xnn_init_qu8_conv_minmax_rndnu_neon_params,
        xnn_qu8_requantize_rndnu);
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7,no_zero_point)2028 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A7, no_zero_point) { 2029 TEST_REQUIRES_ARM_NEON; 2030 for (size_t k = 1; k <= 40; k += 9) { 2031 GemmMicrokernelTester() 2032 .mr(4) 2033 .nr(8) 2034 .kr(1) 2035 .sr(1) 2036 .m(4) 2037 .n(8) 2038 .k(k) 2039 .a_zero_point(0) 2040 .b_zero_point(0) 2041 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2042 } 2043 } 2044 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 2045 2046 2047 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,k_eq_8)2048 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) { 2049 TEST_REQUIRES_ARM_NEON; 2050 GemmMicrokernelTester() 2051 .mr(4) 2052 .nr(8) 2053 .kr(1) 2054 .sr(1) 2055 .m(4) 2056 .n(8) 2057 .k(8) 2058 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2059 } 2060 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,strided_cn)2061 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) { 2062 TEST_REQUIRES_ARM_NEON; 2063 GemmMicrokernelTester() 2064 .mr(4) 2065 .nr(8) 2066 .kr(1) 2067 .sr(1) 2068 .m(4) 2069 .n(8) 2070 .k(8) 2071 .cn_stride(11) 2072 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2073 } 2074 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,k_eq_8_strided_a)2075 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_strided_a) { 2076 TEST_REQUIRES_ARM_NEON; 2077 GemmMicrokernelTester() 2078 .mr(4) 2079 .nr(8) 2080 .kr(1) 2081 .sr(1) 2082 .m(4) 2083 .n(8) 2084 .k(8) 2085 
.a_stride(11) 2086 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2087 } 2088 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,k_eq_8_subtile)2089 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) { 2090 TEST_REQUIRES_ARM_NEON; 2091 for (uint32_t n = 1; n <= 8; n++) { 2092 for (uint32_t m = 1; m <= 4; m++) { 2093 GemmMicrokernelTester() 2094 .mr(4) 2095 .nr(8) 2096 .kr(1) 2097 .sr(1) 2098 .m(m) 2099 .n(n) 2100 .k(8) 2101 .iterations(1) 2102 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2103 } 2104 } 2105 } 2106 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,k_eq_8_subtile_m)2107 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) { 2108 TEST_REQUIRES_ARM_NEON; 2109 for (uint32_t m = 1; m <= 4; m++) { 2110 GemmMicrokernelTester() 2111 .mr(4) 2112 .nr(8) 2113 .kr(1) 2114 .sr(1) 2115 .m(m) 2116 .n(8) 2117 .k(8) 2118 .iterations(1) 2119 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2120 } 2121 } 2122 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,k_eq_8_subtile_n)2123 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) { 2124 TEST_REQUIRES_ARM_NEON; 2125 for (uint32_t n = 1; n <= 8; n++) { 2126 GemmMicrokernelTester() 2127 .mr(4) 2128 .nr(8) 2129 .kr(1) 2130 .sr(1) 2131 .m(4) 2132 .n(n) 2133 .k(8) 2134 .iterations(1) 2135 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2136 } 2137 } 2138 
// Tests below exercise
// xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53
// configured with mr=4, nr=8, kr=1, sr=1.

// Full 4x8 tile, k in 1..7.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile, k in 1..7, with a_stride=11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in 1..7, n in 1..8, m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x8 tile, k in 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile, k in 9..15, with a_stride=19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in 9..15, n in 1..8, m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x8 tile, k in {16, 24, ..., 80}.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile, k in {16, 24, ..., 80}, with a_stride=83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in {16, 24, ..., 80}, n in 1..8, m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4, n in 9..15, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m=4, n in 9..15, k in {1, 10, 19, 28, 37}, with cn_stride=11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m=4, n in 9..15, k in {1, 10, 19, 28, 37}, with a_stride=43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in 9..15, k in {1, 10, 19, 28, 37}, m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m=4, n in {16, 24}, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m=4, n in {16, 24}, k in {1, 10, 19, 28, 37}, with cn_stride=11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m=4, n in {16, 24}, k in {1, 10, 19, 28, 37}, with a_stride=43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in {16, 24}, k in {1, 10, 19, 28, 37}, m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k in {1, 10, 19, 28, 37}, n in 1..8, m in 1..4, with cm_stride=11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x8 tile, k=8, with qmin=128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x8 tile, k=8, with qmax=128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) { 2488 TEST_REQUIRES_ARM_NEON; 2489 GemmMicrokernelTester() 2490 .mr(4) 2491 .nr(8) 2492 .kr(1) 2493 .sr(1) 2494 .m(4) 2495 .n(8) 2496 .k(8) 2497 .cm_stride(11) 2498 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2499 } 2500 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,no_a_zero_point)2501 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_a_zero_point) { 2502 TEST_REQUIRES_ARM_NEON; 2503 for (size_t k = 1; k <= 40; k += 9) { 2504 GemmMicrokernelTester() 2505 .mr(4) 2506 .nr(8) 2507 .kr(1) 2508 .sr(1) 2509 .m(4) 2510 .n(8) 2511 .k(k) 2512 .a_zero_point(0) 2513 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2514 } 2515 } 2516 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,no_b_zero_point)2517 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_b_zero_point) { 2518 TEST_REQUIRES_ARM_NEON; 2519 for (size_t k = 1; k <= 40; k += 9) { 2520 GemmMicrokernelTester() 2521 .mr(4) 2522 .nr(8) 2523 .kr(1) 2524 .sr(1) 2525 .m(4) 2526 .n(8) 2527 .k(k) 2528 .b_zero_point(0) 2529 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2530 } 2531 } 2532 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53,no_zero_point)2533 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_zero_point) { 2534 TEST_REQUIRES_ARM_NEON; 2535 for (size_t k = 1; k <= 40; k += 9) { 2536 GemmMicrokernelTester() 2537 .mr(4) 2538 .nr(8) 2539 .kr(1) 2540 .sr(1) 2541 .m(4) 2542 .n(8) 2543 .k(k) 2544 .a_zero_point(0) 2545 .b_zero_point(0) 2546 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2547 } 2548 } 2549 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 2550 2551 2552 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_eq_8)2553 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8) { 2554 TEST_REQUIRES_ARM_NEON; 2555 GemmMicrokernelTester() 2556 .mr(1) 2557 .nr(16) 2558 .kr(1) 2559 .sr(1) 2560 .m(1) 2561 .n(16) 2562 .k(8) 2563 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2564 } 2565 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,strided_cn)2566 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, strided_cn) { 2567 TEST_REQUIRES_ARM_NEON; 2568 GemmMicrokernelTester() 2569 .mr(1) 2570 .nr(16) 2571 .kr(1) 2572 .sr(1) 2573 .m(1) 2574 .n(16) 2575 .k(8) 2576 .cn_stride(19) 2577 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2578 } 2579 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_eq_8_strided_a)2580 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8_strided_a) { 2581 TEST_REQUIRES_ARM_NEON; 2582 GemmMicrokernelTester() 2583 .mr(1) 2584 .nr(16) 2585 .kr(1) 2586 .sr(1) 2587 .m(1) 2588 .n(16) 2589 .k(8) 2590 .a_stride(11) 2591 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2592 } 2593 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_eq_8_subtile)2594 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8_subtile) { 2595 TEST_REQUIRES_ARM_NEON; 2596 for (uint32_t n = 1; n <= 16; n++) { 2597 for (uint32_t m = 1; m <= 1; m++) { 2598 GemmMicrokernelTester() 2599 .mr(1) 2600 .nr(16) 2601 .kr(1) 2602 .sr(1) 2603 .m(m) 2604 .n(n) 2605 .k(8) 2606 .iterations(1) 2607 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2608 } 2609 } 2610 } 2611 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_eq_8_subtile_m)2612 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8_subtile_m) { 2613 TEST_REQUIRES_ARM_NEON; 2614 for (uint32_t m = 1; m <= 1; m++) { 2615 GemmMicrokernelTester() 2616 .mr(1) 2617 .nr(16) 2618 .kr(1) 2619 .sr(1) 2620 .m(m) 2621 .n(16) 2622 .k(8) 2623 .iterations(1) 2624 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2625 } 2626 } 2627 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_eq_8_subtile_n)2628 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_eq_8_subtile_n) { 2629 TEST_REQUIRES_ARM_NEON; 2630 for (uint32_t n = 1; n <= 16; n++) { 2631 GemmMicrokernelTester() 2632 .mr(1) 2633 .nr(16) 2634 .kr(1) 2635 .sr(1) 2636 .m(1) 2637 .n(n) 2638 .k(8) 2639 .iterations(1) 2640 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2641 } 2642 } 2643 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_lt_8)2644 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_lt_8) { 2645 TEST_REQUIRES_ARM_NEON; 2646 for (size_t k = 1; k < 8; k++) { 2647 GemmMicrokernelTester() 2648 .mr(1) 2649 .nr(16) 2650 .kr(1) 2651 .sr(1) 2652 .m(1) 2653 .n(16) 2654 .k(k) 2655 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2656 } 2657 } 2658 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_lt_8_strided_a)2659 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_lt_8_strided_a) { 2660 TEST_REQUIRES_ARM_NEON; 2661 for (size_t k = 1; k < 8; k++) { 2662 GemmMicrokernelTester() 2663 .mr(1) 2664 .nr(16) 2665 .kr(1) 2666 .sr(1) 2667 .m(1) 2668 .n(16) 2669 .k(k) 2670 .a_stride(11) 2671 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2672 } 2673 } 2674 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_lt_8_subtile)2675 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_lt_8_subtile) { 2676 TEST_REQUIRES_ARM_NEON; 2677 for (size_t k = 1; k < 8; k++) { 2678 for (uint32_t n = 1; n <= 16; n++) { 2679 for (uint32_t m = 1; m <= 1; m++) { 2680 GemmMicrokernelTester() 2681 .mr(1) 2682 .nr(16) 2683 .kr(1) 2684 .sr(1) 2685 .m(m) 2686 .n(n) 2687 .k(k) 2688 .iterations(1) 2689 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2690 } 2691 } 2692 } 2693 } 2694 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_gt_8)2695 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_gt_8) { 2696 TEST_REQUIRES_ARM_NEON; 2697 for (size_t k = 9; k < 16; k++) { 2698 GemmMicrokernelTester() 2699 .mr(1) 2700 .nr(16) 2701 .kr(1) 2702 .sr(1) 2703 .m(1) 2704 .n(16) 2705 .k(k) 2706 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2707 } 2708 } 2709 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_gt_8_strided_a)2710 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_gt_8_strided_a) { 2711 TEST_REQUIRES_ARM_NEON; 2712 for (size_t k = 9; k < 16; k++) { 2713 GemmMicrokernelTester() 2714 .mr(1) 2715 .nr(16) 2716 .kr(1) 2717 .sr(1) 2718 .m(1) 2719 .n(16) 2720 .k(k) 2721 .a_stride(19) 2722 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2723 } 2724 } 2725 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_gt_8_subtile)2726 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_gt_8_subtile) { 2727 TEST_REQUIRES_ARM_NEON; 2728 for (size_t k = 9; k < 16; k++) { 2729 for (uint32_t n = 1; n <= 16; n++) { 2730 for (uint32_t m = 1; m <= 1; m++) { 2731 
GemmMicrokernelTester() 2732 .mr(1) 2733 .nr(16) 2734 .kr(1) 2735 .sr(1) 2736 .m(m) 2737 .n(n) 2738 .k(k) 2739 .iterations(1) 2740 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2741 } 2742 } 2743 } 2744 } 2745 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_div_8)2746 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_div_8) { 2747 TEST_REQUIRES_ARM_NEON; 2748 for (size_t k = 16; k <= 80; k += 8) { 2749 GemmMicrokernelTester() 2750 .mr(1) 2751 .nr(16) 2752 .kr(1) 2753 .sr(1) 2754 .m(1) 2755 .n(16) 2756 .k(k) 2757 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2758 } 2759 } 2760 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_div_8_strided_a)2761 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_div_8_strided_a) { 2762 TEST_REQUIRES_ARM_NEON; 2763 for (size_t k = 16; k <= 80; k += 8) { 2764 GemmMicrokernelTester() 2765 .mr(1) 2766 .nr(16) 2767 .kr(1) 2768 .sr(1) 2769 .m(1) 2770 .n(16) 2771 .k(k) 2772 .a_stride(83) 2773 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2774 } 2775 } 2776 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,k_div_8_subtile)2777 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, k_div_8_subtile) { 2778 TEST_REQUIRES_ARM_NEON; 2779 for (size_t k = 16; k <= 80; k += 8) { 2780 for (uint32_t n = 1; n <= 16; n++) { 2781 for (uint32_t m = 1; m <= 1; m++) { 2782 GemmMicrokernelTester() 2783 .mr(1) 2784 .nr(16) 2785 .kr(1) 2786 .sr(1) 2787 .m(m) 2788 .n(n) 2789 .k(k) 2790 .iterations(1) 2791 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2792 } 2793 } 2794 } 2795 } 2796 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_gt_16)2797 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_gt_16) { 2798 
TEST_REQUIRES_ARM_NEON; 2799 for (uint32_t n = 17; n < 32; n++) { 2800 for (size_t k = 1; k <= 40; k += 9) { 2801 GemmMicrokernelTester() 2802 .mr(1) 2803 .nr(16) 2804 .kr(1) 2805 .sr(1) 2806 .m(1) 2807 .n(n) 2808 .k(k) 2809 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2810 } 2811 } 2812 } 2813 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_gt_16_strided_cn)2814 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_gt_16_strided_cn) { 2815 TEST_REQUIRES_ARM_NEON; 2816 for (uint32_t n = 17; n < 32; n++) { 2817 for (size_t k = 1; k <= 40; k += 9) { 2818 GemmMicrokernelTester() 2819 .mr(1) 2820 .nr(16) 2821 .kr(1) 2822 .sr(1) 2823 .m(1) 2824 .n(n) 2825 .k(k) 2826 .cn_stride(19) 2827 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2828 } 2829 } 2830 } 2831 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_gt_16_strided_a)2832 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_gt_16_strided_a) { 2833 TEST_REQUIRES_ARM_NEON; 2834 for (uint32_t n = 17; n < 32; n++) { 2835 for (size_t k = 1; k <= 40; k += 9) { 2836 GemmMicrokernelTester() 2837 .mr(1) 2838 .nr(16) 2839 .kr(1) 2840 .sr(1) 2841 .m(1) 2842 .n(n) 2843 .k(k) 2844 .a_stride(43) 2845 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2846 } 2847 } 2848 } 2849 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_gt_16_subtile)2850 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_gt_16_subtile) { 2851 TEST_REQUIRES_ARM_NEON; 2852 for (uint32_t n = 17; n < 32; n++) { 2853 for (size_t k = 1; k <= 40; k += 9) { 2854 for (uint32_t m = 1; m <= 1; m++) { 2855 GemmMicrokernelTester() 2856 .mr(1) 2857 .nr(16) 2858 .kr(1) 2859 .sr(1) 2860 .m(m) 2861 .n(n) 2862 .k(k) 2863 .iterations(1) 2864 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2865 } 2866 } 2867 } 2868 } 2869 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_div_16)2870 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_div_16) { 2871 TEST_REQUIRES_ARM_NEON; 2872 for (uint32_t n = 32; n <= 48; n += 16) { 2873 for (size_t k = 1; k <= 40; k += 9) { 2874 GemmMicrokernelTester() 2875 .mr(1) 2876 .nr(16) 2877 .kr(1) 2878 .sr(1) 2879 .m(1) 2880 .n(n) 2881 .k(k) 2882 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2883 } 2884 } 2885 } 2886 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_div_16_strided_cn)2887 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_div_16_strided_cn) { 2888 TEST_REQUIRES_ARM_NEON; 2889 for (uint32_t n = 32; n <= 48; n += 16) { 2890 for (size_t k = 1; k <= 40; k += 9) { 2891 GemmMicrokernelTester() 2892 .mr(1) 2893 .nr(16) 2894 .kr(1) 2895 .sr(1) 2896 .m(1) 2897 .n(n) 2898 .k(k) 2899 .cn_stride(19) 2900 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2901 } 2902 } 2903 } 2904 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_div_16_strided_a)2905 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_div_16_strided_a) { 2906 TEST_REQUIRES_ARM_NEON; 2907 for (uint32_t n = 32; n <= 48; n += 16) { 2908 for (size_t k = 1; k <= 40; k += 9) { 2909 GemmMicrokernelTester() 2910 .mr(1) 2911 .nr(16) 2912 .kr(1) 2913 .sr(1) 2914 .m(1) 2915 .n(n) 2916 .k(k) 2917 .a_stride(43) 2918 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2919 } 2920 } 2921 } 2922 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,n_div_16_subtile)2923 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, n_div_16_subtile) { 2924 TEST_REQUIRES_ARM_NEON; 2925 for (uint32_t n = 32; n <= 48; n += 16) { 2926 for (size_t k = 1; k <= 40; k += 9) { 2927 
for (uint32_t m = 1; m <= 1; m++) { 2928 GemmMicrokernelTester() 2929 .mr(1) 2930 .nr(16) 2931 .kr(1) 2932 .sr(1) 2933 .m(m) 2934 .n(n) 2935 .k(k) 2936 .iterations(1) 2937 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2938 } 2939 } 2940 } 2941 } 2942 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,strided_cm_subtile)2943 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, strided_cm_subtile) { 2944 TEST_REQUIRES_ARM_NEON; 2945 for (size_t k = 1; k <= 40; k += 9) { 2946 for (uint32_t n = 1; n <= 16; n++) { 2947 for (uint32_t m = 1; m <= 1; m++) { 2948 GemmMicrokernelTester() 2949 .mr(1) 2950 .nr(16) 2951 .kr(1) 2952 .sr(1) 2953 .m(m) 2954 .n(n) 2955 .k(k) 2956 .cm_stride(19) 2957 .iterations(1) 2958 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2959 } 2960 } 2961 } 2962 } 2963 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,qmin)2964 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, qmin) { 2965 TEST_REQUIRES_ARM_NEON; 2966 GemmMicrokernelTester() 2967 .mr(1) 2968 .nr(16) 2969 .kr(1) 2970 .sr(1) 2971 .m(1) 2972 .n(16) 2973 .k(8) 2974 .qmin(128) 2975 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2976 } 2977 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,qmax)2978 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, qmax) { 2979 TEST_REQUIRES_ARM_NEON; 2980 GemmMicrokernelTester() 2981 .mr(1) 2982 .nr(16) 2983 .kr(1) 2984 .sr(1) 2985 .m(1) 2986 .n(16) 2987 .k(8) 2988 .qmax(128) 2989 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2990 } 2991 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,strided_cm)2992 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, strided_cm) { 2993 TEST_REQUIRES_ARM_NEON; 2994 GemmMicrokernelTester() 2995 .mr(1) 
2996 .nr(16) 2997 .kr(1) 2998 .sr(1) 2999 .m(1) 3000 .n(16) 3001 .k(8) 3002 .cm_stride(19) 3003 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3004 } 3005 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,no_a_zero_point)3006 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, no_a_zero_point) { 3007 TEST_REQUIRES_ARM_NEON; 3008 for (size_t k = 1; k <= 40; k += 9) { 3009 GemmMicrokernelTester() 3010 .mr(1) 3011 .nr(16) 3012 .kr(1) 3013 .sr(1) 3014 .m(1) 3015 .n(16) 3016 .k(k) 3017 .a_zero_point(0) 3018 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3019 } 3020 } 3021 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,no_b_zero_point)3022 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, no_b_zero_point) { 3023 TEST_REQUIRES_ARM_NEON; 3024 for (size_t k = 1; k <= 40; k += 9) { 3025 GemmMicrokernelTester() 3026 .mr(1) 3027 .nr(16) 3028 .kr(1) 3029 .sr(1) 3030 .m(1) 3031 .n(16) 3032 .k(k) 3033 .b_zero_point(0) 3034 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3035 } 3036 } 3037 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE,no_zero_point)3038 TEST(QU8_GEMM_MINMAX_RNDNU_1X16__NEON_MLAL_LANE, no_zero_point) { 3039 TEST_REQUIRES_ARM_NEON; 3040 for (size_t k = 1; k <= 40; k += 9) { 3041 GemmMicrokernelTester() 3042 .mr(1) 3043 .nr(16) 3044 .kr(1) 3045 .sr(1) 3046 .m(1) 3047 .n(16) 3048 .k(k) 3049 .a_zero_point(0) 3050 .b_zero_point(0) 3051 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3052 } 3053 } 3054 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 3055 3056 3057 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_eq_8)3058 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_eq_8) { 3059 TEST_REQUIRES_ARM_NEON; 
3060 GemmMicrokernelTester() 3061 .mr(2) 3062 .nr(8) 3063 .kr(1) 3064 .sr(1) 3065 .m(2) 3066 .n(8) 3067 .k(8) 3068 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3069 } 3070 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,strided_cn)3071 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, strided_cn) { 3072 TEST_REQUIRES_ARM_NEON; 3073 GemmMicrokernelTester() 3074 .mr(2) 3075 .nr(8) 3076 .kr(1) 3077 .sr(1) 3078 .m(2) 3079 .n(8) 3080 .k(8) 3081 .cn_stride(11) 3082 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3083 } 3084 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_eq_8_strided_a)3085 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_eq_8_strided_a) { 3086 TEST_REQUIRES_ARM_NEON; 3087 GemmMicrokernelTester() 3088 .mr(2) 3089 .nr(8) 3090 .kr(1) 3091 .sr(1) 3092 .m(2) 3093 .n(8) 3094 .k(8) 3095 .a_stride(11) 3096 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3097 } 3098 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_eq_8_subtile)3099 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_eq_8_subtile) { 3100 TEST_REQUIRES_ARM_NEON; 3101 for (uint32_t n = 1; n <= 8; n++) { 3102 for (uint32_t m = 1; m <= 2; m++) { 3103 GemmMicrokernelTester() 3104 .mr(2) 3105 .nr(8) 3106 .kr(1) 3107 .sr(1) 3108 .m(m) 3109 .n(n) 3110 .k(8) 3111 .iterations(1) 3112 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3113 } 3114 } 3115 } 3116 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_eq_8_subtile_m)3117 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_eq_8_subtile_m) { 3118 TEST_REQUIRES_ARM_NEON; 3119 for (uint32_t m = 1; m <= 2; m++) { 3120 GemmMicrokernelTester() 3121 .mr(2) 3122 .nr(8) 3123 .kr(1) 3124 .sr(1) 3125 .m(m) 3126 .n(8) 3127 .k(8) 3128 
.iterations(1) 3129 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3130 } 3131 } 3132 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_eq_8_subtile_n)3133 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_eq_8_subtile_n) { 3134 TEST_REQUIRES_ARM_NEON; 3135 for (uint32_t n = 1; n <= 8; n++) { 3136 GemmMicrokernelTester() 3137 .mr(2) 3138 .nr(8) 3139 .kr(1) 3140 .sr(1) 3141 .m(2) 3142 .n(n) 3143 .k(8) 3144 .iterations(1) 3145 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3146 } 3147 } 3148 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_lt_8)3149 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_lt_8) { 3150 TEST_REQUIRES_ARM_NEON; 3151 for (size_t k = 1; k < 8; k++) { 3152 GemmMicrokernelTester() 3153 .mr(2) 3154 .nr(8) 3155 .kr(1) 3156 .sr(1) 3157 .m(2) 3158 .n(8) 3159 .k(k) 3160 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3161 } 3162 } 3163 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_lt_8_strided_a)3164 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_lt_8_strided_a) { 3165 TEST_REQUIRES_ARM_NEON; 3166 for (size_t k = 1; k < 8; k++) { 3167 GemmMicrokernelTester() 3168 .mr(2) 3169 .nr(8) 3170 .kr(1) 3171 .sr(1) 3172 .m(2) 3173 .n(8) 3174 .k(k) 3175 .a_stride(11) 3176 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3177 } 3178 } 3179 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_lt_8_subtile)3180 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_lt_8_subtile) { 3181 TEST_REQUIRES_ARM_NEON; 3182 for (size_t k = 1; k < 8; k++) { 3183 for (uint32_t n = 1; n <= 8; n++) { 3184 for (uint32_t m = 1; m <= 2; m++) { 3185 GemmMicrokernelTester() 3186 .mr(2) 3187 .nr(8) 3188 .kr(1) 3189 .sr(1) 3190 .m(m) 3191 .n(n) 3192 .k(k) 3193 
.iterations(1) 3194 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3195 } 3196 } 3197 } 3198 } 3199 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_gt_8)3200 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_gt_8) { 3201 TEST_REQUIRES_ARM_NEON; 3202 for (size_t k = 9; k < 16; k++) { 3203 GemmMicrokernelTester() 3204 .mr(2) 3205 .nr(8) 3206 .kr(1) 3207 .sr(1) 3208 .m(2) 3209 .n(8) 3210 .k(k) 3211 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3212 } 3213 } 3214 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_gt_8_strided_a)3215 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_gt_8_strided_a) { 3216 TEST_REQUIRES_ARM_NEON; 3217 for (size_t k = 9; k < 16; k++) { 3218 GemmMicrokernelTester() 3219 .mr(2) 3220 .nr(8) 3221 .kr(1) 3222 .sr(1) 3223 .m(2) 3224 .n(8) 3225 .k(k) 3226 .a_stride(19) 3227 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3228 } 3229 } 3230 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_gt_8_subtile)3231 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_gt_8_subtile) { 3232 TEST_REQUIRES_ARM_NEON; 3233 for (size_t k = 9; k < 16; k++) { 3234 for (uint32_t n = 1; n <= 8; n++) { 3235 for (uint32_t m = 1; m <= 2; m++) { 3236 GemmMicrokernelTester() 3237 .mr(2) 3238 .nr(8) 3239 .kr(1) 3240 .sr(1) 3241 .m(m) 3242 .n(n) 3243 .k(k) 3244 .iterations(1) 3245 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3246 } 3247 } 3248 } 3249 } 3250 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_div_8)3251 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_div_8) { 3252 TEST_REQUIRES_ARM_NEON; 3253 for (size_t k = 16; k <= 80; k += 8) { 3254 GemmMicrokernelTester() 3255 .mr(2) 3256 .nr(8) 3257 .kr(1) 3258 .sr(1) 3259 .m(2) 3260 .n(8) 
3261 .k(k) 3262 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3263 } 3264 } 3265 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_div_8_strided_a)3266 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_div_8_strided_a) { 3267 TEST_REQUIRES_ARM_NEON; 3268 for (size_t k = 16; k <= 80; k += 8) { 3269 GemmMicrokernelTester() 3270 .mr(2) 3271 .nr(8) 3272 .kr(1) 3273 .sr(1) 3274 .m(2) 3275 .n(8) 3276 .k(k) 3277 .a_stride(83) 3278 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3279 } 3280 } 3281 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,k_div_8_subtile)3282 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, k_div_8_subtile) { 3283 TEST_REQUIRES_ARM_NEON; 3284 for (size_t k = 16; k <= 80; k += 8) { 3285 for (uint32_t n = 1; n <= 8; n++) { 3286 for (uint32_t m = 1; m <= 2; m++) { 3287 GemmMicrokernelTester() 3288 .mr(2) 3289 .nr(8) 3290 .kr(1) 3291 .sr(1) 3292 .m(m) 3293 .n(n) 3294 .k(k) 3295 .iterations(1) 3296 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3297 } 3298 } 3299 } 3300 } 3301 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_gt_8)3302 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_gt_8) { 3303 TEST_REQUIRES_ARM_NEON; 3304 for (uint32_t n = 9; n < 16; n++) { 3305 for (size_t k = 1; k <= 40; k += 9) { 3306 GemmMicrokernelTester() 3307 .mr(2) 3308 .nr(8) 3309 .kr(1) 3310 .sr(1) 3311 .m(2) 3312 .n(n) 3313 .k(k) 3314 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3315 } 3316 } 3317 } 3318 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_gt_8_strided_cn)3319 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_gt_8_strided_cn) { 3320 TEST_REQUIRES_ARM_NEON; 3321 for (uint32_t n = 9; n < 16; n++) { 3322 for (size_t k = 1; k <= 40; 
k += 9) { 3323 GemmMicrokernelTester() 3324 .mr(2) 3325 .nr(8) 3326 .kr(1) 3327 .sr(1) 3328 .m(2) 3329 .n(n) 3330 .k(k) 3331 .cn_stride(11) 3332 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3333 } 3334 } 3335 } 3336 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_gt_8_strided_a)3337 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_gt_8_strided_a) { 3338 TEST_REQUIRES_ARM_NEON; 3339 for (uint32_t n = 9; n < 16; n++) { 3340 for (size_t k = 1; k <= 40; k += 9) { 3341 GemmMicrokernelTester() 3342 .mr(2) 3343 .nr(8) 3344 .kr(1) 3345 .sr(1) 3346 .m(2) 3347 .n(n) 3348 .k(k) 3349 .a_stride(43) 3350 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3351 } 3352 } 3353 } 3354 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_gt_8_subtile)3355 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_gt_8_subtile) { 3356 TEST_REQUIRES_ARM_NEON; 3357 for (uint32_t n = 9; n < 16; n++) { 3358 for (size_t k = 1; k <= 40; k += 9) { 3359 for (uint32_t m = 1; m <= 2; m++) { 3360 GemmMicrokernelTester() 3361 .mr(2) 3362 .nr(8) 3363 .kr(1) 3364 .sr(1) 3365 .m(m) 3366 .n(n) 3367 .k(k) 3368 .iterations(1) 3369 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3370 } 3371 } 3372 } 3373 } 3374 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_div_8)3375 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_div_8) { 3376 TEST_REQUIRES_ARM_NEON; 3377 for (uint32_t n = 16; n <= 24; n += 8) { 3378 for (size_t k = 1; k <= 40; k += 9) { 3379 GemmMicrokernelTester() 3380 .mr(2) 3381 .nr(8) 3382 .kr(1) 3383 .sr(1) 3384 .m(2) 3385 .n(n) 3386 .k(k) 3387 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3388 } 3389 } 3390 } 3391 
TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_div_8_strided_cn)3392 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_div_8_strided_cn) { 3393 TEST_REQUIRES_ARM_NEON; 3394 for (uint32_t n = 16; n <= 24; n += 8) { 3395 for (size_t k = 1; k <= 40; k += 9) { 3396 GemmMicrokernelTester() 3397 .mr(2) 3398 .nr(8) 3399 .kr(1) 3400 .sr(1) 3401 .m(2) 3402 .n(n) 3403 .k(k) 3404 .cn_stride(11) 3405 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3406 } 3407 } 3408 } 3409 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_div_8_strided_a)3410 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_div_8_strided_a) { 3411 TEST_REQUIRES_ARM_NEON; 3412 for (uint32_t n = 16; n <= 24; n += 8) { 3413 for (size_t k = 1; k <= 40; k += 9) { 3414 GemmMicrokernelTester() 3415 .mr(2) 3416 .nr(8) 3417 .kr(1) 3418 .sr(1) 3419 .m(2) 3420 .n(n) 3421 .k(k) 3422 .a_stride(43) 3423 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3424 } 3425 } 3426 } 3427 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,n_div_8_subtile)3428 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, n_div_8_subtile) { 3429 TEST_REQUIRES_ARM_NEON; 3430 for (uint32_t n = 16; n <= 24; n += 8) { 3431 for (size_t k = 1; k <= 40; k += 9) { 3432 for (uint32_t m = 1; m <= 2; m++) { 3433 GemmMicrokernelTester() 3434 .mr(2) 3435 .nr(8) 3436 .kr(1) 3437 .sr(1) 3438 .m(m) 3439 .n(n) 3440 .k(k) 3441 .iterations(1) 3442 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3443 } 3444 } 3445 } 3446 } 3447 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,strided_cm_subtile)3448 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, strided_cm_subtile) { 3449 TEST_REQUIRES_ARM_NEON; 3450 for (size_t k = 1; k <= 40; k += 9) { 3451 for (uint32_t n = 1; n <= 8; n++) { 3452 for (uint32_t m = 1; m <= 2; m++) { 3453 
GemmMicrokernelTester() 3454 .mr(2) 3455 .nr(8) 3456 .kr(1) 3457 .sr(1) 3458 .m(m) 3459 .n(n) 3460 .k(k) 3461 .cm_stride(11) 3462 .iterations(1) 3463 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3464 } 3465 } 3466 } 3467 } 3468 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,qmin)3469 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, qmin) { 3470 TEST_REQUIRES_ARM_NEON; 3471 GemmMicrokernelTester() 3472 .mr(2) 3473 .nr(8) 3474 .kr(1) 3475 .sr(1) 3476 .m(2) 3477 .n(8) 3478 .k(8) 3479 .qmin(128) 3480 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3481 } 3482 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,qmax)3483 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, qmax) { 3484 TEST_REQUIRES_ARM_NEON; 3485 GemmMicrokernelTester() 3486 .mr(2) 3487 .nr(8) 3488 .kr(1) 3489 .sr(1) 3490 .m(2) 3491 .n(8) 3492 .k(8) 3493 .qmax(128) 3494 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3495 } 3496 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,strided_cm)3497 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, strided_cm) { 3498 TEST_REQUIRES_ARM_NEON; 3499 GemmMicrokernelTester() 3500 .mr(2) 3501 .nr(8) 3502 .kr(1) 3503 .sr(1) 3504 .m(2) 3505 .n(8) 3506 .k(8) 3507 .cm_stride(11) 3508 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3509 } 3510 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,no_a_zero_point)3511 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, no_a_zero_point) { 3512 TEST_REQUIRES_ARM_NEON; 3513 for (size_t k = 1; k <= 40; k += 9) { 3514 GemmMicrokernelTester() 3515 .mr(2) 3516 .nr(8) 3517 .kr(1) 3518 .sr(1) 3519 .m(2) 3520 .n(8) 3521 .k(k) 3522 .a_zero_point(0) 3523 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3524 } 3525 } 3526 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,no_b_zero_point)3527 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, no_b_zero_point) { 3528 TEST_REQUIRES_ARM_NEON; 3529 for (size_t k = 1; k <= 40; k += 9) { 3530 GemmMicrokernelTester() 3531 .mr(2) 3532 .nr(8) 3533 .kr(1) 3534 .sr(1) 3535 .m(2) 3536 .n(8) 3537 .k(k) 3538 .b_zero_point(0) 3539 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3540 } 3541 } 3542 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE,no_zero_point)3543 TEST(QU8_GEMM_MINMAX_RNDNU_2X8__NEON_MLAL_LANE, no_zero_point) { 3544 TEST_REQUIRES_ARM_NEON; 3545 for (size_t k = 1; k <= 40; k += 9) { 3546 GemmMicrokernelTester() 3547 .mr(2) 3548 .nr(8) 3549 .kr(1) 3550 .sr(1) 3551 .m(2) 3552 .n(8) 3553 .k(k) 3554 .a_zero_point(0) 3555 .b_zero_point(0) 3556 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3557 } 3558 } 3559 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 3560 3561 3562 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_eq_8)3563 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8) { 3564 TEST_REQUIRES_ARM_NEON; 3565 GemmMicrokernelTester() 3566 .mr(2) 3567 .nr(16) 3568 .kr(1) 3569 .sr(1) 3570 .m(2) 3571 .n(16) 3572 .k(8) 3573 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3574 } 3575 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,strided_cn)3576 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cn) { 3577 TEST_REQUIRES_ARM_NEON; 3578 GemmMicrokernelTester() 3579 .mr(2) 3580 .nr(16) 3581 .kr(1) 3582 .sr(1) 3583 .m(2) 3584 .n(16) 3585 .k(8) 3586 .cn_stride(19) 3587 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 3588 } 3589 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_eq_8_strided_a)3590 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_strided_a) { 3591 TEST_REQUIRES_ARM_NEON; 3592 GemmMicrokernelTester() 3593 .mr(2) 3594 .nr(16) 3595 .kr(1) 3596 .sr(1) 3597 .m(2) 3598 .n(16) 3599 .k(8) 3600 .a_stride(11) 3601 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3602 } 3603 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_eq_8_subtile)3604 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile) { 3605 TEST_REQUIRES_ARM_NEON; 3606 for (uint32_t n = 1; n <= 16; n++) { 3607 for (uint32_t m = 1; m <= 2; m++) { 3608 GemmMicrokernelTester() 3609 .mr(2) 3610 .nr(16) 3611 .kr(1) 3612 .sr(1) 3613 .m(m) 3614 .n(n) 3615 .k(8) 3616 .iterations(1) 3617 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3618 } 3619 } 3620 } 3621 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_eq_8_subtile_m)3622 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile_m) { 3623 TEST_REQUIRES_ARM_NEON; 3624 for (uint32_t m = 1; m <= 2; m++) { 3625 GemmMicrokernelTester() 3626 .mr(2) 3627 .nr(16) 3628 .kr(1) 3629 .sr(1) 3630 .m(m) 3631 .n(16) 3632 .k(8) 3633 .iterations(1) 3634 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3635 } 3636 } 3637 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_eq_8_subtile_n)3638 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_eq_8_subtile_n) { 3639 TEST_REQUIRES_ARM_NEON; 3640 for (uint32_t n = 1; n <= 16; n++) { 3641 GemmMicrokernelTester() 3642 .mr(2) 3643 .nr(16) 3644 .kr(1) 3645 .sr(1) 3646 .m(2) 3647 .n(n) 3648 .k(8) 3649 .iterations(1) 3650 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 3651 } 3652 } 3653 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_lt_8)3654 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_lt_8) { 3655 TEST_REQUIRES_ARM_NEON; 3656 for (size_t k = 1; k < 8; k++) { 3657 GemmMicrokernelTester() 3658 .mr(2) 3659 .nr(16) 3660 .kr(1) 3661 .sr(1) 3662 .m(2) 3663 .n(16) 3664 .k(k) 3665 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3666 } 3667 } 3668 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_lt_8_strided_a)3669 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_lt_8_strided_a) { 3670 TEST_REQUIRES_ARM_NEON; 3671 for (size_t k = 1; k < 8; k++) { 3672 GemmMicrokernelTester() 3673 .mr(2) 3674 .nr(16) 3675 .kr(1) 3676 .sr(1) 3677 .m(2) 3678 .n(16) 3679 .k(k) 3680 .a_stride(11) 3681 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3682 } 3683 } 3684 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_lt_8_subtile)3685 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_lt_8_subtile) { 3686 TEST_REQUIRES_ARM_NEON; 3687 for (size_t k = 1; k < 8; k++) { 3688 for (uint32_t n = 1; n <= 16; n++) { 3689 for (uint32_t m = 1; m <= 2; m++) { 3690 GemmMicrokernelTester() 3691 .mr(2) 3692 .nr(16) 3693 .kr(1) 3694 .sr(1) 3695 .m(m) 3696 .n(n) 3697 .k(k) 3698 .iterations(1) 3699 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3700 } 3701 } 3702 } 3703 } 3704 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_gt_8)3705 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_gt_8) { 3706 TEST_REQUIRES_ARM_NEON; 3707 for (size_t k = 9; k < 16; k++) { 3708 GemmMicrokernelTester() 3709 .mr(2) 3710 .nr(16) 3711 .kr(1) 3712 .sr(1) 3713 .m(2) 3714 .n(16) 3715 .k(k) 3716 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 3717 } 3718 } 3719 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_gt_8_strided_a)3720 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_gt_8_strided_a) { 3721 TEST_REQUIRES_ARM_NEON; 3722 for (size_t k = 9; k < 16; k++) { 3723 GemmMicrokernelTester() 3724 .mr(2) 3725 .nr(16) 3726 .kr(1) 3727 .sr(1) 3728 .m(2) 3729 .n(16) 3730 .k(k) 3731 .a_stride(19) 3732 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3733 } 3734 } 3735 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_gt_8_subtile)3736 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_gt_8_subtile) { 3737 TEST_REQUIRES_ARM_NEON; 3738 for (size_t k = 9; k < 16; k++) { 3739 for (uint32_t n = 1; n <= 16; n++) { 3740 for (uint32_t m = 1; m <= 2; m++) { 3741 GemmMicrokernelTester() 3742 .mr(2) 3743 .nr(16) 3744 .kr(1) 3745 .sr(1) 3746 .m(m) 3747 .n(n) 3748 .k(k) 3749 .iterations(1) 3750 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3751 } 3752 } 3753 } 3754 } 3755 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_div_8)3756 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_div_8) { 3757 TEST_REQUIRES_ARM_NEON; 3758 for (size_t k = 16; k <= 80; k += 8) { 3759 GemmMicrokernelTester() 3760 .mr(2) 3761 .nr(16) 3762 .kr(1) 3763 .sr(1) 3764 .m(2) 3765 .n(16) 3766 .k(k) 3767 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3768 } 3769 } 3770 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_div_8_strided_a)3771 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_div_8_strided_a) { 3772 TEST_REQUIRES_ARM_NEON; 3773 for (size_t k = 16; k <= 80; k += 8) { 3774 GemmMicrokernelTester() 3775 .mr(2) 3776 .nr(16) 3777 .kr(1) 3778 .sr(1) 3779 .m(2) 3780 .n(16) 3781 .k(k) 3782 .a_stride(83) 3783 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3784 } 3785 } 3786 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,k_div_8_subtile)3787 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, k_div_8_subtile) { 3788 TEST_REQUIRES_ARM_NEON; 3789 for (size_t k = 16; k <= 80; k += 8) { 3790 for (uint32_t n = 1; n <= 16; n++) { 3791 for (uint32_t m = 1; m <= 2; m++) { 3792 GemmMicrokernelTester() 3793 .mr(2) 3794 .nr(16) 3795 .kr(1) 3796 .sr(1) 3797 .m(m) 3798 .n(n) 3799 .k(k) 3800 .iterations(1) 3801 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3802 } 3803 } 3804 } 3805 } 3806 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_gt_16)3807 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16) { 3808 TEST_REQUIRES_ARM_NEON; 3809 for (uint32_t n = 17; n < 32; n++) { 3810 for (size_t k = 1; k <= 40; k += 9) { 3811 GemmMicrokernelTester() 3812 .mr(2) 3813 .nr(16) 3814 .kr(1) 3815 .sr(1) 3816 .m(2) 3817 .n(n) 3818 .k(k) 3819 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3820 } 3821 } 3822 } 3823 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_gt_16_strided_cn)3824 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_strided_cn) { 3825 TEST_REQUIRES_ARM_NEON; 3826 for (uint32_t n = 17; n < 32; n++) { 3827 for (size_t k = 1; k <= 40; k += 9) { 3828 GemmMicrokernelTester() 3829 .mr(2) 3830 .nr(16) 3831 .kr(1) 3832 .sr(1) 3833 .m(2) 3834 .n(n) 3835 .k(k) 3836 .cn_stride(19) 3837 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3838 } 3839 } 3840 } 3841 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_gt_16_strided_a)3842 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_strided_a) { 3843 TEST_REQUIRES_ARM_NEON; 3844 for (uint32_t n 
= 17; n < 32; n++) { 3845 for (size_t k = 1; k <= 40; k += 9) { 3846 GemmMicrokernelTester() 3847 .mr(2) 3848 .nr(16) 3849 .kr(1) 3850 .sr(1) 3851 .m(2) 3852 .n(n) 3853 .k(k) 3854 .a_stride(43) 3855 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3856 } 3857 } 3858 } 3859 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_gt_16_subtile)3860 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_gt_16_subtile) { 3861 TEST_REQUIRES_ARM_NEON; 3862 for (uint32_t n = 17; n < 32; n++) { 3863 for (size_t k = 1; k <= 40; k += 9) { 3864 for (uint32_t m = 1; m <= 2; m++) { 3865 GemmMicrokernelTester() 3866 .mr(2) 3867 .nr(16) 3868 .kr(1) 3869 .sr(1) 3870 .m(m) 3871 .n(n) 3872 .k(k) 3873 .iterations(1) 3874 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3875 } 3876 } 3877 } 3878 } 3879 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_div_16)3880 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16) { 3881 TEST_REQUIRES_ARM_NEON; 3882 for (uint32_t n = 32; n <= 48; n += 16) { 3883 for (size_t k = 1; k <= 40; k += 9) { 3884 GemmMicrokernelTester() 3885 .mr(2) 3886 .nr(16) 3887 .kr(1) 3888 .sr(1) 3889 .m(2) 3890 .n(n) 3891 .k(k) 3892 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3893 } 3894 } 3895 } 3896 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_div_16_strided_cn)3897 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_strided_cn) { 3898 TEST_REQUIRES_ARM_NEON; 3899 for (uint32_t n = 32; n <= 48; n += 16) { 3900 for (size_t k = 1; k <= 40; k += 9) { 3901 GemmMicrokernelTester() 3902 .mr(2) 3903 .nr(16) 3904 .kr(1) 3905 .sr(1) 3906 .m(2) 3907 .n(n) 3908 .k(k) 3909 .cn_stride(19) 3910 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 3911 } 3912 } 3913 } 3914 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_div_16_strided_a)3915 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_strided_a) { 3916 TEST_REQUIRES_ARM_NEON; 3917 for (uint32_t n = 32; n <= 48; n += 16) { 3918 for (size_t k = 1; k <= 40; k += 9) { 3919 GemmMicrokernelTester() 3920 .mr(2) 3921 .nr(16) 3922 .kr(1) 3923 .sr(1) 3924 .m(2) 3925 .n(n) 3926 .k(k) 3927 .a_stride(43) 3928 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3929 } 3930 } 3931 } 3932 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,n_div_16_subtile)3933 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, n_div_16_subtile) { 3934 TEST_REQUIRES_ARM_NEON; 3935 for (uint32_t n = 32; n <= 48; n += 16) { 3936 for (size_t k = 1; k <= 40; k += 9) { 3937 for (uint32_t m = 1; m <= 2; m++) { 3938 GemmMicrokernelTester() 3939 .mr(2) 3940 .nr(16) 3941 .kr(1) 3942 .sr(1) 3943 .m(m) 3944 .n(n) 3945 .k(k) 3946 .iterations(1) 3947 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3948 } 3949 } 3950 } 3951 } 3952 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,strided_cm_subtile)3953 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cm_subtile) { 3954 TEST_REQUIRES_ARM_NEON; 3955 for (size_t k = 1; k <= 40; k += 9) { 3956 for (uint32_t n = 1; n <= 16; n++) { 3957 for (uint32_t m = 1; m <= 2; m++) { 3958 GemmMicrokernelTester() 3959 .mr(2) 3960 .nr(16) 3961 .kr(1) 3962 .sr(1) 3963 .m(m) 3964 .n(n) 3965 .k(k) 3966 .cm_stride(19) 3967 .iterations(1) 3968 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3969 } 3970 } 3971 } 3972 } 3973 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,qmin)3974 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, qmin) { 3975 TEST_REQUIRES_ARM_NEON; 3976 
GemmMicrokernelTester() 3977 .mr(2) 3978 .nr(16) 3979 .kr(1) 3980 .sr(1) 3981 .m(2) 3982 .n(16) 3983 .k(8) 3984 .qmin(128) 3985 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3986 } 3987 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,qmax)3988 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, qmax) { 3989 TEST_REQUIRES_ARM_NEON; 3990 GemmMicrokernelTester() 3991 .mr(2) 3992 .nr(16) 3993 .kr(1) 3994 .sr(1) 3995 .m(2) 3996 .n(16) 3997 .k(8) 3998 .qmax(128) 3999 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4000 } 4001 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,strided_cm)4002 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, strided_cm) { 4003 TEST_REQUIRES_ARM_NEON; 4004 GemmMicrokernelTester() 4005 .mr(2) 4006 .nr(16) 4007 .kr(1) 4008 .sr(1) 4009 .m(2) 4010 .n(16) 4011 .k(8) 4012 .cm_stride(19) 4013 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4014 } 4015 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,no_a_zero_point)4016 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, no_a_zero_point) { 4017 TEST_REQUIRES_ARM_NEON; 4018 for (size_t k = 1; k <= 40; k += 9) { 4019 GemmMicrokernelTester() 4020 .mr(2) 4021 .nr(16) 4022 .kr(1) 4023 .sr(1) 4024 .m(2) 4025 .n(16) 4026 .k(k) 4027 .a_zero_point(0) 4028 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4029 } 4030 } 4031 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,no_b_zero_point)4032 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, no_b_zero_point) { 4033 TEST_REQUIRES_ARM_NEON; 4034 for (size_t k = 1; k <= 40; k += 9) { 4035 GemmMicrokernelTester() 4036 .mr(2) 4037 .nr(16) 4038 .kr(1) 4039 .sr(1) 4040 .m(2) 4041 .n(16) 4042 .k(k) 4043 .b_zero_point(0) 4044 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4045 } 4046 } 4047 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE,no_zero_point)4048 TEST(QU8_GEMM_MINMAX_RNDNU_2X16__NEON_MLAL_LANE, no_zero_point) { 4049 TEST_REQUIRES_ARM_NEON; 4050 for (size_t k = 1; k <= 40; k += 9) { 4051 GemmMicrokernelTester() 4052 .mr(2) 4053 .nr(16) 4054 .kr(1) 4055 .sr(1) 4056 .m(2) 4057 .n(16) 4058 .k(k) 4059 .a_zero_point(0) 4060 .b_zero_point(0) 4061 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4062 } 4063 } 4064 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 4065 4066 4067 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_eq_8)4068 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_eq_8) { 4069 TEST_REQUIRES_ARM_NEON; 4070 GemmMicrokernelTester() 4071 .mr(3) 4072 .nr(8) 4073 .kr(1) 4074 .sr(1) 4075 .m(3) 4076 .n(8) 4077 .k(8) 4078 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4079 } 4080 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,strided_cn)4081 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, strided_cn) { 4082 TEST_REQUIRES_ARM_NEON; 4083 GemmMicrokernelTester() 4084 .mr(3) 4085 .nr(8) 4086 .kr(1) 4087 .sr(1) 4088 .m(3) 4089 .n(8) 4090 .k(8) 4091 .cn_stride(11) 4092 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4093 } 4094 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_eq_8_strided_a)4095 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_eq_8_strided_a) { 4096 TEST_REQUIRES_ARM_NEON; 4097 GemmMicrokernelTester() 4098 .mr(3) 4099 .nr(8) 4100 .kr(1) 4101 .sr(1) 4102 .m(3) 4103 .n(8) 4104 .k(8) 4105 .a_stride(11) 4106 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4107 } 4108 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_eq_8_subtile)4109 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_eq_8_subtile) { 4110 TEST_REQUIRES_ARM_NEON; 4111 for (uint32_t n = 1; n <= 8; n++) { 4112 for (uint32_t m = 1; m <= 3; m++) { 4113 GemmMicrokernelTester() 4114 .mr(3) 4115 .nr(8) 4116 .kr(1) 4117 .sr(1) 4118 .m(m) 4119 .n(n) 4120 .k(8) 4121 .iterations(1) 4122 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4123 } 4124 } 4125 } 4126 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_eq_8_subtile_m)4127 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_eq_8_subtile_m) { 4128 TEST_REQUIRES_ARM_NEON; 4129 for (uint32_t m = 1; m <= 3; m++) { 4130 GemmMicrokernelTester() 4131 .mr(3) 4132 .nr(8) 4133 .kr(1) 4134 .sr(1) 4135 .m(m) 4136 .n(8) 4137 .k(8) 4138 .iterations(1) 4139 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4140 } 4141 } 4142 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_eq_8_subtile_n)4143 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_eq_8_subtile_n) { 4144 TEST_REQUIRES_ARM_NEON; 4145 for (uint32_t n = 1; n <= 8; n++) { 4146 GemmMicrokernelTester() 4147 .mr(3) 4148 .nr(8) 4149 .kr(1) 4150 .sr(1) 4151 .m(3) 4152 .n(n) 4153 .k(8) 4154 .iterations(1) 4155 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4156 } 4157 } 4158 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_lt_8)4159 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_lt_8) { 4160 TEST_REQUIRES_ARM_NEON; 4161 for (size_t k = 1; k < 8; k++) { 4162 GemmMicrokernelTester() 4163 .mr(3) 4164 .nr(8) 4165 .kr(1) 4166 .sr(1) 4167 .m(3) 4168 .n(8) 4169 .k(k) 4170 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4171 } 4172 } 4173 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_lt_8_strided_a)4174 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_lt_8_strided_a) { 4175 TEST_REQUIRES_ARM_NEON; 4176 for (size_t k = 1; k < 8; k++) { 4177 GemmMicrokernelTester() 4178 .mr(3) 4179 .nr(8) 4180 .kr(1) 4181 .sr(1) 4182 .m(3) 4183 .n(8) 4184 .k(k) 4185 .a_stride(11) 4186 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4187 } 4188 } 4189 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_lt_8_subtile)4190 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_lt_8_subtile) { 4191 TEST_REQUIRES_ARM_NEON; 4192 for (size_t k = 1; k < 8; k++) { 4193 for (uint32_t n = 1; n <= 8; n++) { 4194 for (uint32_t m = 1; m <= 3; m++) { 4195 GemmMicrokernelTester() 4196 .mr(3) 4197 .nr(8) 4198 .kr(1) 4199 .sr(1) 4200 .m(m) 4201 .n(n) 4202 .k(k) 4203 .iterations(1) 4204 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4205 } 4206 } 4207 } 4208 } 4209 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_gt_8)4210 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_gt_8) { 4211 TEST_REQUIRES_ARM_NEON; 4212 for (size_t k = 9; k < 16; k++) { 4213 GemmMicrokernelTester() 4214 .mr(3) 4215 .nr(8) 4216 .kr(1) 4217 .sr(1) 4218 .m(3) 4219 .n(8) 4220 .k(k) 4221 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4222 } 4223 } 4224 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_gt_8_strided_a)4225 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_gt_8_strided_a) { 4226 TEST_REQUIRES_ARM_NEON; 4227 for (size_t k = 9; k < 16; k++) { 4228 GemmMicrokernelTester() 4229 .mr(3) 4230 .nr(8) 4231 .kr(1) 4232 .sr(1) 4233 .m(3) 4234 .n(8) 4235 .k(k) 4236 .a_stride(19) 4237 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4238 } 4239 } 4240 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_gt_8_subtile)4241 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_gt_8_subtile) { 4242 TEST_REQUIRES_ARM_NEON; 4243 for (size_t k = 9; k < 16; k++) { 4244 for (uint32_t n = 1; n <= 8; n++) { 4245 for (uint32_t m = 1; m <= 3; m++) { 4246 GemmMicrokernelTester() 4247 .mr(3) 4248 .nr(8) 4249 .kr(1) 4250 .sr(1) 4251 .m(m) 4252 .n(n) 4253 .k(k) 4254 .iterations(1) 4255 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4256 } 4257 } 4258 } 4259 } 4260 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_div_8)4261 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_div_8) { 4262 TEST_REQUIRES_ARM_NEON; 4263 for (size_t k = 16; k <= 80; k += 8) { 4264 GemmMicrokernelTester() 4265 .mr(3) 4266 .nr(8) 4267 .kr(1) 4268 .sr(1) 4269 .m(3) 4270 .n(8) 4271 .k(k) 4272 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4273 } 4274 } 4275 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_div_8_strided_a)4276 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_div_8_strided_a) { 4277 TEST_REQUIRES_ARM_NEON; 4278 for (size_t k = 16; k <= 80; k += 8) { 4279 GemmMicrokernelTester() 4280 .mr(3) 4281 .nr(8) 4282 .kr(1) 4283 .sr(1) 4284 .m(3) 4285 .n(8) 4286 .k(k) 4287 .a_stride(83) 4288 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4289 } 4290 } 4291 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,k_div_8_subtile)4292 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, k_div_8_subtile) { 4293 TEST_REQUIRES_ARM_NEON; 4294 for (size_t k = 16; k <= 80; k += 8) { 4295 for (uint32_t n = 1; n <= 8; n++) { 4296 for (uint32_t m = 1; m <= 3; m++) { 4297 
GemmMicrokernelTester() 4298 .mr(3) 4299 .nr(8) 4300 .kr(1) 4301 .sr(1) 4302 .m(m) 4303 .n(n) 4304 .k(k) 4305 .iterations(1) 4306 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4307 } 4308 } 4309 } 4310 } 4311 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_gt_8)4312 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_gt_8) { 4313 TEST_REQUIRES_ARM_NEON; 4314 for (uint32_t n = 9; n < 16; n++) { 4315 for (size_t k = 1; k <= 40; k += 9) { 4316 GemmMicrokernelTester() 4317 .mr(3) 4318 .nr(8) 4319 .kr(1) 4320 .sr(1) 4321 .m(3) 4322 .n(n) 4323 .k(k) 4324 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4325 } 4326 } 4327 } 4328 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_gt_8_strided_cn)4329 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_gt_8_strided_cn) { 4330 TEST_REQUIRES_ARM_NEON; 4331 for (uint32_t n = 9; n < 16; n++) { 4332 for (size_t k = 1; k <= 40; k += 9) { 4333 GemmMicrokernelTester() 4334 .mr(3) 4335 .nr(8) 4336 .kr(1) 4337 .sr(1) 4338 .m(3) 4339 .n(n) 4340 .k(k) 4341 .cn_stride(11) 4342 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4343 } 4344 } 4345 } 4346 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_gt_8_strided_a)4347 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_gt_8_strided_a) { 4348 TEST_REQUIRES_ARM_NEON; 4349 for (uint32_t n = 9; n < 16; n++) { 4350 for (size_t k = 1; k <= 40; k += 9) { 4351 GemmMicrokernelTester() 4352 .mr(3) 4353 .nr(8) 4354 .kr(1) 4355 .sr(1) 4356 .m(3) 4357 .n(n) 4358 .k(k) 4359 .a_stride(43) 4360 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4361 } 4362 } 4363 } 4364 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_gt_8_subtile)4365 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, 
n_gt_8_subtile) { 4366 TEST_REQUIRES_ARM_NEON; 4367 for (uint32_t n = 9; n < 16; n++) { 4368 for (size_t k = 1; k <= 40; k += 9) { 4369 for (uint32_t m = 1; m <= 3; m++) { 4370 GemmMicrokernelTester() 4371 .mr(3) 4372 .nr(8) 4373 .kr(1) 4374 .sr(1) 4375 .m(m) 4376 .n(n) 4377 .k(k) 4378 .iterations(1) 4379 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4380 } 4381 } 4382 } 4383 } 4384 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_div_8)4385 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_div_8) { 4386 TEST_REQUIRES_ARM_NEON; 4387 for (uint32_t n = 16; n <= 24; n += 8) { 4388 for (size_t k = 1; k <= 40; k += 9) { 4389 GemmMicrokernelTester() 4390 .mr(3) 4391 .nr(8) 4392 .kr(1) 4393 .sr(1) 4394 .m(3) 4395 .n(n) 4396 .k(k) 4397 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4398 } 4399 } 4400 } 4401 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_div_8_strided_cn)4402 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_div_8_strided_cn) { 4403 TEST_REQUIRES_ARM_NEON; 4404 for (uint32_t n = 16; n <= 24; n += 8) { 4405 for (size_t k = 1; k <= 40; k += 9) { 4406 GemmMicrokernelTester() 4407 .mr(3) 4408 .nr(8) 4409 .kr(1) 4410 .sr(1) 4411 .m(3) 4412 .n(n) 4413 .k(k) 4414 .cn_stride(11) 4415 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4416 } 4417 } 4418 } 4419 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_div_8_strided_a)4420 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_div_8_strided_a) { 4421 TEST_REQUIRES_ARM_NEON; 4422 for (uint32_t n = 16; n <= 24; n += 8) { 4423 for (size_t k = 1; k <= 40; k += 9) { 4424 GemmMicrokernelTester() 4425 .mr(3) 4426 .nr(8) 4427 .kr(1) 4428 .sr(1) 4429 .m(3) 4430 .n(n) 4431 .k(k) 4432 .a_stride(43) 4433 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4434 } 4435 } 4436 } 4437 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,n_div_8_subtile)4438 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, n_div_8_subtile) { 4439 TEST_REQUIRES_ARM_NEON; 4440 for (uint32_t n = 16; n <= 24; n += 8) { 4441 for (size_t k = 1; k <= 40; k += 9) { 4442 for (uint32_t m = 1; m <= 3; m++) { 4443 GemmMicrokernelTester() 4444 .mr(3) 4445 .nr(8) 4446 .kr(1) 4447 .sr(1) 4448 .m(m) 4449 .n(n) 4450 .k(k) 4451 .iterations(1) 4452 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4453 } 4454 } 4455 } 4456 } 4457 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,strided_cm_subtile)4458 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, strided_cm_subtile) { 4459 TEST_REQUIRES_ARM_NEON; 4460 for (size_t k = 1; k <= 40; k += 9) { 4461 for (uint32_t n = 1; n <= 8; n++) { 4462 for (uint32_t m = 1; m <= 3; m++) { 4463 GemmMicrokernelTester() 4464 .mr(3) 4465 .nr(8) 4466 .kr(1) 4467 .sr(1) 4468 .m(m) 4469 .n(n) 4470 .k(k) 4471 .cm_stride(11) 4472 .iterations(1) 4473 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4474 } 4475 } 4476 } 4477 } 4478 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,qmin)4479 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, qmin) { 4480 TEST_REQUIRES_ARM_NEON; 4481 GemmMicrokernelTester() 4482 .mr(3) 4483 .nr(8) 4484 .kr(1) 4485 .sr(1) 4486 .m(3) 4487 .n(8) 4488 .k(8) 4489 .qmin(128) 4490 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4491 } 4492 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,qmax)4493 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, qmax) { 4494 TEST_REQUIRES_ARM_NEON; 4495 GemmMicrokernelTester() 4496 .mr(3) 4497 .nr(8) 4498 .kr(1) 4499 .sr(1) 4500 .m(3) 4501 .n(8) 4502 .k(8) 4503 .qmax(128) 4504 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4505 } 4506 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,strided_cm)4507 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, strided_cm) { 4508 TEST_REQUIRES_ARM_NEON; 4509 GemmMicrokernelTester() 4510 .mr(3) 4511 .nr(8) 4512 .kr(1) 4513 .sr(1) 4514 .m(3) 4515 .n(8) 4516 .k(8) 4517 .cm_stride(11) 4518 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4519 } 4520 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,no_a_zero_point)4521 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, no_a_zero_point) { 4522 TEST_REQUIRES_ARM_NEON; 4523 for (size_t k = 1; k <= 40; k += 9) { 4524 GemmMicrokernelTester() 4525 .mr(3) 4526 .nr(8) 4527 .kr(1) 4528 .sr(1) 4529 .m(3) 4530 .n(8) 4531 .k(k) 4532 .a_zero_point(0) 4533 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4534 } 4535 } 4536 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,no_b_zero_point)4537 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, no_b_zero_point) { 4538 TEST_REQUIRES_ARM_NEON; 4539 for (size_t k = 1; k <= 40; k += 9) { 4540 GemmMicrokernelTester() 4541 .mr(3) 4542 .nr(8) 4543 .kr(1) 4544 .sr(1) 4545 .m(3) 4546 .n(8) 4547 .k(k) 4548 .b_zero_point(0) 4549 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4550 } 4551 } 4552 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE,no_zero_point)4553 TEST(QU8_GEMM_MINMAX_RNDNU_3X8__NEON_MLAL_LANE, no_zero_point) { 4554 TEST_REQUIRES_ARM_NEON; 4555 for (size_t k = 1; k <= 40; k += 9) { 4556 GemmMicrokernelTester() 4557 .mr(3) 4558 .nr(8) 4559 .kr(1) 4560 .sr(1) 4561 .m(3) 4562 .n(8) 4563 .k(k) 4564 .a_zero_point(0) 4565 .b_zero_point(0) 4566 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4567 } 4568 } 4569 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 4570 4571 4572 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_eq_8)4573 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8) { 4574 TEST_REQUIRES_ARM_NEON; 4575 GemmMicrokernelTester() 4576 .mr(3) 4577 .nr(16) 4578 .kr(1) 4579 .sr(1) 4580 .m(3) 4581 .n(16) 4582 .k(8) 4583 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4584 } 4585 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,strided_cn)4586 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cn) { 4587 TEST_REQUIRES_ARM_NEON; 4588 GemmMicrokernelTester() 4589 .mr(3) 4590 .nr(16) 4591 .kr(1) 4592 .sr(1) 4593 .m(3) 4594 .n(16) 4595 .k(8) 4596 .cn_stride(19) 4597 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4598 } 4599 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_eq_8_strided_a)4600 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_strided_a) { 4601 TEST_REQUIRES_ARM_NEON; 4602 GemmMicrokernelTester() 4603 .mr(3) 4604 .nr(16) 4605 .kr(1) 4606 .sr(1) 4607 .m(3) 4608 .n(16) 4609 .k(8) 4610 .a_stride(11) 4611 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4612 } 4613 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_eq_8_subtile)4614 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile) { 4615 TEST_REQUIRES_ARM_NEON; 4616 for (uint32_t n = 1; n <= 16; n++) { 4617 for (uint32_t m = 1; m <= 3; m++) { 4618 GemmMicrokernelTester() 4619 .mr(3) 4620 .nr(16) 4621 .kr(1) 4622 .sr(1) 4623 .m(m) 4624 .n(n) 4625 .k(8) 4626 .iterations(1) 4627 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4628 
} 4629 } 4630 } 4631 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_eq_8_subtile_m)4632 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_m) { 4633 TEST_REQUIRES_ARM_NEON; 4634 for (uint32_t m = 1; m <= 3; m++) { 4635 GemmMicrokernelTester() 4636 .mr(3) 4637 .nr(16) 4638 .kr(1) 4639 .sr(1) 4640 .m(m) 4641 .n(16) 4642 .k(8) 4643 .iterations(1) 4644 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4645 } 4646 } 4647 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_eq_8_subtile_n)4648 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_eq_8_subtile_n) { 4649 TEST_REQUIRES_ARM_NEON; 4650 for (uint32_t n = 1; n <= 16; n++) { 4651 GemmMicrokernelTester() 4652 .mr(3) 4653 .nr(16) 4654 .kr(1) 4655 .sr(1) 4656 .m(3) 4657 .n(n) 4658 .k(8) 4659 .iterations(1) 4660 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4661 } 4662 } 4663 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_lt_8)4664 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8) { 4665 TEST_REQUIRES_ARM_NEON; 4666 for (size_t k = 1; k < 8; k++) { 4667 GemmMicrokernelTester() 4668 .mr(3) 4669 .nr(16) 4670 .kr(1) 4671 .sr(1) 4672 .m(3) 4673 .n(16) 4674 .k(k) 4675 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4676 } 4677 } 4678 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,k_lt_8_strided_a)4679 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8_strided_a) { 4680 TEST_REQUIRES_ARM_NEON; 4681 for (size_t k = 1; k < 8; k++) { 4682 GemmMicrokernelTester() 4683 .mr(3) 4684 .nr(16) 4685 .kr(1) 4686 .sr(1) 4687 .m(3) 4688 .n(16) 4689 .k(k) 4690 .a_stride(11) 4691 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4692 } 4693 } 4694 
// Every k in [1, 7], n in [1, 16], m in [1, 3]; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 3x16 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 3x16 tile for every k in [9, 15], a_stride of 19.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every k in [9, 15], n in [1, 16], m in [1, 3]; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 3x16 tile for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 3x16 tile for k = 16, 24, ..., 80, a_stride of 83.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(16).kr(1).sr(1)
      .m(3).n(16).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 16, 24, ..., 80 for every n in [1, 16] and m in [1, 3]; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m == 3, every n in [17, 31], k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m == 3, every n in [17, 31], k in {1, 10, 19, 28, 37}, cn_stride of 19.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m == 3, every n in [17, 31], k in {1, 10, 19, 28, 37}, a_stride of 43.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(16).kr(1).sr(1)
        .m(3).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Every n in [17, 31], k in {1, 10, 19, 28, 37}, m in [1, 3]; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}
TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16) { 4891 TEST_REQUIRES_ARM_NEON; 4892 for (uint32_t n = 32; n <= 48; n += 16) { 4893 for (size_t k = 1; k <= 40; k += 9) { 4894 GemmMicrokernelTester() 4895 .mr(3) 4896 .nr(16) 4897 .kr(1) 4898 .sr(1) 4899 .m(3) 4900 .n(n) 4901 .k(k) 4902 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4903 } 4904 } 4905 } 4906 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,n_div_16_strided_cn)4907 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_strided_cn) { 4908 TEST_REQUIRES_ARM_NEON; 4909 for (uint32_t n = 32; n <= 48; n += 16) { 4910 for (size_t k = 1; k <= 40; k += 9) { 4911 GemmMicrokernelTester() 4912 .mr(3) 4913 .nr(16) 4914 .kr(1) 4915 .sr(1) 4916 .m(3) 4917 .n(n) 4918 .k(k) 4919 .cn_stride(19) 4920 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4921 } 4922 } 4923 } 4924 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,n_div_16_strided_a)4925 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_strided_a) { 4926 TEST_REQUIRES_ARM_NEON; 4927 for (uint32_t n = 32; n <= 48; n += 16) { 4928 for (size_t k = 1; k <= 40; k += 9) { 4929 GemmMicrokernelTester() 4930 .mr(3) 4931 .nr(16) 4932 .kr(1) 4933 .sr(1) 4934 .m(3) 4935 .n(n) 4936 .k(k) 4937 .a_stride(43) 4938 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4939 } 4940 } 4941 } 4942 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,n_div_16_subtile)4943 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, n_div_16_subtile) { 4944 TEST_REQUIRES_ARM_NEON; 4945 for (uint32_t n = 32; n <= 48; n += 16) { 4946 for (size_t k = 1; k <= 40; k += 9) { 4947 for (uint32_t m = 1; m <= 3; m++) { 4948 GemmMicrokernelTester() 4949 .mr(3) 4950 .nr(16) 4951 .kr(1) 4952 .sr(1) 4953 .m(m) 4954 .n(n) 4955 .k(k) 4956 .iterations(1) 
4957 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4958 } 4959 } 4960 } 4961 } 4962 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,strided_cm_subtile)4963 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm_subtile) { 4964 TEST_REQUIRES_ARM_NEON; 4965 for (size_t k = 1; k <= 40; k += 9) { 4966 for (uint32_t n = 1; n <= 16; n++) { 4967 for (uint32_t m = 1; m <= 3; m++) { 4968 GemmMicrokernelTester() 4969 .mr(3) 4970 .nr(16) 4971 .kr(1) 4972 .sr(1) 4973 .m(m) 4974 .n(n) 4975 .k(k) 4976 .cm_stride(19) 4977 .iterations(1) 4978 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4979 } 4980 } 4981 } 4982 } 4983 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,qmin)4984 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmin) { 4985 TEST_REQUIRES_ARM_NEON; 4986 GemmMicrokernelTester() 4987 .mr(3) 4988 .nr(16) 4989 .kr(1) 4990 .sr(1) 4991 .m(3) 4992 .n(16) 4993 .k(8) 4994 .qmin(128) 4995 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4996 } 4997 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,qmax)4998 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, qmax) { 4999 TEST_REQUIRES_ARM_NEON; 5000 GemmMicrokernelTester() 5001 .mr(3) 5002 .nr(16) 5003 .kr(1) 5004 .sr(1) 5005 .m(3) 5006 .n(16) 5007 .k(8) 5008 .qmax(128) 5009 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5010 } 5011 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,strided_cm)5012 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, strided_cm) { 5013 TEST_REQUIRES_ARM_NEON; 5014 GemmMicrokernelTester() 5015 .mr(3) 5016 .nr(16) 5017 .kr(1) 5018 .sr(1) 5019 .m(3) 5020 .n(16) 5021 .k(8) 5022 .cm_stride(19) 5023 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5024 } 5025 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,no_a_zero_point)5026 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, no_a_zero_point) { 5027 TEST_REQUIRES_ARM_NEON; 5028 for (size_t k = 1; k <= 40; k += 9) { 5029 GemmMicrokernelTester() 5030 .mr(3) 5031 .nr(16) 5032 .kr(1) 5033 .sr(1) 5034 .m(3) 5035 .n(16) 5036 .k(k) 5037 .a_zero_point(0) 5038 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5039 } 5040 } 5041 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,no_b_zero_point)5042 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, no_b_zero_point) { 5043 TEST_REQUIRES_ARM_NEON; 5044 for (size_t k = 1; k <= 40; k += 9) { 5045 GemmMicrokernelTester() 5046 .mr(3) 5047 .nr(16) 5048 .kr(1) 5049 .sr(1) 5050 .m(3) 5051 .n(16) 5052 .k(k) 5053 .b_zero_point(0) 5054 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5055 } 5056 } 5057 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE,no_zero_point)5058 TEST(QU8_GEMM_MINMAX_RNDNU_3X16__NEON_MLAL_LANE, no_zero_point) { 5059 TEST_REQUIRES_ARM_NEON; 5060 for (size_t k = 1; k <= 40; k += 9) { 5061 GemmMicrokernelTester() 5062 .mr(3) 5063 .nr(16) 5064 .kr(1) 5065 .sr(1) 5066 .m(3) 5067 .n(16) 5068 .k(k) 5069 .a_zero_point(0) 5070 .b_zero_point(0) 5071 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5072 } 5073 } 5074 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 5075 5076 5077 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_eq_8)5078 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_eq_8) { 5079 TEST_REQUIRES_ARM_NEON; 5080 GemmMicrokernelTester() 5081 .mr(6) 5082 .nr(8) 5083 .kr(1) 5084 .sr(1) 5085 .m(6) 5086 .n(8) 5087 .k(8) 5088 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5089 } 5090 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,strided_cn)5091 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, strided_cn) { 5092 TEST_REQUIRES_ARM_NEON; 5093 GemmMicrokernelTester() 5094 .mr(6) 5095 .nr(8) 5096 .kr(1) 5097 .sr(1) 5098 .m(6) 5099 .n(8) 5100 .k(8) 5101 .cn_stride(11) 5102 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5103 } 5104 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_eq_8_strided_a)5105 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_eq_8_strided_a) { 5106 TEST_REQUIRES_ARM_NEON; 5107 GemmMicrokernelTester() 5108 .mr(6) 5109 .nr(8) 5110 .kr(1) 5111 .sr(1) 5112 .m(6) 5113 .n(8) 5114 .k(8) 5115 .a_stride(11) 5116 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5117 } 5118 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_eq_8_subtile)5119 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_eq_8_subtile) { 5120 TEST_REQUIRES_ARM_NEON; 5121 for (uint32_t n = 1; n <= 8; n++) { 5122 for (uint32_t m = 1; m <= 6; m++) { 5123 GemmMicrokernelTester() 5124 .mr(6) 5125 .nr(8) 5126 .kr(1) 5127 .sr(1) 5128 .m(m) 5129 .n(n) 5130 .k(8) 5131 .iterations(1) 5132 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5133 } 5134 } 5135 } 5136 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_eq_8_subtile_m)5137 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_eq_8_subtile_m) { 5138 TEST_REQUIRES_ARM_NEON; 5139 for (uint32_t m = 1; m <= 6; m++) { 5140 GemmMicrokernelTester() 5141 .mr(6) 5142 .nr(8) 5143 .kr(1) 5144 .sr(1) 5145 .m(m) 5146 .n(8) 5147 .k(8) 5148 .iterations(1) 5149 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5150 } 5151 } 5152 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_eq_8_subtile_n)5153 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_eq_8_subtile_n) { 5154 TEST_REQUIRES_ARM_NEON; 5155 for (uint32_t n = 1; n <= 8; n++) { 5156 GemmMicrokernelTester() 5157 .mr(6) 5158 .nr(8) 5159 .kr(1) 5160 .sr(1) 5161 .m(6) 5162 .n(n) 5163 .k(8) 5164 .iterations(1) 5165 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5166 } 5167 } 5168 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_lt_8)5169 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_lt_8) { 5170 TEST_REQUIRES_ARM_NEON; 5171 for (size_t k = 1; k < 8; k++) { 5172 GemmMicrokernelTester() 5173 .mr(6) 5174 .nr(8) 5175 .kr(1) 5176 .sr(1) 5177 .m(6) 5178 .n(8) 5179 .k(k) 5180 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5181 } 5182 } 5183 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_lt_8_strided_a)5184 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_lt_8_strided_a) { 5185 TEST_REQUIRES_ARM_NEON; 5186 for (size_t k = 1; k < 8; k++) { 5187 GemmMicrokernelTester() 5188 .mr(6) 5189 .nr(8) 5190 .kr(1) 5191 .sr(1) 5192 .m(6) 5193 .n(8) 5194 .k(k) 5195 .a_stride(11) 5196 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5197 } 5198 } 5199 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_lt_8_subtile)5200 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_lt_8_subtile) { 5201 TEST_REQUIRES_ARM_NEON; 5202 for (size_t k = 1; k < 8; k++) { 5203 for (uint32_t n = 1; n <= 8; n++) { 5204 for (uint32_t m = 1; m <= 6; m++) { 5205 GemmMicrokernelTester() 5206 .mr(6) 5207 .nr(8) 5208 .kr(1) 5209 .sr(1) 5210 .m(m) 5211 .n(n) 5212 .k(k) 5213 .iterations(1) 5214 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5215 } 5216 } 5217 } 5218 } 5219 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_gt_8)5220 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_gt_8) { 5221 TEST_REQUIRES_ARM_NEON; 5222 for (size_t k = 9; k < 16; k++) { 5223 GemmMicrokernelTester() 5224 .mr(6) 5225 .nr(8) 5226 .kr(1) 5227 .sr(1) 5228 .m(6) 5229 .n(8) 5230 .k(k) 5231 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5232 } 5233 } 5234 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_gt_8_strided_a)5235 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_gt_8_strided_a) { 5236 TEST_REQUIRES_ARM_NEON; 5237 for (size_t k = 9; k < 16; k++) { 5238 GemmMicrokernelTester() 5239 .mr(6) 5240 .nr(8) 5241 .kr(1) 5242 .sr(1) 5243 .m(6) 5244 .n(8) 5245 .k(k) 5246 .a_stride(19) 5247 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5248 } 5249 } 5250 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_gt_8_subtile)5251 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_gt_8_subtile) { 5252 TEST_REQUIRES_ARM_NEON; 5253 for (size_t k = 9; k < 16; k++) { 5254 for (uint32_t n = 1; n <= 8; n++) { 5255 for (uint32_t m = 1; m <= 6; m++) { 5256 GemmMicrokernelTester() 5257 .mr(6) 5258 .nr(8) 5259 .kr(1) 5260 .sr(1) 5261 .m(m) 5262 .n(n) 5263 .k(k) 5264 .iterations(1) 5265 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5266 } 5267 } 5268 } 5269 } 5270 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_div_8)5271 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_div_8) { 5272 TEST_REQUIRES_ARM_NEON; 5273 for (size_t k = 16; k <= 80; k += 8) { 5274 GemmMicrokernelTester() 5275 .mr(6) 5276 .nr(8) 5277 .kr(1) 5278 .sr(1) 5279 .m(6) 5280 .n(8) 5281 .k(k) 5282 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5283 } 5284 } 5285 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_div_8_strided_a)5286 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_div_8_strided_a) { 5287 TEST_REQUIRES_ARM_NEON; 5288 for (size_t k = 16; k <= 80; k += 8) { 5289 GemmMicrokernelTester() 5290 .mr(6) 5291 .nr(8) 5292 .kr(1) 5293 .sr(1) 5294 .m(6) 5295 .n(8) 5296 .k(k) 5297 .a_stride(83) 5298 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5299 } 5300 } 5301 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,k_div_8_subtile)5302 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, k_div_8_subtile) { 5303 TEST_REQUIRES_ARM_NEON; 5304 for (size_t k = 16; k <= 80; k += 8) { 5305 for (uint32_t n = 1; n <= 8; n++) { 5306 for (uint32_t m = 1; m <= 6; m++) { 5307 GemmMicrokernelTester() 5308 .mr(6) 5309 .nr(8) 5310 .kr(1) 5311 .sr(1) 5312 .m(m) 5313 .n(n) 5314 .k(k) 5315 .iterations(1) 5316 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5317 } 5318 } 5319 } 5320 } 5321 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_gt_8)5322 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_gt_8) { 5323 TEST_REQUIRES_ARM_NEON; 5324 for (uint32_t n = 9; n < 16; n++) { 5325 for (size_t k = 1; k <= 40; k += 9) { 5326 GemmMicrokernelTester() 5327 .mr(6) 5328 .nr(8) 5329 .kr(1) 5330 .sr(1) 5331 .m(6) 5332 .n(n) 5333 .k(k) 5334 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5335 } 5336 } 5337 } 5338 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_gt_8_strided_cn)5339 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_gt_8_strided_cn) { 5340 TEST_REQUIRES_ARM_NEON; 5341 for (uint32_t n = 9; n < 16; n++) { 5342 for (size_t k = 1; k <= 40; k += 9) { 5343 GemmMicrokernelTester() 5344 .mr(6) 5345 .nr(8) 5346 .kr(1) 
5347 .sr(1) 5348 .m(6) 5349 .n(n) 5350 .k(k) 5351 .cn_stride(11) 5352 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5353 } 5354 } 5355 } 5356 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_gt_8_strided_a)5357 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_gt_8_strided_a) { 5358 TEST_REQUIRES_ARM_NEON; 5359 for (uint32_t n = 9; n < 16; n++) { 5360 for (size_t k = 1; k <= 40; k += 9) { 5361 GemmMicrokernelTester() 5362 .mr(6) 5363 .nr(8) 5364 .kr(1) 5365 .sr(1) 5366 .m(6) 5367 .n(n) 5368 .k(k) 5369 .a_stride(43) 5370 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5371 } 5372 } 5373 } 5374 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_gt_8_subtile)5375 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_gt_8_subtile) { 5376 TEST_REQUIRES_ARM_NEON; 5377 for (uint32_t n = 9; n < 16; n++) { 5378 for (size_t k = 1; k <= 40; k += 9) { 5379 for (uint32_t m = 1; m <= 6; m++) { 5380 GemmMicrokernelTester() 5381 .mr(6) 5382 .nr(8) 5383 .kr(1) 5384 .sr(1) 5385 .m(m) 5386 .n(n) 5387 .k(k) 5388 .iterations(1) 5389 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5390 } 5391 } 5392 } 5393 } 5394 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_div_8)5395 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_div_8) { 5396 TEST_REQUIRES_ARM_NEON; 5397 for (uint32_t n = 16; n <= 24; n += 8) { 5398 for (size_t k = 1; k <= 40; k += 9) { 5399 GemmMicrokernelTester() 5400 .mr(6) 5401 .nr(8) 5402 .kr(1) 5403 .sr(1) 5404 .m(6) 5405 .n(n) 5406 .k(k) 5407 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5408 } 5409 } 5410 } 5411 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_div_8_strided_cn)5412 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, 
n_div_8_strided_cn) { 5413 TEST_REQUIRES_ARM_NEON; 5414 for (uint32_t n = 16; n <= 24; n += 8) { 5415 for (size_t k = 1; k <= 40; k += 9) { 5416 GemmMicrokernelTester() 5417 .mr(6) 5418 .nr(8) 5419 .kr(1) 5420 .sr(1) 5421 .m(6) 5422 .n(n) 5423 .k(k) 5424 .cn_stride(11) 5425 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5426 } 5427 } 5428 } 5429 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_div_8_strided_a)5430 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_div_8_strided_a) { 5431 TEST_REQUIRES_ARM_NEON; 5432 for (uint32_t n = 16; n <= 24; n += 8) { 5433 for (size_t k = 1; k <= 40; k += 9) { 5434 GemmMicrokernelTester() 5435 .mr(6) 5436 .nr(8) 5437 .kr(1) 5438 .sr(1) 5439 .m(6) 5440 .n(n) 5441 .k(k) 5442 .a_stride(43) 5443 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5444 } 5445 } 5446 } 5447 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,n_div_8_subtile)5448 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, n_div_8_subtile) { 5449 TEST_REQUIRES_ARM_NEON; 5450 for (uint32_t n = 16; n <= 24; n += 8) { 5451 for (size_t k = 1; k <= 40; k += 9) { 5452 for (uint32_t m = 1; m <= 6; m++) { 5453 GemmMicrokernelTester() 5454 .mr(6) 5455 .nr(8) 5456 .kr(1) 5457 .sr(1) 5458 .m(m) 5459 .n(n) 5460 .k(k) 5461 .iterations(1) 5462 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5463 } 5464 } 5465 } 5466 } 5467 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,strided_cm_subtile)5468 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, strided_cm_subtile) { 5469 TEST_REQUIRES_ARM_NEON; 5470 for (size_t k = 1; k <= 40; k += 9) { 5471 for (uint32_t n = 1; n <= 8; n++) { 5472 for (uint32_t m = 1; m <= 6; m++) { 5473 GemmMicrokernelTester() 5474 .mr(6) 5475 .nr(8) 5476 .kr(1) 5477 .sr(1) 5478 .m(m) 5479 .n(n) 5480 .k(k) 5481 
.cm_stride(11) 5482 .iterations(1) 5483 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5484 } 5485 } 5486 } 5487 } 5488 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,qmin)5489 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, qmin) { 5490 TEST_REQUIRES_ARM_NEON; 5491 GemmMicrokernelTester() 5492 .mr(6) 5493 .nr(8) 5494 .kr(1) 5495 .sr(1) 5496 .m(6) 5497 .n(8) 5498 .k(8) 5499 .qmin(128) 5500 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5501 } 5502 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,qmax)5503 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, qmax) { 5504 TEST_REQUIRES_ARM_NEON; 5505 GemmMicrokernelTester() 5506 .mr(6) 5507 .nr(8) 5508 .kr(1) 5509 .sr(1) 5510 .m(6) 5511 .n(8) 5512 .k(8) 5513 .qmax(128) 5514 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5515 } 5516 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,strided_cm)5517 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, strided_cm) { 5518 TEST_REQUIRES_ARM_NEON; 5519 GemmMicrokernelTester() 5520 .mr(6) 5521 .nr(8) 5522 .kr(1) 5523 .sr(1) 5524 .m(6) 5525 .n(8) 5526 .k(8) 5527 .cm_stride(11) 5528 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5529 } 5530 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,no_a_zero_point)5531 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, no_a_zero_point) { 5532 TEST_REQUIRES_ARM_NEON; 5533 for (size_t k = 1; k <= 40; k += 9) { 5534 GemmMicrokernelTester() 5535 .mr(6) 5536 .nr(8) 5537 .kr(1) 5538 .sr(1) 5539 .m(6) 5540 .n(8) 5541 .k(k) 5542 .a_zero_point(0) 5543 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5544 } 5545 } 5546 
TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,no_b_zero_point)5547 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, no_b_zero_point) { 5548 TEST_REQUIRES_ARM_NEON; 5549 for (size_t k = 1; k <= 40; k += 9) { 5550 GemmMicrokernelTester() 5551 .mr(6) 5552 .nr(8) 5553 .kr(1) 5554 .sr(1) 5555 .m(6) 5556 .n(8) 5557 .k(k) 5558 .b_zero_point(0) 5559 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5560 } 5561 } 5562 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE,no_zero_point)5563 TEST(QU8_GEMM_MINMAX_RNDNU_6X8__NEON_MLAL_LANE, no_zero_point) { 5564 TEST_REQUIRES_ARM_NEON; 5565 for (size_t k = 1; k <= 40; k += 9) { 5566 GemmMicrokernelTester() 5567 .mr(6) 5568 .nr(8) 5569 .kr(1) 5570 .sr(1) 5571 .m(6) 5572 .n(8) 5573 .k(k) 5574 .a_zero_point(0) 5575 .b_zero_point(0) 5576 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5577 } 5578 } 5579 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 5580 5581 5582 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_eq_16)5583 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_eq_16) { 5584 TEST_REQUIRES_ARM_NEON_DOT; 5585 GemmMicrokernelTester() 5586 .mr(4) 5587 .nr(8) 5588 .kr(4) 5589 .sr(1) 5590 .m(4) 5591 .n(8) 5592 .k(16) 5593 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5594 } 5595 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,strided_cn)5596 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, strided_cn) { 5597 TEST_REQUIRES_ARM_NEON_DOT; 5598 GemmMicrokernelTester() 5599 .mr(4) 5600 .nr(8) 5601 .kr(4) 5602 .sr(1) 5603 .m(4) 5604 .n(8) 5605 .k(16) 5606 .cn_stride(11) 5607 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 5608 } 5609 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_eq_16_strided_a)5610 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) { 5611 TEST_REQUIRES_ARM_NEON_DOT; 5612 GemmMicrokernelTester() 5613 .mr(4) 5614 .nr(8) 5615 .kr(4) 5616 .sr(1) 5617 .m(4) 5618 .n(8) 5619 .k(16) 5620 .a_stride(19) 5621 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5622 } 5623 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_eq_16_subtile)5624 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) { 5625 TEST_REQUIRES_ARM_NEON_DOT; 5626 for (uint32_t n = 1; n <= 8; n++) { 5627 for (uint32_t m = 1; m <= 4; m++) { 5628 GemmMicrokernelTester() 5629 .mr(4) 5630 .nr(8) 5631 .kr(4) 5632 .sr(1) 5633 .m(m) 5634 .n(n) 5635 .k(16) 5636 .iterations(1) 5637 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5638 } 5639 } 5640 } 5641 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_eq_16_subtile_m)5642 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) { 5643 TEST_REQUIRES_ARM_NEON_DOT; 5644 for (uint32_t m = 1; m <= 4; m++) { 5645 GemmMicrokernelTester() 5646 .mr(4) 5647 .nr(8) 5648 .kr(4) 5649 .sr(1) 5650 .m(m) 5651 .n(8) 5652 .k(16) 5653 .iterations(1) 5654 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5655 } 5656 } 5657 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_eq_16_subtile_n)5658 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) { 5659 TEST_REQUIRES_ARM_NEON_DOT; 5660 for (uint32_t n = 1; n <= 8; n++) { 5661 GemmMicrokernelTester() 5662 .mr(4) 5663 .nr(8) 5664 .kr(4) 5665 .sr(1) 5666 .m(4) 5667 .n(n) 5668 .k(16) 5669 .iterations(1) 5670 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5671 } 5672 } 5673 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_lt_16)5674 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_lt_16) { 5675 TEST_REQUIRES_ARM_NEON_DOT; 5676 for (size_t k = 1; k < 16; k++) { 5677 GemmMicrokernelTester() 5678 .mr(4) 5679 .nr(8) 5680 .kr(4) 5681 .sr(1) 5682 .m(4) 5683 .n(8) 5684 .k(k) 5685 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5686 } 5687 } 5688 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_lt_16_strided_a)5689 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) { 5690 TEST_REQUIRES_ARM_NEON_DOT; 5691 for (size_t k = 1; k < 16; k++) { 5692 GemmMicrokernelTester() 5693 .mr(4) 5694 .nr(8) 5695 .kr(4) 5696 .sr(1) 5697 .m(4) 5698 .n(8) 5699 .k(k) 5700 .a_stride(19) 5701 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5702 } 5703 } 5704 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_lt_16_subtile)5705 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) { 5706 TEST_REQUIRES_ARM_NEON_DOT; 5707 for (size_t k = 1; k < 16; k++) { 5708 for (uint32_t n = 1; n <= 8; n++) { 5709 for (uint32_t m = 1; m <= 4; m++) { 5710 GemmMicrokernelTester() 5711 .mr(4) 5712 .nr(8) 5713 .kr(4) 5714 .sr(1) 5715 .m(m) 5716 .n(n) 5717 .k(k) 5718 .iterations(1) 5719 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5720 } 5721 } 5722 } 5723 } 5724 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_gt_16)5725 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_gt_16) { 5726 TEST_REQUIRES_ARM_NEON_DOT; 5727 for (size_t k = 17; k < 32; k++) { 5728 
GemmMicrokernelTester() 5729 .mr(4) 5730 .nr(8) 5731 .kr(4) 5732 .sr(1) 5733 .m(4) 5734 .n(8) 5735 .k(k) 5736 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5737 } 5738 } 5739 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_gt_16_strided_a)5740 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) { 5741 TEST_REQUIRES_ARM_NEON_DOT; 5742 for (size_t k = 17; k < 32; k++) { 5743 GemmMicrokernelTester() 5744 .mr(4) 5745 .nr(8) 5746 .kr(4) 5747 .sr(1) 5748 .m(4) 5749 .n(8) 5750 .k(k) 5751 .a_stride(37) 5752 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5753 } 5754 } 5755 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_gt_16_subtile)5756 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) { 5757 TEST_REQUIRES_ARM_NEON_DOT; 5758 for (size_t k = 17; k < 32; k++) { 5759 for (uint32_t n = 1; n <= 8; n++) { 5760 for (uint32_t m = 1; m <= 4; m++) { 5761 GemmMicrokernelTester() 5762 .mr(4) 5763 .nr(8) 5764 .kr(4) 5765 .sr(1) 5766 .m(m) 5767 .n(n) 5768 .k(k) 5769 .iterations(1) 5770 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5771 } 5772 } 5773 } 5774 } 5775 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_div_16)5776 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_div_16) { 5777 TEST_REQUIRES_ARM_NEON_DOT; 5778 for (size_t k = 32; k <= 160; k += 16) { 5779 GemmMicrokernelTester() 5780 .mr(4) 5781 .nr(8) 5782 .kr(4) 5783 .sr(1) 5784 .m(4) 5785 .n(8) 5786 .k(k) 5787 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5788 } 5789 } 5790 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,k_div_16_strided_a)5791 
// m = 4, n = 8, k takes the multiples of 16 from 32 through 160, with a_stride set to 163.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 32; k <= 160; k += 16) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(4)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(163)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Multiples of 16 from 32 through 160 for k, crossed with n in 1..8 and m in 1..4;
// iterations(1) per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 32; k <= 160; k += 16) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n swept over 9..15 (above nr = 8); k takes 1, 18, 35, 52, 69.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 80; k += 17) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(4)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n and k sweep as n_gt_8, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 80; k += 17) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(4)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n and k sweep as n_gt_8, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 80; k += 17) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(4)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in 9..15 and k in {1, 18, 35, 52, 69}, crossed with m in 1..4; iterations(1) per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 80; k += 17) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n takes 16 and 24 (multiples of nr = 8); k takes 1, 18, 35, 52, 69.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 80; k += 17) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(4)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,n_div_8_strided_cn)5917 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_div_8_strided_cn) { 5918 TEST_REQUIRES_ARM_NEON_DOT; 5919 for (uint32_t n = 16; n <= 24; n += 8) { 5920 for (size_t k = 1; k <= 80; k += 17) { 5921 GemmMicrokernelTester() 5922 .mr(4) 5923 .nr(8) 5924 .kr(4) 5925 .sr(1) 5926 .m(4) 5927 .n(n) 5928 .k(k) 5929 .cn_stride(11) 5930 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5931 } 5932 } 5933 } 5934 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,n_div_8_strided_a)5935 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_div_8_strided_a) { 5936 TEST_REQUIRES_ARM_NEON_DOT; 5937 for (uint32_t n = 16; n <= 24; n += 8) { 5938 for (size_t k = 1; k <= 80; k += 17) { 5939 GemmMicrokernelTester() 5940 .mr(4) 5941 .nr(8) 5942 .kr(4) 5943 .sr(1) 5944 .m(4) 5945 .n(n) 5946 .k(k) 5947 .a_stride(83) 5948 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5949 } 5950 } 5951 } 5952 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,n_div_8_subtile)5953 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, n_div_8_subtile) { 5954 TEST_REQUIRES_ARM_NEON_DOT; 5955 for (uint32_t n = 16; n <= 24; n += 8) { 5956 for (size_t k = 1; k <= 80; k += 17) { 5957 for (uint32_t m = 1; m <= 4; m++) { 5958 GemmMicrokernelTester() 5959 .mr(4) 5960 .nr(8) 5961 .kr(4) 5962 .sr(1) 5963 .m(m) 5964 .n(n) 5965 .k(k) 5966 .iterations(1) 5967 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5968 } 5969 } 5970 } 5971 } 5972 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,strided_cm_subtile)5973 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) { 5974 TEST_REQUIRES_ARM_NEON_DOT; 5975 for (size_t 
k = 1; k <= 80; k += 17) { 5976 for (uint32_t n = 1; n <= 8; n++) { 5977 for (uint32_t m = 1; m <= 4; m++) { 5978 GemmMicrokernelTester() 5979 .mr(4) 5980 .nr(8) 5981 .kr(4) 5982 .sr(1) 5983 .m(m) 5984 .n(n) 5985 .k(k) 5986 .cm_stride(11) 5987 .iterations(1) 5988 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5989 } 5990 } 5991 } 5992 } 5993 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,qmin)5994 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, qmin) { 5995 TEST_REQUIRES_ARM_NEON_DOT; 5996 GemmMicrokernelTester() 5997 .mr(4) 5998 .nr(8) 5999 .kr(4) 6000 .sr(1) 6001 .m(4) 6002 .n(8) 6003 .k(16) 6004 .qmin(128) 6005 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6006 } 6007 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,qmax)6008 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, qmax) { 6009 TEST_REQUIRES_ARM_NEON_DOT; 6010 GemmMicrokernelTester() 6011 .mr(4) 6012 .nr(8) 6013 .kr(4) 6014 .sr(1) 6015 .m(4) 6016 .n(8) 6017 .k(16) 6018 .qmax(128) 6019 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6020 } 6021 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,strided_cm)6022 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, strided_cm) { 6023 TEST_REQUIRES_ARM_NEON_DOT; 6024 GemmMicrokernelTester() 6025 .mr(4) 6026 .nr(8) 6027 .kr(4) 6028 .sr(1) 6029 .m(4) 6030 .n(8) 6031 .k(16) 6032 .cm_stride(11) 6033 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6034 } 6035 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,no_a_zero_point)6036 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, no_a_zero_point) { 6037 TEST_REQUIRES_ARM_NEON_DOT; 6038 for 
(size_t k = 1; k <= 80; k += 17) { 6039 GemmMicrokernelTester() 6040 .mr(4) 6041 .nr(8) 6042 .kr(4) 6043 .sr(1) 6044 .m(4) 6045 .n(8) 6046 .k(k) 6047 .a_zero_point(0) 6048 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6049 } 6050 } 6051 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,no_b_zero_point)6052 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, no_b_zero_point) { 6053 TEST_REQUIRES_ARM_NEON_DOT; 6054 for (size_t k = 1; k <= 80; k += 17) { 6055 GemmMicrokernelTester() 6056 .mr(4) 6057 .nr(8) 6058 .kr(4) 6059 .sr(1) 6060 .m(4) 6061 .n(8) 6062 .k(k) 6063 .b_zero_point(0) 6064 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6065 } 6066 } 6067 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128,no_zero_point)6068 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_LD128, no_zero_point) { 6069 TEST_REQUIRES_ARM_NEON_DOT; 6070 for (size_t k = 1; k <= 80; k += 17) { 6071 GemmMicrokernelTester() 6072 .mr(4) 6073 .nr(8) 6074 .kr(4) 6075 .sr(1) 6076 .m(4) 6077 .n(8) 6078 .k(k) 6079 .a_zero_point(0) 6080 .b_zero_point(0) 6081 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6082 } 6083 } 6084 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 6085 6086 6087 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_16)6088 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) { 6089 TEST_REQUIRES_ARM_NEON_DOT; 6090 GemmMicrokernelTester() 6091 .mr(4) 6092 .nr(16) 6093 .kr(4) 6094 .sr(1) 6095 .m(4) 6096 .n(16) 6097 .k(16) 6098 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 6099 } 6100 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,strided_cn)6101 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) { 6102 TEST_REQUIRES_ARM_NEON_DOT; 6103 GemmMicrokernelTester() 6104 .mr(4) 6105 .nr(16) 6106 .kr(4) 6107 .sr(1) 6108 .m(4) 6109 .n(16) 6110 .k(16) 6111 .cn_stride(19) 6112 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6113 } 6114 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_16_strided_a)6115 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) { 6116 TEST_REQUIRES_ARM_NEON_DOT; 6117 GemmMicrokernelTester() 6118 .mr(4) 6119 .nr(16) 6120 .kr(4) 6121 .sr(1) 6122 .m(4) 6123 .n(16) 6124 .k(16) 6125 .a_stride(19) 6126 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6127 } 6128 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_16_subtile)6129 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) { 6130 TEST_REQUIRES_ARM_NEON_DOT; 6131 for (uint32_t n = 1; n <= 16; n++) { 6132 for (uint32_t m = 1; m <= 4; m++) { 6133 GemmMicrokernelTester() 6134 .mr(4) 6135 .nr(16) 6136 .kr(4) 6137 .sr(1) 6138 .m(m) 6139 .n(n) 6140 .k(16) 6141 .iterations(1) 6142 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6143 } 6144 } 6145 } 6146 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_16_subtile_m)6147 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) { 6148 TEST_REQUIRES_ARM_NEON_DOT; 6149 for (uint32_t m = 1; m <= 4; m++) { 6150 GemmMicrokernelTester() 6151 .mr(4) 6152 .nr(16) 6153 .kr(4) 6154 .sr(1) 6155 .m(m) 6156 .n(16) 6157 .k(16) 6158 .iterations(1) 6159 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6160 } 6161 } 6162 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_16_subtile_n)6163 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) { 6164 TEST_REQUIRES_ARM_NEON_DOT; 6165 for (uint32_t n = 1; n <= 16; n++) { 6166 GemmMicrokernelTester() 6167 .mr(4) 6168 .nr(16) 6169 .kr(4) 6170 .sr(1) 6171 .m(4) 6172 .n(n) 6173 .k(16) 6174 .iterations(1) 6175 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6176 } 6177 } 6178 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_16)6179 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) { 6180 TEST_REQUIRES_ARM_NEON_DOT; 6181 for (size_t k = 1; k < 16; k++) { 6182 GemmMicrokernelTester() 6183 .mr(4) 6184 .nr(16) 6185 .kr(4) 6186 .sr(1) 6187 .m(4) 6188 .n(16) 6189 .k(k) 6190 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6191 } 6192 } 6193 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_16_strided_a)6194 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) { 6195 TEST_REQUIRES_ARM_NEON_DOT; 6196 for (size_t k = 1; k < 16; k++) { 6197 GemmMicrokernelTester() 6198 .mr(4) 6199 .nr(16) 6200 .kr(4) 6201 .sr(1) 6202 .m(4) 6203 .n(16) 6204 .k(k) 6205 .a_stride(19) 6206 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6207 } 6208 } 6209 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_16_subtile)6210 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) { 6211 TEST_REQUIRES_ARM_NEON_DOT; 6212 for (size_t k = 1; k < 16; k++) { 
6213 for (uint32_t n = 1; n <= 16; n++) { 6214 for (uint32_t m = 1; m <= 4; m++) { 6215 GemmMicrokernelTester() 6216 .mr(4) 6217 .nr(16) 6218 .kr(4) 6219 .sr(1) 6220 .m(m) 6221 .n(n) 6222 .k(k) 6223 .iterations(1) 6224 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6225 } 6226 } 6227 } 6228 } 6229 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_16)6230 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) { 6231 TEST_REQUIRES_ARM_NEON_DOT; 6232 for (size_t k = 17; k < 32; k++) { 6233 GemmMicrokernelTester() 6234 .mr(4) 6235 .nr(16) 6236 .kr(4) 6237 .sr(1) 6238 .m(4) 6239 .n(16) 6240 .k(k) 6241 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6242 } 6243 } 6244 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_16_strided_a)6245 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) { 6246 TEST_REQUIRES_ARM_NEON_DOT; 6247 for (size_t k = 17; k < 32; k++) { 6248 GemmMicrokernelTester() 6249 .mr(4) 6250 .nr(16) 6251 .kr(4) 6252 .sr(1) 6253 .m(4) 6254 .n(16) 6255 .k(k) 6256 .a_stride(37) 6257 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6258 } 6259 } 6260 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_16_subtile)6261 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) { 6262 TEST_REQUIRES_ARM_NEON_DOT; 6263 for (size_t k = 17; k < 32; k++) { 6264 for (uint32_t n = 1; n <= 16; n++) { 6265 for (uint32_t m = 1; m <= 4; m++) { 6266 GemmMicrokernelTester() 6267 .mr(4) 6268 .nr(16) 6269 .kr(4) 6270 .sr(1) 6271 .m(m) 6272 .n(n) 6273 .k(k) 6274 .iterations(1) 6275 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6276 } 6277 } 6278 } 6279 } 6280 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_div_16)6281 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) { 6282 TEST_REQUIRES_ARM_NEON_DOT; 6283 for (size_t k = 32; k <= 160; k += 16) { 6284 GemmMicrokernelTester() 6285 .mr(4) 6286 .nr(16) 6287 .kr(4) 6288 .sr(1) 6289 .m(4) 6290 .n(16) 6291 .k(k) 6292 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6293 } 6294 } 6295 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_div_16_strided_a)6296 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) { 6297 TEST_REQUIRES_ARM_NEON_DOT; 6298 for (size_t k = 32; k <= 160; k += 16) { 6299 GemmMicrokernelTester() 6300 .mr(4) 6301 .nr(16) 6302 .kr(4) 6303 .sr(1) 6304 .m(4) 6305 .n(16) 6306 .k(k) 6307 .a_stride(163) 6308 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6309 } 6310 } 6311 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,k_div_16_subtile)6312 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) { 6313 TEST_REQUIRES_ARM_NEON_DOT; 6314 for (size_t k = 32; k <= 160; k += 16) { 6315 for (uint32_t n = 1; n <= 16; n++) { 6316 for (uint32_t m = 1; m <= 4; m++) { 6317 GemmMicrokernelTester() 6318 .mr(4) 6319 .nr(16) 6320 .kr(4) 6321 .sr(1) 6322 .m(m) 6323 .n(n) 6324 .k(k) 6325 .iterations(1) 6326 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6327 } 6328 } 6329 } 6330 } 6331 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_16)6332 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) { 6333 TEST_REQUIRES_ARM_NEON_DOT; 
6334 for (uint32_t n = 17; n < 32; n++) { 6335 for (size_t k = 1; k <= 80; k += 17) { 6336 GemmMicrokernelTester() 6337 .mr(4) 6338 .nr(16) 6339 .kr(4) 6340 .sr(1) 6341 .m(4) 6342 .n(n) 6343 .k(k) 6344 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6345 } 6346 } 6347 } 6348 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_16_strided_cn)6349 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) { 6350 TEST_REQUIRES_ARM_NEON_DOT; 6351 for (uint32_t n = 17; n < 32; n++) { 6352 for (size_t k = 1; k <= 80; k += 17) { 6353 GemmMicrokernelTester() 6354 .mr(4) 6355 .nr(16) 6356 .kr(4) 6357 .sr(1) 6358 .m(4) 6359 .n(n) 6360 .k(k) 6361 .cn_stride(19) 6362 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6363 } 6364 } 6365 } 6366 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_16_strided_a)6367 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) { 6368 TEST_REQUIRES_ARM_NEON_DOT; 6369 for (uint32_t n = 17; n < 32; n++) { 6370 for (size_t k = 1; k <= 80; k += 17) { 6371 GemmMicrokernelTester() 6372 .mr(4) 6373 .nr(16) 6374 .kr(4) 6375 .sr(1) 6376 .m(4) 6377 .n(n) 6378 .k(k) 6379 .a_stride(83) 6380 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6381 } 6382 } 6383 } 6384 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_16_subtile)6385 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) { 6386 TEST_REQUIRES_ARM_NEON_DOT; 6387 for (uint32_t n = 17; n < 32; n++) { 6388 for (size_t k = 1; k <= 80; k += 17) { 6389 for (uint32_t m = 1; m <= 4; m++) { 6390 GemmMicrokernelTester() 6391 .mr(4) 6392 .nr(16) 6393 .kr(4) 6394 .sr(1) 6395 .m(m) 6396 .n(n) 6397 
.k(k) 6398 .iterations(1) 6399 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6400 } 6401 } 6402 } 6403 } 6404 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_div_16)6405 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) { 6406 TEST_REQUIRES_ARM_NEON_DOT; 6407 for (uint32_t n = 32; n <= 48; n += 16) { 6408 for (size_t k = 1; k <= 80; k += 17) { 6409 GemmMicrokernelTester() 6410 .mr(4) 6411 .nr(16) 6412 .kr(4) 6413 .sr(1) 6414 .m(4) 6415 .n(n) 6416 .k(k) 6417 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6418 } 6419 } 6420 } 6421 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_div_16_strided_cn)6422 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) { 6423 TEST_REQUIRES_ARM_NEON_DOT; 6424 for (uint32_t n = 32; n <= 48; n += 16) { 6425 for (size_t k = 1; k <= 80; k += 17) { 6426 GemmMicrokernelTester() 6427 .mr(4) 6428 .nr(16) 6429 .kr(4) 6430 .sr(1) 6431 .m(4) 6432 .n(n) 6433 .k(k) 6434 .cn_stride(19) 6435 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6436 } 6437 } 6438 } 6439 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_div_16_strided_a)6440 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) { 6441 TEST_REQUIRES_ARM_NEON_DOT; 6442 for (uint32_t n = 32; n <= 48; n += 16) { 6443 for (size_t k = 1; k <= 80; k += 17) { 6444 GemmMicrokernelTester() 6445 .mr(4) 6446 .nr(16) 6447 .kr(4) 6448 .sr(1) 6449 .m(4) 6450 .n(n) 6451 .k(k) 6452 .a_stride(83) 6453 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6454 } 6455 } 6456 } 6457 
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,n_div_16_subtile)6458 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) { 6459 TEST_REQUIRES_ARM_NEON_DOT; 6460 for (uint32_t n = 32; n <= 48; n += 16) { 6461 for (size_t k = 1; k <= 80; k += 17) { 6462 for (uint32_t m = 1; m <= 4; m++) { 6463 GemmMicrokernelTester() 6464 .mr(4) 6465 .nr(16) 6466 .kr(4) 6467 .sr(1) 6468 .m(m) 6469 .n(n) 6470 .k(k) 6471 .iterations(1) 6472 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6473 } 6474 } 6475 } 6476 } 6477 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,strided_cm_subtile)6478 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) { 6479 TEST_REQUIRES_ARM_NEON_DOT; 6480 for (size_t k = 1; k <= 80; k += 17) { 6481 for (uint32_t n = 1; n <= 16; n++) { 6482 for (uint32_t m = 1; m <= 4; m++) { 6483 GemmMicrokernelTester() 6484 .mr(4) 6485 .nr(16) 6486 .kr(4) 6487 .sr(1) 6488 .m(m) 6489 .n(n) 6490 .k(k) 6491 .cm_stride(19) 6492 .iterations(1) 6493 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6494 } 6495 } 6496 } 6497 } 6498 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,qmin)6499 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) { 6500 TEST_REQUIRES_ARM_NEON_DOT; 6501 GemmMicrokernelTester() 6502 .mr(4) 6503 .nr(16) 6504 .kr(4) 6505 .sr(1) 6506 .m(4) 6507 .n(16) 6508 .k(16) 6509 .qmin(128) 6510 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6511 } 6512 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,qmax)6513 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) { 6514 TEST_REQUIRES_ARM_NEON_DOT; 6515 GemmMicrokernelTester() 6516 
.mr(4) 6517 .nr(16) 6518 .kr(4) 6519 .sr(1) 6520 .m(4) 6521 .n(16) 6522 .k(16) 6523 .qmax(128) 6524 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6525 } 6526 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,strided_cm)6527 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) { 6528 TEST_REQUIRES_ARM_NEON_DOT; 6529 GemmMicrokernelTester() 6530 .mr(4) 6531 .nr(16) 6532 .kr(4) 6533 .sr(1) 6534 .m(4) 6535 .n(16) 6536 .k(16) 6537 .cm_stride(19) 6538 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6539 } 6540 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,no_a_zero_point)6541 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_a_zero_point) { 6542 TEST_REQUIRES_ARM_NEON_DOT; 6543 for (size_t k = 1; k <= 80; k += 17) { 6544 GemmMicrokernelTester() 6545 .mr(4) 6546 .nr(16) 6547 .kr(4) 6548 .sr(1) 6549 .m(4) 6550 .n(16) 6551 .k(k) 6552 .a_zero_point(0) 6553 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6554 } 6555 } 6556 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,no_b_zero_point)6557 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_b_zero_point) { 6558 TEST_REQUIRES_ARM_NEON_DOT; 6559 for (size_t k = 1; k <= 80; k += 17) { 6560 GemmMicrokernelTester() 6561 .mr(4) 6562 .nr(16) 6563 .kr(4) 6564 .sr(1) 6565 .m(4) 6566 .n(16) 6567 .k(k) 6568 .b_zero_point(0) 6569 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6570 } 6571 } 6572 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55,no_zero_point)6573 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_zero_point) { 
6574 TEST_REQUIRES_ARM_NEON_DOT; 6575 for (size_t k = 1; k <= 80; k += 17) { 6576 GemmMicrokernelTester() 6577 .mr(4) 6578 .nr(16) 6579 .kr(4) 6580 .sr(1) 6581 .m(4) 6582 .n(16) 6583 .k(k) 6584 .a_zero_point(0) 6585 .b_zero_point(0) 6586 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6587 } 6588 } 6589 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 6590 6591 6592 #if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128,k_eq_16)6593 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) { 6594 TEST_REQUIRES_ARM_NEON_DOT; 6595 GemmMicrokernelTester() 6596 .mr(4) 6597 .nr(16) 6598 .kr(4) 6599 .sr(1) 6600 .m(4) 6601 .n(16) 6602 .k(16) 6603 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6604 } 6605 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128,strided_cn)6606 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) { 6607 TEST_REQUIRES_ARM_NEON_DOT; 6608 GemmMicrokernelTester() 6609 .mr(4) 6610 .nr(16) 6611 .kr(4) 6612 .sr(1) 6613 .m(4) 6614 .n(16) 6615 .k(16) 6616 .cn_stride(19) 6617 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6618 } 6619 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128,k_eq_16_strided_a)6620 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) { 6621 TEST_REQUIRES_ARM_NEON_DOT; 6622 GemmMicrokernelTester() 6623 .mr(4) 6624 .nr(16) 6625 .kr(4) 6626 .sr(1) 6627 .m(4) 6628 .n(16) 6629 .k(16) 6630 .a_stride(19) 6631 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6632 } 6633 
// k = 16; every n in 1..16 and m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 1; cols <= 16; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(rows).n(cols).k(16)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// k = 16, n = 16; every m in 1..4.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t rows = 1; rows <= 4; ++rows) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(rows).n(16).k(16)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// k = 16, m = 4; every n in 1..16.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 1; cols <= 16; ++cols) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(cols).k(16)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 1..15 at m = 4, n = 16.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 1..15 at m = 4, n = 16 with a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 1..15, n in 1..16 and m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Every k in 17..31 at m = 4, n = 16.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 17; depth < 32; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 17..31 at m = 4, n = 16 with a_stride = 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 17; depth < 32; ++depth) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .a_stride(37)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 17..31, n in 1..16 and m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 17; depth < 32; ++depth) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k = 32, 48, ..., 160 at m = 4, n = 16.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 32; depth <= 160; depth += 16) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// k = 32, 48, ..., 160 at m = 4, n = 16 with a_stride = 163.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 32; depth <= 160; depth += 16) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .a_stride(163)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
// k = 32, 48, ..., 160; every n in 1..16 and m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 32; depth <= 160; depth += 16) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Every n in 17..31; k = 1, 18, 35, 52, 69; m = 4.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// Every n in 17..31; k = 1, 18, 35, 52, 69; m = 4; cn_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// Every n in 17..31; k = 1, 18, 35, 52, 69; m = 4; a_stride = 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(83)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// Every n in 17..31; k = 1, 18, 35, 52, 69; every m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n = 32, 48; k = 1, 18, 35, 52, 69; m = 4.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 32, 48; k = 1, 18, 35, 52, 69; m = 4; cn_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 32, 48; k = 1, 18, 35, 52, 69; m = 4; a_stride = 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(cols).k(depth)
          .a_stride(83)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 32, 48; k = 1, 18, 35, 52, 69; every m in 1..4; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 80; depth += 17) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k = 1, 18, 35, 52, 69; every n in 1..16 and m in 1..4, with cm_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth <= 80; depth += 17) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .cm_stride(19)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16, k = 16 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(16)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 16 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(16)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 16 with cm_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(16)
      .cm_stride(19)
      .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// k = 1, 18, 35, 52, 69 with a_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth <= 80; depth += 17) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(depth)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128,no_b_zero_point)7062 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, no_b_zero_point) { 7063 TEST_REQUIRES_ARM_NEON_DOT; 7064 for (size_t k = 1; k <= 80; k += 17) { 7065 GemmMicrokernelTester() 7066 .mr(4) 7067 .nr(16) 7068 .kr(4) 7069 .sr(1) 7070 .m(4) 7071 .n(16) 7072 .k(k) 7073 .b_zero_point(0) 7074 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7075 } 7076 } 7077 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128,no_zero_point)7078 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_LD128, no_zero_point) { 7079 TEST_REQUIRES_ARM_NEON_DOT; 7080 for (size_t k = 1; k <= 80; k += 17) { 7081 GemmMicrokernelTester() 7082 .mr(4) 7083 .nr(16) 7084 .kr(4) 7085 .sr(1) 7086 .m(4) 7087 .n(16) 7088 .k(k) 7089 .a_zero_point(0) 7090 .b_zero_point(0) 7091 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7092 } 7093 } 7094 #endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 7095 7096 7097 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_eq_8)7098 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_eq_8) { 7099 TEST_REQUIRES_ARM_NEON_DOT; 7100 GemmMicrokernelTester() 7101 .mr(1) 7102 .nr(16) 7103 .kr(4) 7104 .sr(1) 7105 .m(1) 7106 .n(16) 7107 .k(8) 7108 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7109 } 7110 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,strided_cn)7111 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, strided_cn) { 7112 TEST_REQUIRES_ARM_NEON_DOT; 7113 GemmMicrokernelTester() 7114 .mr(1) 7115 .nr(16) 7116 .kr(4) 7117 .sr(1) 7118 .m(1) 7119 .n(16) 7120 .k(8) 7121 .cn_stride(19) 7122 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7123 } 7124 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_eq_8_strided_a)7125 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_eq_8_strided_a) { 7126 TEST_REQUIRES_ARM_NEON_DOT; 7127 GemmMicrokernelTester() 7128 .mr(1) 7129 .nr(16) 7130 .kr(4) 7131 .sr(1) 7132 .m(1) 7133 .n(16) 7134 .k(8) 7135 .a_stride(11) 7136 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7137 } 7138 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_eq_8_subtile)7139 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_eq_8_subtile) { 7140 TEST_REQUIRES_ARM_NEON_DOT; 7141 for (uint32_t n = 1; n <= 16; n++) { 7142 for (uint32_t m = 1; m <= 1; m++) { 7143 GemmMicrokernelTester() 7144 .mr(1) 7145 .nr(16) 7146 .kr(4) 7147 .sr(1) 7148 .m(m) 7149 .n(n) 7150 .k(8) 7151 .iterations(1) 7152 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7153 } 7154 } 7155 } 7156 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_eq_8_subtile_m)7157 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_eq_8_subtile_m) { 7158 TEST_REQUIRES_ARM_NEON_DOT; 7159 for (uint32_t m = 1; m <= 1; m++) { 7160 GemmMicrokernelTester() 7161 .mr(1) 7162 .nr(16) 7163 .kr(4) 7164 .sr(1) 7165 .m(m) 7166 .n(16) 7167 .k(8) 7168 .iterations(1) 7169 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7170 } 7171 } 7172 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_eq_8_subtile_n)7173 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_eq_8_subtile_n) { 7174 TEST_REQUIRES_ARM_NEON_DOT; 7175 for (uint32_t n = 1; n <= 16; n++) { 7176 GemmMicrokernelTester() 7177 .mr(1) 7178 .nr(16) 7179 .kr(4) 7180 .sr(1) 7181 .m(1) 7182 .n(n) 7183 .k(8) 7184 .iterations(1) 7185 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 7186 } 7187 } 7188 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_lt_8)7189 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_lt_8) { 7190 TEST_REQUIRES_ARM_NEON_DOT; 7191 for (size_t k = 1; k < 8; k++) { 7192 GemmMicrokernelTester() 7193 .mr(1) 7194 .nr(16) 7195 .kr(4) 7196 .sr(1) 7197 .m(1) 7198 .n(16) 7199 .k(k) 7200 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7201 } 7202 } 7203 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_lt_8_strided_a)7204 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_lt_8_strided_a) { 7205 TEST_REQUIRES_ARM_NEON_DOT; 7206 for (size_t k = 1; k < 8; k++) { 7207 GemmMicrokernelTester() 7208 .mr(1) 7209 .nr(16) 7210 .kr(4) 7211 .sr(1) 7212 .m(1) 7213 .n(16) 7214 .k(k) 7215 .a_stride(11) 7216 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7217 } 7218 } 7219 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_lt_8_subtile)7220 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_lt_8_subtile) { 7221 TEST_REQUIRES_ARM_NEON_DOT; 7222 for (size_t k = 1; k < 8; k++) { 7223 for (uint32_t n = 1; n <= 16; n++) { 7224 for (uint32_t m = 1; m <= 1; m++) { 7225 GemmMicrokernelTester() 7226 .mr(1) 7227 .nr(16) 7228 .kr(4) 7229 .sr(1) 7230 .m(m) 7231 .n(n) 7232 .k(k) 7233 .iterations(1) 7234 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7235 } 7236 } 7237 } 7238 } 7239 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_gt_8)7240 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_gt_8) { 7241 TEST_REQUIRES_ARM_NEON_DOT; 7242 for (size_t k = 9; k < 16; k++) { 7243 GemmMicrokernelTester() 7244 .mr(1) 7245 .nr(16) 7246 .kr(4) 7247 .sr(1) 7248 .m(1) 7249 .n(16) 7250 .k(k) 7251 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7252 } 7253 } 7254 
// Every k in 9..15 at m = 1, n = 16 with a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(4).sr(1)
        .m(1).n(16).k(depth)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Every k in 9..15 and n in 1..16; the m loop covers only m = 1.
TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k = 16, 24, ..., 80 at m = 1, n = 16.
TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(4).sr(1)
        .m(1).n(16).k(depth)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// k = 16, 24, ..., 80 at m = 1, n = 16 with a_stride = 83.
TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(4).sr(1)
        .m(1).n(16).k(depth)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
7321 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,k_div_8_subtile)7322 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, k_div_8_subtile) { 7323 TEST_REQUIRES_ARM_NEON_DOT; 7324 for (size_t k = 16; k <= 80; k += 8) { 7325 for (uint32_t n = 1; n <= 16; n++) { 7326 for (uint32_t m = 1; m <= 1; m++) { 7327 GemmMicrokernelTester() 7328 .mr(1) 7329 .nr(16) 7330 .kr(4) 7331 .sr(1) 7332 .m(m) 7333 .n(n) 7334 .k(k) 7335 .iterations(1) 7336 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7337 } 7338 } 7339 } 7340 } 7341 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_gt_16)7342 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_gt_16) { 7343 TEST_REQUIRES_ARM_NEON_DOT; 7344 for (uint32_t n = 17; n < 32; n++) { 7345 for (size_t k = 1; k <= 40; k += 9) { 7346 GemmMicrokernelTester() 7347 .mr(1) 7348 .nr(16) 7349 .kr(4) 7350 .sr(1) 7351 .m(1) 7352 .n(n) 7353 .k(k) 7354 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7355 } 7356 } 7357 } 7358 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_gt_16_strided_cn)7359 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_gt_16_strided_cn) { 7360 TEST_REQUIRES_ARM_NEON_DOT; 7361 for (uint32_t n = 17; n < 32; n++) { 7362 for (size_t k = 1; k <= 40; k += 9) { 7363 GemmMicrokernelTester() 7364 .mr(1) 7365 .nr(16) 7366 .kr(4) 7367 .sr(1) 7368 .m(1) 7369 .n(n) 7370 .k(k) 7371 .cn_stride(19) 7372 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7373 } 7374 } 7375 } 7376 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_gt_16_strided_a)7377 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_gt_16_strided_a) { 7378 TEST_REQUIRES_ARM_NEON_DOT; 7379 for (uint32_t n = 17; n < 32; n++) { 7380 for (size_t k = 1; k <= 40; k += 9) { 7381 GemmMicrokernelTester() 7382 .mr(1) 7383 .nr(16) 7384 .kr(4) 7385 .sr(1) 7386 .m(1) 7387 .n(n) 7388 .k(k) 7389 
.a_stride(43) 7390 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7391 } 7392 } 7393 } 7394 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_gt_16_subtile)7395 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_gt_16_subtile) { 7396 TEST_REQUIRES_ARM_NEON_DOT; 7397 for (uint32_t n = 17; n < 32; n++) { 7398 for (size_t k = 1; k <= 40; k += 9) { 7399 for (uint32_t m = 1; m <= 1; m++) { 7400 GemmMicrokernelTester() 7401 .mr(1) 7402 .nr(16) 7403 .kr(4) 7404 .sr(1) 7405 .m(m) 7406 .n(n) 7407 .k(k) 7408 .iterations(1) 7409 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7410 } 7411 } 7412 } 7413 } 7414 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_div_16)7415 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_div_16) { 7416 TEST_REQUIRES_ARM_NEON_DOT; 7417 for (uint32_t n = 32; n <= 48; n += 16) { 7418 for (size_t k = 1; k <= 40; k += 9) { 7419 GemmMicrokernelTester() 7420 .mr(1) 7421 .nr(16) 7422 .kr(4) 7423 .sr(1) 7424 .m(1) 7425 .n(n) 7426 .k(k) 7427 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7428 } 7429 } 7430 } 7431 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_div_16_strided_cn)7432 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_div_16_strided_cn) { 7433 TEST_REQUIRES_ARM_NEON_DOT; 7434 for (uint32_t n = 32; n <= 48; n += 16) { 7435 for (size_t k = 1; k <= 40; k += 9) { 7436 GemmMicrokernelTester() 7437 .mr(1) 7438 .nr(16) 7439 .kr(4) 7440 .sr(1) 7441 .m(1) 7442 .n(n) 7443 .k(k) 7444 .cn_stride(19) 7445 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7446 } 7447 } 7448 } 7449 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_div_16_strided_a)7450 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_div_16_strided_a) { 7451 TEST_REQUIRES_ARM_NEON_DOT; 7452 for (uint32_t n = 
32; n <= 48; n += 16) { 7453 for (size_t k = 1; k <= 40; k += 9) { 7454 GemmMicrokernelTester() 7455 .mr(1) 7456 .nr(16) 7457 .kr(4) 7458 .sr(1) 7459 .m(1) 7460 .n(n) 7461 .k(k) 7462 .a_stride(43) 7463 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7464 } 7465 } 7466 } 7467 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,n_div_16_subtile)7468 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, n_div_16_subtile) { 7469 TEST_REQUIRES_ARM_NEON_DOT; 7470 for (uint32_t n = 32; n <= 48; n += 16) { 7471 for (size_t k = 1; k <= 40; k += 9) { 7472 for (uint32_t m = 1; m <= 1; m++) { 7473 GemmMicrokernelTester() 7474 .mr(1) 7475 .nr(16) 7476 .kr(4) 7477 .sr(1) 7478 .m(m) 7479 .n(n) 7480 .k(k) 7481 .iterations(1) 7482 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7483 } 7484 } 7485 } 7486 } 7487 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,strided_cm_subtile)7488 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, strided_cm_subtile) { 7489 TEST_REQUIRES_ARM_NEON_DOT; 7490 for (size_t k = 1; k <= 40; k += 9) { 7491 for (uint32_t n = 1; n <= 16; n++) { 7492 for (uint32_t m = 1; m <= 1; m++) { 7493 GemmMicrokernelTester() 7494 .mr(1) 7495 .nr(16) 7496 .kr(4) 7497 .sr(1) 7498 .m(m) 7499 .n(n) 7500 .k(k) 7501 .cm_stride(19) 7502 .iterations(1) 7503 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7504 } 7505 } 7506 } 7507 } 7508 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,qmin)7509 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, qmin) { 7510 TEST_REQUIRES_ARM_NEON_DOT; 7511 GemmMicrokernelTester() 7512 .mr(1) 7513 .nr(16) 7514 .kr(4) 7515 .sr(1) 7516 .m(1) 7517 .n(16) 7518 .k(8) 7519 .qmin(128) 7520 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7521 } 7522 
TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,qmax)7523 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, qmax) { 7524 TEST_REQUIRES_ARM_NEON_DOT; 7525 GemmMicrokernelTester() 7526 .mr(1) 7527 .nr(16) 7528 .kr(4) 7529 .sr(1) 7530 .m(1) 7531 .n(16) 7532 .k(8) 7533 .qmax(128) 7534 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7535 } 7536 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,strided_cm)7537 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, strided_cm) { 7538 TEST_REQUIRES_ARM_NEON_DOT; 7539 GemmMicrokernelTester() 7540 .mr(1) 7541 .nr(16) 7542 .kr(4) 7543 .sr(1) 7544 .m(1) 7545 .n(16) 7546 .k(8) 7547 .cm_stride(19) 7548 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7549 } 7550 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,no_a_zero_point)7551 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, no_a_zero_point) { 7552 TEST_REQUIRES_ARM_NEON_DOT; 7553 for (size_t k = 1; k <= 40; k += 9) { 7554 GemmMicrokernelTester() 7555 .mr(1) 7556 .nr(16) 7557 .kr(4) 7558 .sr(1) 7559 .m(1) 7560 .n(16) 7561 .k(k) 7562 .a_zero_point(0) 7563 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7564 } 7565 } 7566 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,no_b_zero_point)7567 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, no_b_zero_point) { 7568 TEST_REQUIRES_ARM_NEON_DOT; 7569 for (size_t k = 1; k <= 40; k += 9) { 7570 GemmMicrokernelTester() 7571 .mr(1) 7572 .nr(16) 7573 .kr(4) 7574 .sr(1) 7575 .m(1) 7576 .n(16) 7577 .k(k) 7578 .b_zero_point(0) 7579 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7580 } 7581 } 7582 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT,no_zero_point)7583 TEST(QU8_GEMM_MINMAX_RNDNU_1X16C4__NEONDOT, no_zero_point) { 7584 TEST_REQUIRES_ARM_NEON_DOT; 7585 for (size_t k = 1; k 
<= 40; k += 9) { 7586 GemmMicrokernelTester() 7587 .mr(1) 7588 .nr(16) 7589 .kr(4) 7590 .sr(1) 7591 .m(1) 7592 .n(16) 7593 .k(k) 7594 .a_zero_point(0) 7595 .b_zero_point(0) 7596 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7597 } 7598 } 7599 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 7600 7601 7602 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_eq_8)7603 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_eq_8) { 7604 TEST_REQUIRES_ARM_NEON_DOT; 7605 GemmMicrokernelTester() 7606 .mr(1) 7607 .nr(32) 7608 .kr(4) 7609 .sr(1) 7610 .m(1) 7611 .n(32) 7612 .k(8) 7613 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7614 } 7615 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,strided_cn)7616 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, strided_cn) { 7617 TEST_REQUIRES_ARM_NEON_DOT; 7618 GemmMicrokernelTester() 7619 .mr(1) 7620 .nr(32) 7621 .kr(4) 7622 .sr(1) 7623 .m(1) 7624 .n(32) 7625 .k(8) 7626 .cn_stride(37) 7627 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7628 } 7629 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_eq_8_strided_a)7630 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_eq_8_strided_a) { 7631 TEST_REQUIRES_ARM_NEON_DOT; 7632 GemmMicrokernelTester() 7633 .mr(1) 7634 .nr(32) 7635 .kr(4) 7636 .sr(1) 7637 .m(1) 7638 .n(32) 7639 .k(8) 7640 .a_stride(11) 7641 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7642 } 7643 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_eq_8_subtile)7644 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_eq_8_subtile) { 7645 TEST_REQUIRES_ARM_NEON_DOT; 7646 for (uint32_t n = 1; n <= 32; n++) { 7647 for (uint32_t m 
= 1; m <= 1; m++) { 7648 GemmMicrokernelTester() 7649 .mr(1) 7650 .nr(32) 7651 .kr(4) 7652 .sr(1) 7653 .m(m) 7654 .n(n) 7655 .k(8) 7656 .iterations(1) 7657 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7658 } 7659 } 7660 } 7661 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_eq_8_subtile_m)7662 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_eq_8_subtile_m) { 7663 TEST_REQUIRES_ARM_NEON_DOT; 7664 for (uint32_t m = 1; m <= 1; m++) { 7665 GemmMicrokernelTester() 7666 .mr(1) 7667 .nr(32) 7668 .kr(4) 7669 .sr(1) 7670 .m(m) 7671 .n(32) 7672 .k(8) 7673 .iterations(1) 7674 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7675 } 7676 } 7677 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_eq_8_subtile_n)7678 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_eq_8_subtile_n) { 7679 TEST_REQUIRES_ARM_NEON_DOT; 7680 for (uint32_t n = 1; n <= 32; n++) { 7681 GemmMicrokernelTester() 7682 .mr(1) 7683 .nr(32) 7684 .kr(4) 7685 .sr(1) 7686 .m(1) 7687 .n(n) 7688 .k(8) 7689 .iterations(1) 7690 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7691 } 7692 } 7693 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_lt_8)7694 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_lt_8) { 7695 TEST_REQUIRES_ARM_NEON_DOT; 7696 for (size_t k = 1; k < 8; k++) { 7697 GemmMicrokernelTester() 7698 .mr(1) 7699 .nr(32) 7700 .kr(4) 7701 .sr(1) 7702 .m(1) 7703 .n(32) 7704 .k(k) 7705 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7706 } 7707 } 7708 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_lt_8_strided_a)7709 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_lt_8_strided_a) { 7710 TEST_REQUIRES_ARM_NEON_DOT; 7711 for (size_t k = 1; k < 8; k++) { 7712 GemmMicrokernelTester() 7713 .mr(1) 7714 .nr(32) 7715 .kr(4) 7716 
.sr(1) 7717 .m(1) 7718 .n(32) 7719 .k(k) 7720 .a_stride(11) 7721 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7722 } 7723 } 7724 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_lt_8_subtile)7725 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_lt_8_subtile) { 7726 TEST_REQUIRES_ARM_NEON_DOT; 7727 for (size_t k = 1; k < 8; k++) { 7728 for (uint32_t n = 1; n <= 32; n++) { 7729 for (uint32_t m = 1; m <= 1; m++) { 7730 GemmMicrokernelTester() 7731 .mr(1) 7732 .nr(32) 7733 .kr(4) 7734 .sr(1) 7735 .m(m) 7736 .n(n) 7737 .k(k) 7738 .iterations(1) 7739 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7740 } 7741 } 7742 } 7743 } 7744 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_gt_8)7745 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_gt_8) { 7746 TEST_REQUIRES_ARM_NEON_DOT; 7747 for (size_t k = 9; k < 16; k++) { 7748 GemmMicrokernelTester() 7749 .mr(1) 7750 .nr(32) 7751 .kr(4) 7752 .sr(1) 7753 .m(1) 7754 .n(32) 7755 .k(k) 7756 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7757 } 7758 } 7759 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_gt_8_strided_a)7760 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_gt_8_strided_a) { 7761 TEST_REQUIRES_ARM_NEON_DOT; 7762 for (size_t k = 9; k < 16; k++) { 7763 GemmMicrokernelTester() 7764 .mr(1) 7765 .nr(32) 7766 .kr(4) 7767 .sr(1) 7768 .m(1) 7769 .n(32) 7770 .k(k) 7771 .a_stride(19) 7772 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7773 } 7774 } 7775 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_gt_8_subtile)7776 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_gt_8_subtile) { 7777 TEST_REQUIRES_ARM_NEON_DOT; 7778 for (size_t k = 9; k < 16; k++) { 7779 for (uint32_t n = 1; n <= 32; n++) { 7780 for (uint32_t m = 1; m <= 1; m++) { 
7781 GemmMicrokernelTester() 7782 .mr(1) 7783 .nr(32) 7784 .kr(4) 7785 .sr(1) 7786 .m(m) 7787 .n(n) 7788 .k(k) 7789 .iterations(1) 7790 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7791 } 7792 } 7793 } 7794 } 7795 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_div_8)7796 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_div_8) { 7797 TEST_REQUIRES_ARM_NEON_DOT; 7798 for (size_t k = 16; k <= 80; k += 8) { 7799 GemmMicrokernelTester() 7800 .mr(1) 7801 .nr(32) 7802 .kr(4) 7803 .sr(1) 7804 .m(1) 7805 .n(32) 7806 .k(k) 7807 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7808 } 7809 } 7810 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_div_8_strided_a)7811 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_div_8_strided_a) { 7812 TEST_REQUIRES_ARM_NEON_DOT; 7813 for (size_t k = 16; k <= 80; k += 8) { 7814 GemmMicrokernelTester() 7815 .mr(1) 7816 .nr(32) 7817 .kr(4) 7818 .sr(1) 7819 .m(1) 7820 .n(32) 7821 .k(k) 7822 .a_stride(83) 7823 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7824 } 7825 } 7826 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,k_div_8_subtile)7827 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, k_div_8_subtile) { 7828 TEST_REQUIRES_ARM_NEON_DOT; 7829 for (size_t k = 16; k <= 80; k += 8) { 7830 for (uint32_t n = 1; n <= 32; n++) { 7831 for (uint32_t m = 1; m <= 1; m++) { 7832 GemmMicrokernelTester() 7833 .mr(1) 7834 .nr(32) 7835 .kr(4) 7836 .sr(1) 7837 .m(m) 7838 .n(n) 7839 .k(k) 7840 .iterations(1) 7841 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7842 } 7843 } 7844 } 7845 } 7846 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_gt_32)7847 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_gt_32) { 7848 TEST_REQUIRES_ARM_NEON_DOT; 7849 for (uint32_t n = 33; n < 
64; n++) { 7850 for (size_t k = 1; k <= 40; k += 9) { 7851 GemmMicrokernelTester() 7852 .mr(1) 7853 .nr(32) 7854 .kr(4) 7855 .sr(1) 7856 .m(1) 7857 .n(n) 7858 .k(k) 7859 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7860 } 7861 } 7862 } 7863 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_gt_32_strided_cn)7864 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_gt_32_strided_cn) { 7865 TEST_REQUIRES_ARM_NEON_DOT; 7866 for (uint32_t n = 33; n < 64; n++) { 7867 for (size_t k = 1; k <= 40; k += 9) { 7868 GemmMicrokernelTester() 7869 .mr(1) 7870 .nr(32) 7871 .kr(4) 7872 .sr(1) 7873 .m(1) 7874 .n(n) 7875 .k(k) 7876 .cn_stride(37) 7877 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7878 } 7879 } 7880 } 7881 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_gt_32_strided_a)7882 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_gt_32_strided_a) { 7883 TEST_REQUIRES_ARM_NEON_DOT; 7884 for (uint32_t n = 33; n < 64; n++) { 7885 for (size_t k = 1; k <= 40; k += 9) { 7886 GemmMicrokernelTester() 7887 .mr(1) 7888 .nr(32) 7889 .kr(4) 7890 .sr(1) 7891 .m(1) 7892 .n(n) 7893 .k(k) 7894 .a_stride(43) 7895 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7896 } 7897 } 7898 } 7899 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_gt_32_subtile)7900 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_gt_32_subtile) { 7901 TEST_REQUIRES_ARM_NEON_DOT; 7902 for (uint32_t n = 33; n < 64; n++) { 7903 for (size_t k = 1; k <= 40; k += 9) { 7904 for (uint32_t m = 1; m <= 1; m++) { 7905 GemmMicrokernelTester() 7906 .mr(1) 7907 .nr(32) 7908 .kr(4) 7909 .sr(1) 7910 .m(m) 7911 .n(n) 7912 .k(k) 7913 .iterations(1) 7914 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7915 } 7916 } 7917 } 7918 } 7919 
TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_div_32)7920 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_div_32) { 7921 TEST_REQUIRES_ARM_NEON_DOT; 7922 for (uint32_t n = 64; n <= 96; n += 32) { 7923 for (size_t k = 1; k <= 40; k += 9) { 7924 GemmMicrokernelTester() 7925 .mr(1) 7926 .nr(32) 7927 .kr(4) 7928 .sr(1) 7929 .m(1) 7930 .n(n) 7931 .k(k) 7932 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7933 } 7934 } 7935 } 7936 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_div_32_strided_cn)7937 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_div_32_strided_cn) { 7938 TEST_REQUIRES_ARM_NEON_DOT; 7939 for (uint32_t n = 64; n <= 96; n += 32) { 7940 for (size_t k = 1; k <= 40; k += 9) { 7941 GemmMicrokernelTester() 7942 .mr(1) 7943 .nr(32) 7944 .kr(4) 7945 .sr(1) 7946 .m(1) 7947 .n(n) 7948 .k(k) 7949 .cn_stride(37) 7950 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7951 } 7952 } 7953 } 7954 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_div_32_strided_a)7955 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_div_32_strided_a) { 7956 TEST_REQUIRES_ARM_NEON_DOT; 7957 for (uint32_t n = 64; n <= 96; n += 32) { 7958 for (size_t k = 1; k <= 40; k += 9) { 7959 GemmMicrokernelTester() 7960 .mr(1) 7961 .nr(32) 7962 .kr(4) 7963 .sr(1) 7964 .m(1) 7965 .n(n) 7966 .k(k) 7967 .a_stride(43) 7968 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7969 } 7970 } 7971 } 7972 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,n_div_32_subtile)7973 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, n_div_32_subtile) { 7974 TEST_REQUIRES_ARM_NEON_DOT; 7975 for (uint32_t n = 64; n <= 96; n += 32) { 7976 for (size_t k = 1; k <= 40; k += 9) { 7977 for (uint32_t m = 1; m <= 1; m++) { 7978 GemmMicrokernelTester() 7979 .mr(1) 7980 .nr(32) 7981 .kr(4) 7982 .sr(1) 7983 .m(m) 7984 .n(n) 7985 
.k(k) 7986 .iterations(1) 7987 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7988 } 7989 } 7990 } 7991 } 7992 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,strided_cm_subtile)7993 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, strided_cm_subtile) { 7994 TEST_REQUIRES_ARM_NEON_DOT; 7995 for (size_t k = 1; k <= 40; k += 9) { 7996 for (uint32_t n = 1; n <= 32; n++) { 7997 for (uint32_t m = 1; m <= 1; m++) { 7998 GemmMicrokernelTester() 7999 .mr(1) 8000 .nr(32) 8001 .kr(4) 8002 .sr(1) 8003 .m(m) 8004 .n(n) 8005 .k(k) 8006 .cm_stride(37) 8007 .iterations(1) 8008 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8009 } 8010 } 8011 } 8012 } 8013 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,qmin)8014 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, qmin) { 8015 TEST_REQUIRES_ARM_NEON_DOT; 8016 GemmMicrokernelTester() 8017 .mr(1) 8018 .nr(32) 8019 .kr(4) 8020 .sr(1) 8021 .m(1) 8022 .n(32) 8023 .k(8) 8024 .qmin(128) 8025 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8026 } 8027 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,qmax)8028 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, qmax) { 8029 TEST_REQUIRES_ARM_NEON_DOT; 8030 GemmMicrokernelTester() 8031 .mr(1) 8032 .nr(32) 8033 .kr(4) 8034 .sr(1) 8035 .m(1) 8036 .n(32) 8037 .k(8) 8038 .qmax(128) 8039 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8040 } 8041 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,strided_cm)8042 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, strided_cm) { 8043 TEST_REQUIRES_ARM_NEON_DOT; 8044 GemmMicrokernelTester() 8045 .mr(1) 8046 .nr(32) 8047 .kr(4) 8048 .sr(1) 8049 .m(1) 8050 .n(32) 8051 .k(8) 8052 .cm_stride(37) 8053 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8054 } 8055 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,no_a_zero_point)8056 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, no_a_zero_point) { 8057 TEST_REQUIRES_ARM_NEON_DOT; 8058 for (size_t k = 1; k <= 40; k += 9) { 8059 GemmMicrokernelTester() 8060 .mr(1) 8061 .nr(32) 8062 .kr(4) 8063 .sr(1) 8064 .m(1) 8065 .n(32) 8066 .k(k) 8067 .a_zero_point(0) 8068 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8069 } 8070 } 8071 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,no_b_zero_point)8072 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, no_b_zero_point) { 8073 TEST_REQUIRES_ARM_NEON_DOT; 8074 for (size_t k = 1; k <= 40; k += 9) { 8075 GemmMicrokernelTester() 8076 .mr(1) 8077 .nr(32) 8078 .kr(4) 8079 .sr(1) 8080 .m(1) 8081 .n(32) 8082 .k(k) 8083 .b_zero_point(0) 8084 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8085 } 8086 } 8087 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT,no_zero_point)8088 TEST(QU8_GEMM_MINMAX_RNDNU_1X32C4__NEONDOT, no_zero_point) { 8089 TEST_REQUIRES_ARM_NEON_DOT; 8090 for (size_t k = 1; k <= 40; k += 9) { 8091 GemmMicrokernelTester() 8092 .mr(1) 8093 .nr(32) 8094 .kr(4) 8095 .sr(1) 8096 .m(1) 8097 .n(32) 8098 .k(k) 8099 .a_zero_point(0) 8100 .b_zero_point(0) 8101 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8102 } 8103 } 8104 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 8105 8106 8107 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_eq_8)8108 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_eq_8) { 8109 TEST_REQUIRES_ARM_NEON_DOT; 8110 GemmMicrokernelTester() 8111 .mr(2) 8112 .nr(8) 8113 .kr(4) 8114 .sr(1) 8115 .m(2) 8116 .n(8) 
8117 .k(8) 8118 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8119 } 8120 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,strided_cn)8121 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, strided_cn) { 8122 TEST_REQUIRES_ARM_NEON_DOT; 8123 GemmMicrokernelTester() 8124 .mr(2) 8125 .nr(8) 8126 .kr(4) 8127 .sr(1) 8128 .m(2) 8129 .n(8) 8130 .k(8) 8131 .cn_stride(11) 8132 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8133 } 8134 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_eq_8_strided_a)8135 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_eq_8_strided_a) { 8136 TEST_REQUIRES_ARM_NEON_DOT; 8137 GemmMicrokernelTester() 8138 .mr(2) 8139 .nr(8) 8140 .kr(4) 8141 .sr(1) 8142 .m(2) 8143 .n(8) 8144 .k(8) 8145 .a_stride(11) 8146 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8147 } 8148 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_eq_8_subtile)8149 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_eq_8_subtile) { 8150 TEST_REQUIRES_ARM_NEON_DOT; 8151 for (uint32_t n = 1; n <= 8; n++) { 8152 for (uint32_t m = 1; m <= 2; m++) { 8153 GemmMicrokernelTester() 8154 .mr(2) 8155 .nr(8) 8156 .kr(4) 8157 .sr(1) 8158 .m(m) 8159 .n(n) 8160 .k(8) 8161 .iterations(1) 8162 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8163 } 8164 } 8165 } 8166 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_eq_8_subtile_m)8167 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_eq_8_subtile_m) { 8168 TEST_REQUIRES_ARM_NEON_DOT; 8169 for (uint32_t m = 1; m <= 2; m++) { 8170 GemmMicrokernelTester() 8171 .mr(2) 8172 .nr(8) 8173 .kr(4) 8174 .sr(1) 8175 .m(m) 8176 .n(8) 8177 .k(8) 8178 .iterations(1) 8179 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 
8180 } 8181 } 8182 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_eq_8_subtile_n)8183 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_eq_8_subtile_n) { 8184 TEST_REQUIRES_ARM_NEON_DOT; 8185 for (uint32_t n = 1; n <= 8; n++) { 8186 GemmMicrokernelTester() 8187 .mr(2) 8188 .nr(8) 8189 .kr(4) 8190 .sr(1) 8191 .m(2) 8192 .n(n) 8193 .k(8) 8194 .iterations(1) 8195 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8196 } 8197 } 8198 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_lt_8)8199 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_lt_8) { 8200 TEST_REQUIRES_ARM_NEON_DOT; 8201 for (size_t k = 1; k < 8; k++) { 8202 GemmMicrokernelTester() 8203 .mr(2) 8204 .nr(8) 8205 .kr(4) 8206 .sr(1) 8207 .m(2) 8208 .n(8) 8209 .k(k) 8210 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8211 } 8212 } 8213 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_lt_8_strided_a)8214 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_lt_8_strided_a) { 8215 TEST_REQUIRES_ARM_NEON_DOT; 8216 for (size_t k = 1; k < 8; k++) { 8217 GemmMicrokernelTester() 8218 .mr(2) 8219 .nr(8) 8220 .kr(4) 8221 .sr(1) 8222 .m(2) 8223 .n(8) 8224 .k(k) 8225 .a_stride(11) 8226 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8227 } 8228 } 8229 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,k_lt_8_subtile)8230 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_lt_8_subtile) { 8231 TEST_REQUIRES_ARM_NEON_DOT; 8232 for (size_t k = 1; k < 8; k++) { 8233 for (uint32_t n = 1; n <= 8; n++) { 8234 for (uint32_t m = 1; m <= 2; m++) { 8235 GemmMicrokernelTester() 8236 .mr(2) 8237 .nr(8) 8238 .kr(4) 8239 .sr(1) 8240 .m(m) 8241 .n(n) 8242 .k(k) 8243 .iterations(1) 8244 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8245 } 8246 } 8247 } 8248 } 8249 
// 2x8c4 NEONDOT: k swept over 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(8).kr(4).sr(1)
      .m(2).n(8).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 2x8c4 NEONDOT: k swept over 9..15 with a_stride(19).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(8).kr(4).sr(1)
      .m(2).n(8).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 2x8c4 NEONDOT: k swept over 9..15, every n in 1..8 and m in 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 2x8c4 NEONDOT: k swept over 16..80 in steps of 8.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(8).kr(4).sr(1)
      .m(2).n(8).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 2x8c4 NEONDOT: k swept over 16..80 in steps of 8 with a_stride(83).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(8).kr(4).sr(1)
      .m(2).n(8).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 2x8c4 NEONDOT: k swept over 16..80 in steps of 8, every n in 1..8 and m in 1..2.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 2x8c4 NEONDOT: n swept over 9..15, k over 1..40 in steps of 9.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_gt_8, with cn_stride(11).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_gt_8, with a_stride(43).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_gt_8, additionally looping m over 1..2 with one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 2x8c4 NEONDOT: n takes 16 and 24, k swept over 1..40 in steps of 9.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_div_8, with cn_stride(11).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_div_8, with a_stride(43).
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(8).kr(4).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_div_8, additionally looping m over 1..2 with one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 2x8c4 NEONDOT: cm_stride(11) over k in 1..40 (step 9), n in 1..8, m in 1..2.
TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,qmin)8519 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, qmin) { 8520 TEST_REQUIRES_ARM_NEON_DOT; 8521 GemmMicrokernelTester() 8522 .mr(2) 8523 .nr(8) 8524 .kr(4) 8525 .sr(1) 8526 .m(2) 8527 .n(8) 8528 .k(8) 8529 .qmin(128) 8530 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8531 } 8532 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,qmax)8533 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, qmax) { 8534 TEST_REQUIRES_ARM_NEON_DOT; 8535 GemmMicrokernelTester() 8536 .mr(2) 8537 .nr(8) 8538 .kr(4) 8539 .sr(1) 8540 .m(2) 8541 .n(8) 8542 .k(8) 8543 .qmax(128) 8544 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8545 } 8546 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,strided_cm)8547 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, strided_cm) { 8548 TEST_REQUIRES_ARM_NEON_DOT; 8549 GemmMicrokernelTester() 8550 .mr(2) 8551 .nr(8) 8552 .kr(4) 8553 .sr(1) 8554 .m(2) 8555 .n(8) 8556 .k(8) 8557 .cm_stride(11) 8558 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8559 } 8560 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,no_a_zero_point)8561 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, no_a_zero_point) { 8562 TEST_REQUIRES_ARM_NEON_DOT; 8563 for (size_t k = 1; k <= 40; k += 9) { 8564 GemmMicrokernelTester() 8565 .mr(2) 8566 .nr(8) 8567 .kr(4) 8568 .sr(1) 8569 .m(2) 8570 .n(8) 8571 .k(k) 8572 .a_zero_point(0) 8573 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8574 } 8575 } 8576 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,no_b_zero_point)8577 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, no_b_zero_point) { 8578 TEST_REQUIRES_ARM_NEON_DOT; 8579 for (size_t k = 1; k <= 40; k += 9) { 8580 GemmMicrokernelTester() 8581 .mr(2) 8582 .nr(8) 8583 .kr(4) 8584 .sr(1) 
8585 .m(2) 8586 .n(8) 8587 .k(k) 8588 .b_zero_point(0) 8589 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8590 } 8591 } 8592 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT,no_zero_point)8593 TEST(QU8_GEMM_MINMAX_RNDNU_2X8C4__NEONDOT, no_zero_point) { 8594 TEST_REQUIRES_ARM_NEON_DOT; 8595 for (size_t k = 1; k <= 40; k += 9) { 8596 GemmMicrokernelTester() 8597 .mr(2) 8598 .nr(8) 8599 .kr(4) 8600 .sr(1) 8601 .m(2) 8602 .n(8) 8603 .k(k) 8604 .a_zero_point(0) 8605 .b_zero_point(0) 8606 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8607 } 8608 } 8609 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 8610 8611 8612 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_eq_8)8613 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_eq_8) { 8614 TEST_REQUIRES_ARM_NEON_DOT; 8615 GemmMicrokernelTester() 8616 .mr(3) 8617 .nr(8) 8618 .kr(4) 8619 .sr(1) 8620 .m(3) 8621 .n(8) 8622 .k(8) 8623 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8624 } 8625 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,strided_cn)8626 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, strided_cn) { 8627 TEST_REQUIRES_ARM_NEON_DOT; 8628 GemmMicrokernelTester() 8629 .mr(3) 8630 .nr(8) 8631 .kr(4) 8632 .sr(1) 8633 .m(3) 8634 .n(8) 8635 .k(8) 8636 .cn_stride(11) 8637 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8638 } 8639 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_eq_8_strided_a)8640 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_eq_8_strided_a) { 8641 TEST_REQUIRES_ARM_NEON_DOT; 8642 GemmMicrokernelTester() 8643 .mr(3) 8644 .nr(8) 8645 .kr(4) 8646 .sr(1) 8647 .m(3) 8648 .n(8) 8649 .k(8) 8650 
.a_stride(11) 8651 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8652 } 8653 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_eq_8_subtile)8654 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_eq_8_subtile) { 8655 TEST_REQUIRES_ARM_NEON_DOT; 8656 for (uint32_t n = 1; n <= 8; n++) { 8657 for (uint32_t m = 1; m <= 3; m++) { 8658 GemmMicrokernelTester() 8659 .mr(3) 8660 .nr(8) 8661 .kr(4) 8662 .sr(1) 8663 .m(m) 8664 .n(n) 8665 .k(8) 8666 .iterations(1) 8667 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8668 } 8669 } 8670 } 8671 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_eq_8_subtile_m)8672 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_eq_8_subtile_m) { 8673 TEST_REQUIRES_ARM_NEON_DOT; 8674 for (uint32_t m = 1; m <= 3; m++) { 8675 GemmMicrokernelTester() 8676 .mr(3) 8677 .nr(8) 8678 .kr(4) 8679 .sr(1) 8680 .m(m) 8681 .n(8) 8682 .k(8) 8683 .iterations(1) 8684 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8685 } 8686 } 8687 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_eq_8_subtile_n)8688 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_eq_8_subtile_n) { 8689 TEST_REQUIRES_ARM_NEON_DOT; 8690 for (uint32_t n = 1; n <= 8; n++) { 8691 GemmMicrokernelTester() 8692 .mr(3) 8693 .nr(8) 8694 .kr(4) 8695 .sr(1) 8696 .m(3) 8697 .n(n) 8698 .k(8) 8699 .iterations(1) 8700 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8701 } 8702 } 8703 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_lt_8)8704 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_lt_8) { 8705 TEST_REQUIRES_ARM_NEON_DOT; 8706 for (size_t k = 1; k < 8; k++) { 8707 GemmMicrokernelTester() 8708 .mr(3) 8709 .nr(8) 8710 .kr(4) 8711 .sr(1) 8712 .m(3) 8713 .n(8) 8714 .k(k) 8715 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8716 } 8717 } 8718 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_lt_8_strided_a)8719 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_lt_8_strided_a) { 8720 TEST_REQUIRES_ARM_NEON_DOT; 8721 for (size_t k = 1; k < 8; k++) { 8722 GemmMicrokernelTester() 8723 .mr(3) 8724 .nr(8) 8725 .kr(4) 8726 .sr(1) 8727 .m(3) 8728 .n(8) 8729 .k(k) 8730 .a_stride(11) 8731 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8732 } 8733 } 8734 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_lt_8_subtile)8735 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_lt_8_subtile) { 8736 TEST_REQUIRES_ARM_NEON_DOT; 8737 for (size_t k = 1; k < 8; k++) { 8738 for (uint32_t n = 1; n <= 8; n++) { 8739 for (uint32_t m = 1; m <= 3; m++) { 8740 GemmMicrokernelTester() 8741 .mr(3) 8742 .nr(8) 8743 .kr(4) 8744 .sr(1) 8745 .m(m) 8746 .n(n) 8747 .k(k) 8748 .iterations(1) 8749 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8750 } 8751 } 8752 } 8753 } 8754 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_gt_8)8755 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_gt_8) { 8756 TEST_REQUIRES_ARM_NEON_DOT; 8757 for (size_t k = 9; k < 16; k++) { 8758 GemmMicrokernelTester() 8759 .mr(3) 8760 .nr(8) 8761 .kr(4) 8762 .sr(1) 8763 .m(3) 8764 .n(8) 8765 .k(k) 8766 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8767 } 8768 } 8769 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_gt_8_strided_a)8770 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_gt_8_strided_a) { 8771 TEST_REQUIRES_ARM_NEON_DOT; 8772 for (size_t k = 9; k < 16; k++) { 8773 GemmMicrokernelTester() 8774 .mr(3) 8775 .nr(8) 8776 .kr(4) 8777 .sr(1) 8778 .m(3) 8779 .n(8) 8780 .k(k) 8781 .a_stride(19) 8782 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8783 } 8784 } 8785 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_gt_8_subtile)8786 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_gt_8_subtile) { 8787 TEST_REQUIRES_ARM_NEON_DOT; 8788 for (size_t k = 9; k < 16; k++) { 8789 for (uint32_t n = 1; n <= 8; n++) { 8790 for (uint32_t m = 1; m <= 3; m++) { 8791 GemmMicrokernelTester() 8792 .mr(3) 8793 .nr(8) 8794 .kr(4) 8795 .sr(1) 8796 .m(m) 8797 .n(n) 8798 .k(k) 8799 .iterations(1) 8800 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8801 } 8802 } 8803 } 8804 } 8805 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_div_8)8806 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_div_8) { 8807 TEST_REQUIRES_ARM_NEON_DOT; 8808 for (size_t k = 16; k <= 80; k += 8) { 8809 GemmMicrokernelTester() 8810 .mr(3) 8811 .nr(8) 8812 .kr(4) 8813 .sr(1) 8814 .m(3) 8815 .n(8) 8816 .k(k) 8817 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8818 } 8819 } 8820 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_div_8_strided_a)8821 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_div_8_strided_a) { 8822 TEST_REQUIRES_ARM_NEON_DOT; 8823 for (size_t k = 16; k <= 80; k += 8) { 8824 GemmMicrokernelTester() 8825 .mr(3) 8826 .nr(8) 8827 .kr(4) 8828 .sr(1) 8829 .m(3) 8830 .n(8) 8831 .k(k) 8832 .a_stride(83) 8833 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8834 } 8835 } 8836 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,k_div_8_subtile)8837 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, k_div_8_subtile) { 8838 TEST_REQUIRES_ARM_NEON_DOT; 8839 for (size_t k = 16; k <= 80; k += 8) { 8840 for (uint32_t n = 1; n <= 8; n++) { 8841 for (uint32_t m = 1; m <= 3; m++) { 8842 GemmMicrokernelTester() 8843 .mr(3) 8844 .nr(8) 8845 .kr(4) 8846 .sr(1) 8847 .m(m) 8848 .n(n) 8849 .k(k) 8850 
.iterations(1) 8851 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8852 } 8853 } 8854 } 8855 } 8856 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_gt_8)8857 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_gt_8) { 8858 TEST_REQUIRES_ARM_NEON_DOT; 8859 for (uint32_t n = 9; n < 16; n++) { 8860 for (size_t k = 1; k <= 40; k += 9) { 8861 GemmMicrokernelTester() 8862 .mr(3) 8863 .nr(8) 8864 .kr(4) 8865 .sr(1) 8866 .m(3) 8867 .n(n) 8868 .k(k) 8869 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8870 } 8871 } 8872 } 8873 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_gt_8_strided_cn)8874 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_gt_8_strided_cn) { 8875 TEST_REQUIRES_ARM_NEON_DOT; 8876 for (uint32_t n = 9; n < 16; n++) { 8877 for (size_t k = 1; k <= 40; k += 9) { 8878 GemmMicrokernelTester() 8879 .mr(3) 8880 .nr(8) 8881 .kr(4) 8882 .sr(1) 8883 .m(3) 8884 .n(n) 8885 .k(k) 8886 .cn_stride(11) 8887 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8888 } 8889 } 8890 } 8891 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_gt_8_strided_a)8892 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_gt_8_strided_a) { 8893 TEST_REQUIRES_ARM_NEON_DOT; 8894 for (uint32_t n = 9; n < 16; n++) { 8895 for (size_t k = 1; k <= 40; k += 9) { 8896 GemmMicrokernelTester() 8897 .mr(3) 8898 .nr(8) 8899 .kr(4) 8900 .sr(1) 8901 .m(3) 8902 .n(n) 8903 .k(k) 8904 .a_stride(43) 8905 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8906 } 8907 } 8908 } 8909 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_gt_8_subtile)8910 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_gt_8_subtile) { 8911 TEST_REQUIRES_ARM_NEON_DOT; 8912 for (uint32_t n = 9; n < 16; n++) { 8913 for (size_t k = 1; k <= 40; k += 9) { 8914 for (uint32_t m = 
1; m <= 3; m++) { 8915 GemmMicrokernelTester() 8916 .mr(3) 8917 .nr(8) 8918 .kr(4) 8919 .sr(1) 8920 .m(m) 8921 .n(n) 8922 .k(k) 8923 .iterations(1) 8924 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8925 } 8926 } 8927 } 8928 } 8929 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_div_8)8930 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_div_8) { 8931 TEST_REQUIRES_ARM_NEON_DOT; 8932 for (uint32_t n = 16; n <= 24; n += 8) { 8933 for (size_t k = 1; k <= 40; k += 9) { 8934 GemmMicrokernelTester() 8935 .mr(3) 8936 .nr(8) 8937 .kr(4) 8938 .sr(1) 8939 .m(3) 8940 .n(n) 8941 .k(k) 8942 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8943 } 8944 } 8945 } 8946 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_div_8_strided_cn)8947 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_div_8_strided_cn) { 8948 TEST_REQUIRES_ARM_NEON_DOT; 8949 for (uint32_t n = 16; n <= 24; n += 8) { 8950 for (size_t k = 1; k <= 40; k += 9) { 8951 GemmMicrokernelTester() 8952 .mr(3) 8953 .nr(8) 8954 .kr(4) 8955 .sr(1) 8956 .m(3) 8957 .n(n) 8958 .k(k) 8959 .cn_stride(11) 8960 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8961 } 8962 } 8963 } 8964 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_div_8_strided_a)8965 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, n_div_8_strided_a) { 8966 TEST_REQUIRES_ARM_NEON_DOT; 8967 for (uint32_t n = 16; n <= 24; n += 8) { 8968 for (size_t k = 1; k <= 40; k += 9) { 8969 GemmMicrokernelTester() 8970 .mr(3) 8971 .nr(8) 8972 .kr(4) 8973 .sr(1) 8974 .m(3) 8975 .n(n) 8976 .k(k) 8977 .a_stride(43) 8978 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8979 } 8980 } 8981 } 8982 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,n_div_8_subtile)8983 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, 
n_div_8_subtile) { 8984 TEST_REQUIRES_ARM_NEON_DOT; 8985 for (uint32_t n = 16; n <= 24; n += 8) { 8986 for (size_t k = 1; k <= 40; k += 9) { 8987 for (uint32_t m = 1; m <= 3; m++) { 8988 GemmMicrokernelTester() 8989 .mr(3) 8990 .nr(8) 8991 .kr(4) 8992 .sr(1) 8993 .m(m) 8994 .n(n) 8995 .k(k) 8996 .iterations(1) 8997 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8998 } 8999 } 9000 } 9001 } 9002 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,strided_cm_subtile)9003 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, strided_cm_subtile) { 9004 TEST_REQUIRES_ARM_NEON_DOT; 9005 for (size_t k = 1; k <= 40; k += 9) { 9006 for (uint32_t n = 1; n <= 8; n++) { 9007 for (uint32_t m = 1; m <= 3; m++) { 9008 GemmMicrokernelTester() 9009 .mr(3) 9010 .nr(8) 9011 .kr(4) 9012 .sr(1) 9013 .m(m) 9014 .n(n) 9015 .k(k) 9016 .cm_stride(11) 9017 .iterations(1) 9018 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9019 } 9020 } 9021 } 9022 } 9023 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,qmin)9024 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, qmin) { 9025 TEST_REQUIRES_ARM_NEON_DOT; 9026 GemmMicrokernelTester() 9027 .mr(3) 9028 .nr(8) 9029 .kr(4) 9030 .sr(1) 9031 .m(3) 9032 .n(8) 9033 .k(8) 9034 .qmin(128) 9035 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9036 } 9037 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,qmax)9038 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, qmax) { 9039 TEST_REQUIRES_ARM_NEON_DOT; 9040 GemmMicrokernelTester() 9041 .mr(3) 9042 .nr(8) 9043 .kr(4) 9044 .sr(1) 9045 .m(3) 9046 .n(8) 9047 .k(8) 9048 .qmax(128) 9049 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9050 } 9051 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,strided_cm)9052 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, 
strided_cm) { 9053 TEST_REQUIRES_ARM_NEON_DOT; 9054 GemmMicrokernelTester() 9055 .mr(3) 9056 .nr(8) 9057 .kr(4) 9058 .sr(1) 9059 .m(3) 9060 .n(8) 9061 .k(8) 9062 .cm_stride(11) 9063 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9064 } 9065 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,no_a_zero_point)9066 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, no_a_zero_point) { 9067 TEST_REQUIRES_ARM_NEON_DOT; 9068 for (size_t k = 1; k <= 40; k += 9) { 9069 GemmMicrokernelTester() 9070 .mr(3) 9071 .nr(8) 9072 .kr(4) 9073 .sr(1) 9074 .m(3) 9075 .n(8) 9076 .k(k) 9077 .a_zero_point(0) 9078 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9079 } 9080 } 9081 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,no_b_zero_point)9082 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, no_b_zero_point) { 9083 TEST_REQUIRES_ARM_NEON_DOT; 9084 for (size_t k = 1; k <= 40; k += 9) { 9085 GemmMicrokernelTester() 9086 .mr(3) 9087 .nr(8) 9088 .kr(4) 9089 .sr(1) 9090 .m(3) 9091 .n(8) 9092 .k(k) 9093 .b_zero_point(0) 9094 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9095 } 9096 } 9097 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT,no_zero_point)9098 TEST(QU8_GEMM_MINMAX_RNDNU_3X8C4__NEONDOT, no_zero_point) { 9099 TEST_REQUIRES_ARM_NEON_DOT; 9100 for (size_t k = 1; k <= 40; k += 9) { 9101 GemmMicrokernelTester() 9102 .mr(3) 9103 .nr(8) 9104 .kr(4) 9105 .sr(1) 9106 .m(3) 9107 .n(8) 9108 .k(k) 9109 .a_zero_point(0) 9110 .b_zero_point(0) 9111 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9112 } 9113 } 9114 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 9115 9116 9117 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_eq_8)9118 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_eq_8) { 9119 TEST_REQUIRES_ARM_NEON_DOT; 9120 GemmMicrokernelTester() 9121 .mr(4) 9122 .nr(8) 9123 .kr(4) 9124 .sr(1) 9125 .m(4) 9126 .n(8) 9127 .k(8) 9128 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9129 } 9130 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,strided_cn)9131 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, strided_cn) { 9132 TEST_REQUIRES_ARM_NEON_DOT; 9133 GemmMicrokernelTester() 9134 .mr(4) 9135 .nr(8) 9136 .kr(4) 9137 .sr(1) 9138 .m(4) 9139 .n(8) 9140 .k(8) 9141 .cn_stride(11) 9142 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9143 } 9144 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_eq_8_strided_a)9145 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_eq_8_strided_a) { 9146 TEST_REQUIRES_ARM_NEON_DOT; 9147 GemmMicrokernelTester() 9148 .mr(4) 9149 .nr(8) 9150 .kr(4) 9151 .sr(1) 9152 .m(4) 9153 .n(8) 9154 .k(8) 9155 .a_stride(11) 9156 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9157 } 9158 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_eq_8_subtile)9159 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_eq_8_subtile) { 9160 TEST_REQUIRES_ARM_NEON_DOT; 9161 for (uint32_t n = 1; n <= 8; n++) { 9162 for (uint32_t m = 1; m <= 4; m++) { 9163 GemmMicrokernelTester() 9164 .mr(4) 9165 .nr(8) 9166 .kr(4) 9167 .sr(1) 9168 .m(m) 9169 .n(n) 9170 .k(8) 9171 .iterations(1) 9172 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9173 } 9174 } 9175 } 9176 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_eq_8_subtile_m)9177 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_eq_8_subtile_m) { 9178 TEST_REQUIRES_ARM_NEON_DOT; 9179 for (uint32_t m = 1; m <= 4; m++) { 9180 
GemmMicrokernelTester() 9181 .mr(4) 9182 .nr(8) 9183 .kr(4) 9184 .sr(1) 9185 .m(m) 9186 .n(8) 9187 .k(8) 9188 .iterations(1) 9189 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9190 } 9191 } 9192 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_eq_8_subtile_n)9193 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_eq_8_subtile_n) { 9194 TEST_REQUIRES_ARM_NEON_DOT; 9195 for (uint32_t n = 1; n <= 8; n++) { 9196 GemmMicrokernelTester() 9197 .mr(4) 9198 .nr(8) 9199 .kr(4) 9200 .sr(1) 9201 .m(4) 9202 .n(n) 9203 .k(8) 9204 .iterations(1) 9205 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9206 } 9207 } 9208 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_lt_8)9209 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_lt_8) { 9210 TEST_REQUIRES_ARM_NEON_DOT; 9211 for (size_t k = 1; k < 8; k++) { 9212 GemmMicrokernelTester() 9213 .mr(4) 9214 .nr(8) 9215 .kr(4) 9216 .sr(1) 9217 .m(4) 9218 .n(8) 9219 .k(k) 9220 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9221 } 9222 } 9223 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_lt_8_strided_a)9224 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_lt_8_strided_a) { 9225 TEST_REQUIRES_ARM_NEON_DOT; 9226 for (size_t k = 1; k < 8; k++) { 9227 GemmMicrokernelTester() 9228 .mr(4) 9229 .nr(8) 9230 .kr(4) 9231 .sr(1) 9232 .m(4) 9233 .n(8) 9234 .k(k) 9235 .a_stride(11) 9236 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9237 } 9238 } 9239 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_lt_8_subtile)9240 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_lt_8_subtile) { 9241 TEST_REQUIRES_ARM_NEON_DOT; 9242 for (size_t k = 1; k < 8; k++) { 9243 for (uint32_t n = 1; n <= 8; n++) { 9244 for (uint32_t m = 1; m <= 4; m++) { 9245 GemmMicrokernelTester() 9246 .mr(4) 9247 
.nr(8) 9248 .kr(4) 9249 .sr(1) 9250 .m(m) 9251 .n(n) 9252 .k(k) 9253 .iterations(1) 9254 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9255 } 9256 } 9257 } 9258 } 9259 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_gt_8)9260 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_gt_8) { 9261 TEST_REQUIRES_ARM_NEON_DOT; 9262 for (size_t k = 9; k < 16; k++) { 9263 GemmMicrokernelTester() 9264 .mr(4) 9265 .nr(8) 9266 .kr(4) 9267 .sr(1) 9268 .m(4) 9269 .n(8) 9270 .k(k) 9271 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9272 } 9273 } 9274 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_gt_8_strided_a)9275 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_gt_8_strided_a) { 9276 TEST_REQUIRES_ARM_NEON_DOT; 9277 for (size_t k = 9; k < 16; k++) { 9278 GemmMicrokernelTester() 9279 .mr(4) 9280 .nr(8) 9281 .kr(4) 9282 .sr(1) 9283 .m(4) 9284 .n(8) 9285 .k(k) 9286 .a_stride(19) 9287 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9288 } 9289 } 9290 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_gt_8_subtile)9291 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_gt_8_subtile) { 9292 TEST_REQUIRES_ARM_NEON_DOT; 9293 for (size_t k = 9; k < 16; k++) { 9294 for (uint32_t n = 1; n <= 8; n++) { 9295 for (uint32_t m = 1; m <= 4; m++) { 9296 GemmMicrokernelTester() 9297 .mr(4) 9298 .nr(8) 9299 .kr(4) 9300 .sr(1) 9301 .m(m) 9302 .n(n) 9303 .k(k) 9304 .iterations(1) 9305 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9306 } 9307 } 9308 } 9309 } 9310 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_div_8)9311 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_div_8) { 9312 TEST_REQUIRES_ARM_NEON_DOT; 9313 for (size_t k = 16; k <= 80; k += 8) { 9314 GemmMicrokernelTester() 9315 .mr(4) 9316 .nr(8) 9317 .kr(4) 9318 
.sr(1) 9319 .m(4) 9320 .n(8) 9321 .k(k) 9322 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9323 } 9324 } 9325 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_div_8_strided_a)9326 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_div_8_strided_a) { 9327 TEST_REQUIRES_ARM_NEON_DOT; 9328 for (size_t k = 16; k <= 80; k += 8) { 9329 GemmMicrokernelTester() 9330 .mr(4) 9331 .nr(8) 9332 .kr(4) 9333 .sr(1) 9334 .m(4) 9335 .n(8) 9336 .k(k) 9337 .a_stride(83) 9338 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9339 } 9340 } 9341 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,k_div_8_subtile)9342 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, k_div_8_subtile) { 9343 TEST_REQUIRES_ARM_NEON_DOT; 9344 for (size_t k = 16; k <= 80; k += 8) { 9345 for (uint32_t n = 1; n <= 8; n++) { 9346 for (uint32_t m = 1; m <= 4; m++) { 9347 GemmMicrokernelTester() 9348 .mr(4) 9349 .nr(8) 9350 .kr(4) 9351 .sr(1) 9352 .m(m) 9353 .n(n) 9354 .k(k) 9355 .iterations(1) 9356 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9357 } 9358 } 9359 } 9360 } 9361 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_gt_8)9362 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_gt_8) { 9363 TEST_REQUIRES_ARM_NEON_DOT; 9364 for (uint32_t n = 9; n < 16; n++) { 9365 for (size_t k = 1; k <= 40; k += 9) { 9366 GemmMicrokernelTester() 9367 .mr(4) 9368 .nr(8) 9369 .kr(4) 9370 .sr(1) 9371 .m(4) 9372 .n(n) 9373 .k(k) 9374 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9375 } 9376 } 9377 } 9378 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_gt_8_strided_cn)9379 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_gt_8_strided_cn) { 9380 TEST_REQUIRES_ARM_NEON_DOT; 9381 for (uint32_t n = 9; n < 16; n++) { 9382 for (size_t k = 1; k <= 40; k += 9) { 9383 
GemmMicrokernelTester() 9384 .mr(4) 9385 .nr(8) 9386 .kr(4) 9387 .sr(1) 9388 .m(4) 9389 .n(n) 9390 .k(k) 9391 .cn_stride(11) 9392 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9393 } 9394 } 9395 } 9396 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_gt_8_strided_a)9397 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_gt_8_strided_a) { 9398 TEST_REQUIRES_ARM_NEON_DOT; 9399 for (uint32_t n = 9; n < 16; n++) { 9400 for (size_t k = 1; k <= 40; k += 9) { 9401 GemmMicrokernelTester() 9402 .mr(4) 9403 .nr(8) 9404 .kr(4) 9405 .sr(1) 9406 .m(4) 9407 .n(n) 9408 .k(k) 9409 .a_stride(43) 9410 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9411 } 9412 } 9413 } 9414 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_gt_8_subtile)9415 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_gt_8_subtile) { 9416 TEST_REQUIRES_ARM_NEON_DOT; 9417 for (uint32_t n = 9; n < 16; n++) { 9418 for (size_t k = 1; k <= 40; k += 9) { 9419 for (uint32_t m = 1; m <= 4; m++) { 9420 GemmMicrokernelTester() 9421 .mr(4) 9422 .nr(8) 9423 .kr(4) 9424 .sr(1) 9425 .m(m) 9426 .n(n) 9427 .k(k) 9428 .iterations(1) 9429 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9430 } 9431 } 9432 } 9433 } 9434 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_div_8)9435 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_div_8) { 9436 TEST_REQUIRES_ARM_NEON_DOT; 9437 for (uint32_t n = 16; n <= 24; n += 8) { 9438 for (size_t k = 1; k <= 40; k += 9) { 9439 GemmMicrokernelTester() 9440 .mr(4) 9441 .nr(8) 9442 .kr(4) 9443 .sr(1) 9444 .m(4) 9445 .n(n) 9446 .k(k) 9447 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9448 } 9449 } 9450 } 9451 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_div_8_strided_cn)9452 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, 
n_div_8_strided_cn) { 9453 TEST_REQUIRES_ARM_NEON_DOT; 9454 for (uint32_t n = 16; n <= 24; n += 8) { 9455 for (size_t k = 1; k <= 40; k += 9) { 9456 GemmMicrokernelTester() 9457 .mr(4) 9458 .nr(8) 9459 .kr(4) 9460 .sr(1) 9461 .m(4) 9462 .n(n) 9463 .k(k) 9464 .cn_stride(11) 9465 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9466 } 9467 } 9468 } 9469 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_div_8_strided_a)9470 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_div_8_strided_a) { 9471 TEST_REQUIRES_ARM_NEON_DOT; 9472 for (uint32_t n = 16; n <= 24; n += 8) { 9473 for (size_t k = 1; k <= 40; k += 9) { 9474 GemmMicrokernelTester() 9475 .mr(4) 9476 .nr(8) 9477 .kr(4) 9478 .sr(1) 9479 .m(4) 9480 .n(n) 9481 .k(k) 9482 .a_stride(43) 9483 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9484 } 9485 } 9486 } 9487 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,n_div_8_subtile)9488 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, n_div_8_subtile) { 9489 TEST_REQUIRES_ARM_NEON_DOT; 9490 for (uint32_t n = 16; n <= 24; n += 8) { 9491 for (size_t k = 1; k <= 40; k += 9) { 9492 for (uint32_t m = 1; m <= 4; m++) { 9493 GemmMicrokernelTester() 9494 .mr(4) 9495 .nr(8) 9496 .kr(4) 9497 .sr(1) 9498 .m(m) 9499 .n(n) 9500 .k(k) 9501 .iterations(1) 9502 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9503 } 9504 } 9505 } 9506 } 9507 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,strided_cm_subtile)9508 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, strided_cm_subtile) { 9509 TEST_REQUIRES_ARM_NEON_DOT; 9510 for (size_t k = 1; k <= 40; k += 9) { 9511 for (uint32_t n = 1; n <= 8; n++) { 9512 for (uint32_t m = 1; m <= 4; m++) { 9513 GemmMicrokernelTester() 9514 .mr(4) 9515 .nr(8) 9516 .kr(4) 9517 .sr(1) 9518 .m(m) 9519 .n(n) 9520 .k(k) 9521 .cm_stride(11) 9522 .iterations(1) 9523 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9524 } 9525 } 9526 } 9527 } 9528 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,qmin)9529 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, qmin) { 9530 TEST_REQUIRES_ARM_NEON_DOT; 9531 GemmMicrokernelTester() 9532 .mr(4) 9533 .nr(8) 9534 .kr(4) 9535 .sr(1) 9536 .m(4) 9537 .n(8) 9538 .k(8) 9539 .qmin(128) 9540 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9541 } 9542 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,qmax)9543 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, qmax) { 9544 TEST_REQUIRES_ARM_NEON_DOT; 9545 GemmMicrokernelTester() 9546 .mr(4) 9547 .nr(8) 9548 .kr(4) 9549 .sr(1) 9550 .m(4) 9551 .n(8) 9552 .k(8) 9553 .qmax(128) 9554 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9555 } 9556 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,strided_cm)9557 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, strided_cm) { 9558 TEST_REQUIRES_ARM_NEON_DOT; 9559 GemmMicrokernelTester() 9560 .mr(4) 9561 .nr(8) 9562 .kr(4) 9563 .sr(1) 9564 .m(4) 9565 .n(8) 9566 .k(8) 9567 .cm_stride(11) 9568 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9569 } 9570 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,no_a_zero_point)9571 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, no_a_zero_point) { 9572 TEST_REQUIRES_ARM_NEON_DOT; 9573 for (size_t k = 1; k <= 40; k += 9) { 9574 GemmMicrokernelTester() 9575 .mr(4) 9576 .nr(8) 9577 .kr(4) 9578 .sr(1) 9579 .m(4) 9580 .n(8) 9581 .k(k) 9582 .a_zero_point(0) 9583 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9584 } 9585 } 9586 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,no_b_zero_point)9587 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, 
no_b_zero_point) { 9588 TEST_REQUIRES_ARM_NEON_DOT; 9589 for (size_t k = 1; k <= 40; k += 9) { 9590 GemmMicrokernelTester() 9591 .mr(4) 9592 .nr(8) 9593 .kr(4) 9594 .sr(1) 9595 .m(4) 9596 .n(8) 9597 .k(k) 9598 .b_zero_point(0) 9599 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9600 } 9601 } 9602 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT,no_zero_point)9603 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__NEONDOT, no_zero_point) { 9604 TEST_REQUIRES_ARM_NEON_DOT; 9605 for (size_t k = 1; k <= 40; k += 9) { 9606 GemmMicrokernelTester() 9607 .mr(4) 9608 .nr(8) 9609 .kr(4) 9610 .sr(1) 9611 .m(4) 9612 .n(8) 9613 .k(k) 9614 .a_zero_point(0) 9615 .b_zero_point(0) 9616 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9617 } 9618 } 9619 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 9620 9621 9622 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_eq_8)9623 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_eq_8) { 9624 TEST_REQUIRES_ARM_NEON_DOT; 9625 GemmMicrokernelTester() 9626 .mr(5) 9627 .nr(8) 9628 .kr(4) 9629 .sr(1) 9630 .m(5) 9631 .n(8) 9632 .k(8) 9633 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9634 } 9635 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,strided_cn)9636 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, strided_cn) { 9637 TEST_REQUIRES_ARM_NEON_DOT; 9638 GemmMicrokernelTester() 9639 .mr(5) 9640 .nr(8) 9641 .kr(4) 9642 .sr(1) 9643 .m(5) 9644 .n(8) 9645 .k(8) 9646 .cn_stride(11) 9647 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9648 } 9649 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_eq_8_strided_a)9650 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, 
k_eq_8_strided_a) { 9651 TEST_REQUIRES_ARM_NEON_DOT; 9652 GemmMicrokernelTester() 9653 .mr(5) 9654 .nr(8) 9655 .kr(4) 9656 .sr(1) 9657 .m(5) 9658 .n(8) 9659 .k(8) 9660 .a_stride(11) 9661 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9662 } 9663 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_eq_8_subtile)9664 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_eq_8_subtile) { 9665 TEST_REQUIRES_ARM_NEON_DOT; 9666 for (uint32_t n = 1; n <= 8; n++) { 9667 for (uint32_t m = 1; m <= 5; m++) { 9668 GemmMicrokernelTester() 9669 .mr(5) 9670 .nr(8) 9671 .kr(4) 9672 .sr(1) 9673 .m(m) 9674 .n(n) 9675 .k(8) 9676 .iterations(1) 9677 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9678 } 9679 } 9680 } 9681 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_eq_8_subtile_m)9682 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_eq_8_subtile_m) { 9683 TEST_REQUIRES_ARM_NEON_DOT; 9684 for (uint32_t m = 1; m <= 5; m++) { 9685 GemmMicrokernelTester() 9686 .mr(5) 9687 .nr(8) 9688 .kr(4) 9689 .sr(1) 9690 .m(m) 9691 .n(8) 9692 .k(8) 9693 .iterations(1) 9694 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9695 } 9696 } 9697 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_eq_8_subtile_n)9698 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_eq_8_subtile_n) { 9699 TEST_REQUIRES_ARM_NEON_DOT; 9700 for (uint32_t n = 1; n <= 8; n++) { 9701 GemmMicrokernelTester() 9702 .mr(5) 9703 .nr(8) 9704 .kr(4) 9705 .sr(1) 9706 .m(5) 9707 .n(n) 9708 .k(8) 9709 .iterations(1) 9710 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9711 } 9712 } 9713 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_lt_8)9714 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_lt_8) { 9715 TEST_REQUIRES_ARM_NEON_DOT; 9716 for (size_t k = 1; k < 8; k++) { 9717 
GemmMicrokernelTester() 9718 .mr(5) 9719 .nr(8) 9720 .kr(4) 9721 .sr(1) 9722 .m(5) 9723 .n(8) 9724 .k(k) 9725 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9726 } 9727 } 9728 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_lt_8_strided_a)9729 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_lt_8_strided_a) { 9730 TEST_REQUIRES_ARM_NEON_DOT; 9731 for (size_t k = 1; k < 8; k++) { 9732 GemmMicrokernelTester() 9733 .mr(5) 9734 .nr(8) 9735 .kr(4) 9736 .sr(1) 9737 .m(5) 9738 .n(8) 9739 .k(k) 9740 .a_stride(11) 9741 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9742 } 9743 } 9744 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_lt_8_subtile)9745 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_lt_8_subtile) { 9746 TEST_REQUIRES_ARM_NEON_DOT; 9747 for (size_t k = 1; k < 8; k++) { 9748 for (uint32_t n = 1; n <= 8; n++) { 9749 for (uint32_t m = 1; m <= 5; m++) { 9750 GemmMicrokernelTester() 9751 .mr(5) 9752 .nr(8) 9753 .kr(4) 9754 .sr(1) 9755 .m(m) 9756 .n(n) 9757 .k(k) 9758 .iterations(1) 9759 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9760 } 9761 } 9762 } 9763 } 9764 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_gt_8)9765 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_gt_8) { 9766 TEST_REQUIRES_ARM_NEON_DOT; 9767 for (size_t k = 9; k < 16; k++) { 9768 GemmMicrokernelTester() 9769 .mr(5) 9770 .nr(8) 9771 .kr(4) 9772 .sr(1) 9773 .m(5) 9774 .n(8) 9775 .k(k) 9776 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9777 } 9778 } 9779 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_gt_8_strided_a)9780 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_gt_8_strided_a) { 9781 TEST_REQUIRES_ARM_NEON_DOT; 9782 for (size_t k = 9; k < 16; k++) { 9783 GemmMicrokernelTester() 9784 .mr(5) 9785 .nr(8) 
9786 .kr(4) 9787 .sr(1) 9788 .m(5) 9789 .n(8) 9790 .k(k) 9791 .a_stride(19) 9792 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9793 } 9794 } 9795 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_gt_8_subtile)9796 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_gt_8_subtile) { 9797 TEST_REQUIRES_ARM_NEON_DOT; 9798 for (size_t k = 9; k < 16; k++) { 9799 for (uint32_t n = 1; n <= 8; n++) { 9800 for (uint32_t m = 1; m <= 5; m++) { 9801 GemmMicrokernelTester() 9802 .mr(5) 9803 .nr(8) 9804 .kr(4) 9805 .sr(1) 9806 .m(m) 9807 .n(n) 9808 .k(k) 9809 .iterations(1) 9810 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9811 } 9812 } 9813 } 9814 } 9815 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_div_8)9816 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_div_8) { 9817 TEST_REQUIRES_ARM_NEON_DOT; 9818 for (size_t k = 16; k <= 80; k += 8) { 9819 GemmMicrokernelTester() 9820 .mr(5) 9821 .nr(8) 9822 .kr(4) 9823 .sr(1) 9824 .m(5) 9825 .n(8) 9826 .k(k) 9827 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9828 } 9829 } 9830 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_div_8_strided_a)9831 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_div_8_strided_a) { 9832 TEST_REQUIRES_ARM_NEON_DOT; 9833 for (size_t k = 16; k <= 80; k += 8) { 9834 GemmMicrokernelTester() 9835 .mr(5) 9836 .nr(8) 9837 .kr(4) 9838 .sr(1) 9839 .m(5) 9840 .n(8) 9841 .k(k) 9842 .a_stride(83) 9843 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9844 } 9845 } 9846 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,k_div_8_subtile)9847 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, k_div_8_subtile) { 9848 TEST_REQUIRES_ARM_NEON_DOT; 9849 for (size_t k = 16; k <= 80; k += 8) { 9850 for (uint32_t n = 1; n <= 8; n++) { 9851 for (uint32_t m = 
1; m <= 5; m++) { 9852 GemmMicrokernelTester() 9853 .mr(5) 9854 .nr(8) 9855 .kr(4) 9856 .sr(1) 9857 .m(m) 9858 .n(n) 9859 .k(k) 9860 .iterations(1) 9861 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9862 } 9863 } 9864 } 9865 } 9866 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_gt_8)9867 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_gt_8) { 9868 TEST_REQUIRES_ARM_NEON_DOT; 9869 for (uint32_t n = 9; n < 16; n++) { 9870 for (size_t k = 1; k <= 40; k += 9) { 9871 GemmMicrokernelTester() 9872 .mr(5) 9873 .nr(8) 9874 .kr(4) 9875 .sr(1) 9876 .m(5) 9877 .n(n) 9878 .k(k) 9879 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9880 } 9881 } 9882 } 9883 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_gt_8_strided_cn)9884 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_gt_8_strided_cn) { 9885 TEST_REQUIRES_ARM_NEON_DOT; 9886 for (uint32_t n = 9; n < 16; n++) { 9887 for (size_t k = 1; k <= 40; k += 9) { 9888 GemmMicrokernelTester() 9889 .mr(5) 9890 .nr(8) 9891 .kr(4) 9892 .sr(1) 9893 .m(5) 9894 .n(n) 9895 .k(k) 9896 .cn_stride(11) 9897 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9898 } 9899 } 9900 } 9901 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_gt_8_strided_a)9902 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_gt_8_strided_a) { 9903 TEST_REQUIRES_ARM_NEON_DOT; 9904 for (uint32_t n = 9; n < 16; n++) { 9905 for (size_t k = 1; k <= 40; k += 9) { 9906 GemmMicrokernelTester() 9907 .mr(5) 9908 .nr(8) 9909 .kr(4) 9910 .sr(1) 9911 .m(5) 9912 .n(n) 9913 .k(k) 9914 .a_stride(43) 9915 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9916 } 9917 } 9918 } 9919 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_gt_8_subtile)9920 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_gt_8_subtile) { 9921 
TEST_REQUIRES_ARM_NEON_DOT; 9922 for (uint32_t n = 9; n < 16; n++) { 9923 for (size_t k = 1; k <= 40; k += 9) { 9924 for (uint32_t m = 1; m <= 5; m++) { 9925 GemmMicrokernelTester() 9926 .mr(5) 9927 .nr(8) 9928 .kr(4) 9929 .sr(1) 9930 .m(m) 9931 .n(n) 9932 .k(k) 9933 .iterations(1) 9934 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9935 } 9936 } 9937 } 9938 } 9939 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_div_8)9940 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_div_8) { 9941 TEST_REQUIRES_ARM_NEON_DOT; 9942 for (uint32_t n = 16; n <= 24; n += 8) { 9943 for (size_t k = 1; k <= 40; k += 9) { 9944 GemmMicrokernelTester() 9945 .mr(5) 9946 .nr(8) 9947 .kr(4) 9948 .sr(1) 9949 .m(5) 9950 .n(n) 9951 .k(k) 9952 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9953 } 9954 } 9955 } 9956 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_div_8_strided_cn)9957 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_div_8_strided_cn) { 9958 TEST_REQUIRES_ARM_NEON_DOT; 9959 for (uint32_t n = 16; n <= 24; n += 8) { 9960 for (size_t k = 1; k <= 40; k += 9) { 9961 GemmMicrokernelTester() 9962 .mr(5) 9963 .nr(8) 9964 .kr(4) 9965 .sr(1) 9966 .m(5) 9967 .n(n) 9968 .k(k) 9969 .cn_stride(11) 9970 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9971 } 9972 } 9973 } 9974 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_div_8_strided_a)9975 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_div_8_strided_a) { 9976 TEST_REQUIRES_ARM_NEON_DOT; 9977 for (uint32_t n = 16; n <= 24; n += 8) { 9978 for (size_t k = 1; k <= 40; k += 9) { 9979 GemmMicrokernelTester() 9980 .mr(5) 9981 .nr(8) 9982 .kr(4) 9983 .sr(1) 9984 .m(5) 9985 .n(n) 9986 .k(k) 9987 .a_stride(43) 9988 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 
9989 } 9990 } 9991 } 9992 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,n_div_8_subtile)9993 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, n_div_8_subtile) { 9994 TEST_REQUIRES_ARM_NEON_DOT; 9995 for (uint32_t n = 16; n <= 24; n += 8) { 9996 for (size_t k = 1; k <= 40; k += 9) { 9997 for (uint32_t m = 1; m <= 5; m++) { 9998 GemmMicrokernelTester() 9999 .mr(5) 10000 .nr(8) 10001 .kr(4) 10002 .sr(1) 10003 .m(m) 10004 .n(n) 10005 .k(k) 10006 .iterations(1) 10007 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10008 } 10009 } 10010 } 10011 } 10012 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,strided_cm_subtile)10013 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, strided_cm_subtile) { 10014 TEST_REQUIRES_ARM_NEON_DOT; 10015 for (size_t k = 1; k <= 40; k += 9) { 10016 for (uint32_t n = 1; n <= 8; n++) { 10017 for (uint32_t m = 1; m <= 5; m++) { 10018 GemmMicrokernelTester() 10019 .mr(5) 10020 .nr(8) 10021 .kr(4) 10022 .sr(1) 10023 .m(m) 10024 .n(n) 10025 .k(k) 10026 .cm_stride(11) 10027 .iterations(1) 10028 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10029 } 10030 } 10031 } 10032 } 10033 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,qmin)10034 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, qmin) { 10035 TEST_REQUIRES_ARM_NEON_DOT; 10036 GemmMicrokernelTester() 10037 .mr(5) 10038 .nr(8) 10039 .kr(4) 10040 .sr(1) 10041 .m(5) 10042 .n(8) 10043 .k(8) 10044 .qmin(128) 10045 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10046 } 10047 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,qmax)10048 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, qmax) { 10049 TEST_REQUIRES_ARM_NEON_DOT; 10050 GemmMicrokernelTester() 10051 .mr(5) 10052 .nr(8) 10053 .kr(4) 10054 .sr(1) 10055 .m(5) 10056 .n(8) 10057 .k(8) 10058 .qmax(128) 10059 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10060 } 10061 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,strided_cm)10062 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, strided_cm) { 10063 TEST_REQUIRES_ARM_NEON_DOT; 10064 GemmMicrokernelTester() 10065 .mr(5) 10066 .nr(8) 10067 .kr(4) 10068 .sr(1) 10069 .m(5) 10070 .n(8) 10071 .k(8) 10072 .cm_stride(11) 10073 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10074 } 10075 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,no_a_zero_point)10076 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, no_a_zero_point) { 10077 TEST_REQUIRES_ARM_NEON_DOT; 10078 for (size_t k = 1; k <= 40; k += 9) { 10079 GemmMicrokernelTester() 10080 .mr(5) 10081 .nr(8) 10082 .kr(4) 10083 .sr(1) 10084 .m(5) 10085 .n(8) 10086 .k(k) 10087 .a_zero_point(0) 10088 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10089 } 10090 } 10091 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,no_b_zero_point)10092 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, no_b_zero_point) { 10093 TEST_REQUIRES_ARM_NEON_DOT; 10094 for (size_t k = 1; k <= 40; k += 9) { 10095 GemmMicrokernelTester() 10096 .mr(5) 10097 .nr(8) 10098 .kr(4) 10099 .sr(1) 10100 .m(5) 10101 .n(8) 10102 .k(k) 10103 .b_zero_point(0) 10104 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10105 } 10106 } 10107 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT,no_zero_point)10108 TEST(QU8_GEMM_MINMAX_RNDNU_5X8C4__NEONDOT, no_zero_point) { 10109 TEST_REQUIRES_ARM_NEON_DOT; 10110 for (size_t k = 1; k <= 40; k += 9) { 10111 GemmMicrokernelTester() 10112 .mr(5) 10113 .nr(8) 10114 .kr(4) 10115 .sr(1) 10116 .m(5) 10117 .n(8) 10118 .k(k) 10119 .a_zero_point(0) 10120 .b_zero_point(0) 10121 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10122 } 10123 } 10124 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) 10125 10126 10127 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64) TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_eq_8)10128 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_eq_8) { 10129 TEST_REQUIRES_ARM_NEON_DOT; 10130 GemmMicrokernelTester() 10131 .mr(6) 10132 .nr(16) 10133 .kr(4) 10134 .sr(1) 10135 .m(6) 10136 .n(16) 10137 .k(8) 10138 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10139 } 10140 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,strided_cn)10141 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, strided_cn) { 10142 TEST_REQUIRES_ARM_NEON_DOT; 10143 GemmMicrokernelTester() 10144 .mr(6) 10145 .nr(16) 10146 .kr(4) 10147 .sr(1) 10148 .m(6) 10149 .n(16) 10150 .k(8) 10151 .cn_stride(19) 10152 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10153 } 10154 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_eq_8_strided_a)10155 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_eq_8_strided_a) { 10156 TEST_REQUIRES_ARM_NEON_DOT; 10157 GemmMicrokernelTester() 10158 .mr(6) 10159 .nr(16) 10160 .kr(4) 10161 .sr(1) 10162 .m(6) 10163 .n(16) 10164 .k(8) 10165 .a_stride(11) 10166 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10167 } 10168 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_eq_8_subtile)10169 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_eq_8_subtile) { 10170 TEST_REQUIRES_ARM_NEON_DOT; 10171 for (uint32_t n = 1; n <= 16; n++) { 10172 for (uint32_t m = 1; m <= 6; m++) { 10173 GemmMicrokernelTester() 10174 .mr(6) 10175 .nr(16) 10176 .kr(4) 10177 .sr(1) 10178 .m(m) 10179 .n(n) 
10180 .k(8) 10181 .iterations(1) 10182 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10183 } 10184 } 10185 } 10186 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_eq_8_subtile_m)10187 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_eq_8_subtile_m) { 10188 TEST_REQUIRES_ARM_NEON_DOT; 10189 for (uint32_t m = 1; m <= 6; m++) { 10190 GemmMicrokernelTester() 10191 .mr(6) 10192 .nr(16) 10193 .kr(4) 10194 .sr(1) 10195 .m(m) 10196 .n(16) 10197 .k(8) 10198 .iterations(1) 10199 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10200 } 10201 } 10202 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_eq_8_subtile_n)10203 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_eq_8_subtile_n) { 10204 TEST_REQUIRES_ARM_NEON_DOT; 10205 for (uint32_t n = 1; n <= 16; n++) { 10206 GemmMicrokernelTester() 10207 .mr(6) 10208 .nr(16) 10209 .kr(4) 10210 .sr(1) 10211 .m(6) 10212 .n(n) 10213 .k(8) 10214 .iterations(1) 10215 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10216 } 10217 } 10218 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_lt_8)10219 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_lt_8) { 10220 TEST_REQUIRES_ARM_NEON_DOT; 10221 for (size_t k = 1; k < 8; k++) { 10222 GemmMicrokernelTester() 10223 .mr(6) 10224 .nr(16) 10225 .kr(4) 10226 .sr(1) 10227 .m(6) 10228 .n(16) 10229 .k(k) 10230 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10231 } 10232 } 10233 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_lt_8_strided_a)10234 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_lt_8_strided_a) { 10235 TEST_REQUIRES_ARM_NEON_DOT; 10236 for (size_t k = 1; k < 8; k++) { 10237 GemmMicrokernelTester() 10238 .mr(6) 10239 .nr(16) 10240 .kr(4) 10241 .sr(1) 10242 .m(6) 10243 .n(16) 10244 .k(k) 10245 
.a_stride(11) 10246 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10247 } 10248 } 10249 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_lt_8_subtile)10250 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_lt_8_subtile) { 10251 TEST_REQUIRES_ARM_NEON_DOT; 10252 for (size_t k = 1; k < 8; k++) { 10253 for (uint32_t n = 1; n <= 16; n++) { 10254 for (uint32_t m = 1; m <= 6; m++) { 10255 GemmMicrokernelTester() 10256 .mr(6) 10257 .nr(16) 10258 .kr(4) 10259 .sr(1) 10260 .m(m) 10261 .n(n) 10262 .k(k) 10263 .iterations(1) 10264 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10265 } 10266 } 10267 } 10268 } 10269 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_gt_8)10270 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_gt_8) { 10271 TEST_REQUIRES_ARM_NEON_DOT; 10272 for (size_t k = 9; k < 16; k++) { 10273 GemmMicrokernelTester() 10274 .mr(6) 10275 .nr(16) 10276 .kr(4) 10277 .sr(1) 10278 .m(6) 10279 .n(16) 10280 .k(k) 10281 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10282 } 10283 } 10284 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_gt_8_strided_a)10285 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_gt_8_strided_a) { 10286 TEST_REQUIRES_ARM_NEON_DOT; 10287 for (size_t k = 9; k < 16; k++) { 10288 GemmMicrokernelTester() 10289 .mr(6) 10290 .nr(16) 10291 .kr(4) 10292 .sr(1) 10293 .m(6) 10294 .n(16) 10295 .k(k) 10296 .a_stride(19) 10297 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10298 } 10299 } 10300 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_gt_8_subtile)10301 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_gt_8_subtile) { 10302 TEST_REQUIRES_ARM_NEON_DOT; 10303 for (size_t k = 9; k < 16; k++) { 10304 for (uint32_t n = 1; n <= 16; n++) { 10305 for (uint32_t m = 1; m 
<= 6; m++) { 10306 GemmMicrokernelTester() 10307 .mr(6) 10308 .nr(16) 10309 .kr(4) 10310 .sr(1) 10311 .m(m) 10312 .n(n) 10313 .k(k) 10314 .iterations(1) 10315 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10316 } 10317 } 10318 } 10319 } 10320 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_div_8)10321 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_div_8) { 10322 TEST_REQUIRES_ARM_NEON_DOT; 10323 for (size_t k = 16; k <= 80; k += 8) { 10324 GemmMicrokernelTester() 10325 .mr(6) 10326 .nr(16) 10327 .kr(4) 10328 .sr(1) 10329 .m(6) 10330 .n(16) 10331 .k(k) 10332 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10333 } 10334 } 10335 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_div_8_strided_a)10336 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_div_8_strided_a) { 10337 TEST_REQUIRES_ARM_NEON_DOT; 10338 for (size_t k = 16; k <= 80; k += 8) { 10339 GemmMicrokernelTester() 10340 .mr(6) 10341 .nr(16) 10342 .kr(4) 10343 .sr(1) 10344 .m(6) 10345 .n(16) 10346 .k(k) 10347 .a_stride(83) 10348 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10349 } 10350 } 10351 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,k_div_8_subtile)10352 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, k_div_8_subtile) { 10353 TEST_REQUIRES_ARM_NEON_DOT; 10354 for (size_t k = 16; k <= 80; k += 8) { 10355 for (uint32_t n = 1; n <= 16; n++) { 10356 for (uint32_t m = 1; m <= 6; m++) { 10357 GemmMicrokernelTester() 10358 .mr(6) 10359 .nr(16) 10360 .kr(4) 10361 .sr(1) 10362 .m(m) 10363 .n(n) 10364 .k(k) 10365 .iterations(1) 10366 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10367 } 10368 } 10369 } 10370 } 10371 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,n_gt_16)10372 
// n_gt_16: m fixed at 6, n swept over 17..31, k sampled at 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_gt_16_strided_cn: same sweep as n_gt_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_gt_16_strided_a: same sweep as n_gt_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_gt_16_subtile: n over 17..31, sampled k, and every m in 1..6; one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 17; cols < 32; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 6; ++rows) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n_div_16: m fixed at 6, n taking the values 32 and 48, sampled k.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_div_16_strided_cn: same sweep as n_div_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_div_16_strided_a: same sweep as n_div_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(4).sr(1)
          .m(6).n(cols).k(depth)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n_div_16_subtile: n in {32, 48}, sampled k, and every m in 1..6; one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t cols = 32; cols <= 48; cols += 16) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 6; ++rows) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// strided_cm_subtile: sampled k, every n in 1..16 and every m in 1..6, with
// cm_stride set to 19; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 16; ++cols) {
      for (uint32_t rows = 1; rows <= 6; ++rows) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(4).sr(1)
            .m(rows).n(cols).k(depth)
            .cm_stride(19)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// qmin: single m=6, n=16, k=8 case with qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(6).nr(16).kr(4).sr(1)
      .m(6).n(16).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// qmax: single m=6, n=16, k=8 case with qmax set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(6).nr(16).kr(4).sr(1)
      .m(6).n(16).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,strided_cm)10567 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, strided_cm) { 10568 TEST_REQUIRES_ARM_NEON_DOT; 10569 GemmMicrokernelTester() 10570 .mr(6) 10571 .nr(16) 10572 .kr(4) 10573 .sr(1) 10574 .m(6) 10575 .n(16) 10576 .k(8) 10577 .cm_stride(19) 10578 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10579 } 10580 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,no_a_zero_point)10581 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, no_a_zero_point) { 10582 TEST_REQUIRES_ARM_NEON_DOT; 10583 for (size_t k = 1; k <= 40; k += 9) { 10584 GemmMicrokernelTester() 10585 .mr(6) 10586 .nr(16) 10587 .kr(4) 10588 .sr(1) 10589 .m(6) 10590 .n(16) 10591 .k(k) 10592 .a_zero_point(0) 10593 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10594 } 10595 } 10596 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,no_b_zero_point)10597 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, no_b_zero_point) { 10598 TEST_REQUIRES_ARM_NEON_DOT; 10599 for (size_t k = 1; k <= 40; k += 9) { 10600 GemmMicrokernelTester() 10601 .mr(6) 10602 .nr(16) 10603 .kr(4) 10604 .sr(1) 10605 .m(6) 10606 .n(16) 10607 .k(k) 10608 .b_zero_point(0) 10609 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10610 } 10611 } 10612 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT,no_zero_point)10613 TEST(QU8_GEMM_MINMAX_RNDNU_6X16C4__NEONDOT, no_zero_point) { 10614 TEST_REQUIRES_ARM_NEON_DOT; 10615 for (size_t k = 1; k <= 40; k += 9) { 10616 GemmMicrokernelTester() 10617 .mr(6) 10618 .nr(16) 10619 .kr(4) 10620 .sr(1) 10621 .m(6) 10622 .n(16) 10623 .k(k) 10624 .a_zero_point(0) 10625 .b_zero_point(0) 10626 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10627 } 10628 } 10629 #endif // 
// XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64)


// Tests for xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75.
//
// Every case configures GemmMicrokernelTester with mr=4, nr=16, kr=1, sr=1 and
// runs the kernel with xnn_init_qu8_conv_minmax_rndnu_neon_params and
// xnn_qu8_requantize_rndnu. Cases differ only in the m/n/k shapes, strides,
// clamping bounds and zero points they set.
// NOTE(review): TEST_REQUIRES_ARM_NEON is defined outside this file
// (presumably in xnnpack/isa-checks.h); it looks like a NEON availability
// guard -- confirm against that header.
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  // Full 4x16 tile with k == 8.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_eq_8) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, cn_stride = 19 (greater than nr = 16).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .cn_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, a_stride = 11 (greater than k).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_eq_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // k == 8 over every m in [1, 4] and n in [1, 16]; one iteration per shape.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_eq_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // k == 8, n == 16, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_eq_8_subtile_m) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(m).n(16).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // k == 8, m == 4, every n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_eq_8_subtile_n) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 16; n++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [1, 7].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_lt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [1, 7], a_stride = 11.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_lt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Every k in [1, 7] crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_lt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, every k in [9, 15].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_gt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [9, 15], a_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_gt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Every k in [9, 15] crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_gt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, k = 16, 24, ..., 80 (multiples of 8).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_div_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 16, 24, ..., 80, a_stride = 83.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_div_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // k = 16, 24, ..., 80 crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, k_div_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // m == 4, every n in [17, 31], k = 1, 10, ..., 37.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_gt_16) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_gt_16, with cn_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_gt_16_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_gt_16, with a_stride = 43.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_gt_16_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Every n in [17, 31], k = 1, 10, ..., 37, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_gt_16_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // m == 4, n = 32 and 48 (multiples of nr), k = 1, 10, ..., 37.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_div_16) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_div_16, with cn_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_div_16_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_div_16, with a_stride = 43.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_div_16_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // n = 32 and 48, k = 1, 10, ..., 37, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, n_div_16_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // cm_stride = 19 over k = 1, 10, ..., 37 and every m in [1, 4], n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, strided_cm_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .cm_stride(19)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, k == 8, qmin set to 128.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, qmin) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, qmax set to 128.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, qmax) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, cm_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, strided_cm) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .cm_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k = 1, 10, ..., 37, a_zero_point forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, no_a_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 1, 10, ..., 37, b_zero_point forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, no_b_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 1, 10, ..., 37, both zero points forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A75, no_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

// Tests for xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53.
//
// Every case configures GemmMicrokernelTester with mr=4, nr=16, kr=1, sr=1 and
// runs the kernel with xnn_init_qu8_conv_minmax_rndnu_neon_params and
// xnn_qu8_requantize_rndnu. Cases differ only in the m/n/k shapes, strides,
// clamping bounds and zero points they set.
// NOTE(review): TEST_REQUIRES_ARM_NEON is defined outside this file
// (presumably in xnnpack/isa-checks.h); it looks like a NEON availability
// guard -- confirm against that header.
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  // Full 4x16 tile with k == 8.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, cn_stride = 19 (greater than nr = 16).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .cn_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, a_stride = 11 (greater than k).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // k == 8 over every m in [1, 4] and n in [1, 16]; one iteration per shape.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // k == 8, n == 16, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(m).n(16).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // k == 8, m == 4, every n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 16; n++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [1, 7].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [1, 7], a_stride = 11.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Every k in [1, 7] crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 8; k++) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, every k in [9, 15].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, every k in [9, 15], a_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Every k in [9, 15] crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 9; k < 16; k++) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, k = 16, 24, ..., 80 (multiples of 8).
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 16, 24, ..., 80, a_stride = 83.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // k = 16, 24, ..., 80 crossed with every m in [1, 4] and n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 16; k <= 80; k += 8) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // m == 4, every n in [17, 31], k = 1, 10, ..., 37.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_gt_16, with cn_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_gt_16, with a_stride = 43.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Every n in [17, 31], k = 1, 10, ..., 37, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 17; n < 32; n++) {
      for (size_t k = 1; k <= 40; k += 9) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // m == 4, n = 32 and 48 (multiples of nr), k = 1, 10, ..., 37.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_div_16, with cn_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // Same shapes as n_div_16, with a_stride = 43.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }

  // n = 32 and 48, k = 1, 10, ..., 37, every m in [1, 4].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 40; k += 9) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // cm_stride = 19 over k = 1, 10, ..., 37 and every m in [1, 4], n in [1, 16].
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 4; m++) {
          GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .cm_stride(19)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
                  xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
        }
      }
    }
  }

  // Full tile, k == 8, qmin set to 128.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, qmax set to 128.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k == 8, cm_stride = 19.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(8)
      .cm_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
            xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }

  // Full tile, k = 1, 10, ..., 37, a_zero_point forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_a_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 1, 10, ..., 37, b_zero_point forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_b_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }

  // Full tile, k = 1, 10, ..., 37, both zero points forced to 0.
  TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, no_zero_point) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
              xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY