/*
 * Copyright © 2024, Arm Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS      32, lsl #8   // 8192
#define PREP_BIAS_NEG 224, lsl #8   // -8192

#if HAVE_SVE2
ENABLE_SVE
ENABLE_SVE2

// No spaces in these expressions, due to gas-preprocessor. The packed values
// are biased by -1 so that the negative offset is already folded in when the
// address into `mc_subpel_filters` is computed.
#define REGULAR1 (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1  (((1*15-1)<<7)|(4*15-1))
#define SHARP1   (((2*15-1)<<7)|(3*15-1))

#define FUNC_ALIGN 2
#define JUMP_ALIGN 2
#define LOOP_ALIGN 2


// Shuffle indices to permute horizontal samples in preparation for input to
// 16-bit SDOT instructions. The 8-tap horizontal convolution uses sample
// indices in the interval [-3, 4] relative to the current sample position.
const h_tbl_sve, align=4
        .byte  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
        .byte  4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13
endconst

// Vertical convolutions also use 16-bit SDOT instructions, where two 128-bit
// registers contain a transposed 4x4 matrix of values. Subsequent iterations
// of the vertical convolution can reuse the 3x4 sub-matrix from the previous
// loop iteration. These shuffle indices shift and merge this 4x4 matrix with
// the values of a new line.
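// Illustrative note on the table layout (assuming 16-bit elements in a
// 32-byte TBL source made of two q registers): the first row of indices
// below selects bytes {2-7, 16, 17} and {10-15, 24, 25}, so each 64-bit half
// drops its oldest sample, shifts the remaining three down and appends one
// sample from the second source register; the remaining rows follow the same
// pattern but append different lanes of the second source (the newly loaded
// row).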
const v_tbl_sve, align=4
        .byte  2,  3,  4,  5,  6,  7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25
        .byte  2,  3,  4,  5,  6,  7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19
        .byte  2,  3,  4,  5,  6,  7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23
        .byte  2,  3,  4,  5,  6,  7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27
        .byte  2,  3,  4,  5,  6,  7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31
endconst


.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_16bpc_\isa, export=1, align=FUNC_ALIGN
        mov     x9,  \type_h
        mov     x10, \type_v
.if \jump
        b       \op\()_8tap_\isa
.endif
endfunc
.endm

.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, xmx, xmy, ldst, lsrc, wd_strd, ws_strd
make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0

function \type\()_8tap_\isa, align=FUNC_ALIGN
        clz     w8,  \w
        mov     w11, #0x4081            // (1<<14) | (1<<7) | 1
        ptrue   p0.b, vl16
        sub     w8,  w8, #24            // for jump tables
        movrel  x12, X(mc_subpel_filters)
        cbnz    \mx, L(\type\()_8tap_h_hv_\isa)
.ifc \type, prep
        cbz     \my, prep_sve
.else  // put
        cbnz    \my, L(\type\()_8tap_v_\isa)
        mov     w9,  w8
        b       X(put_16bpc_neon)

        .align JUMP_ALIGN
.endif

L(\type\()_8tap_v_\isa):
        madd    \my, \my, w11, w10
        movrel  x13, v_tbl_sve
.ifc \bdmax, w8                         // put case, but skip
        ld1r    {v5.8h}, [sp]           // loading into w8
.endif
        sub     \src, \src, \s_strd     // src - s_strd
        ubfx    w11, \my, #7, #7
        and     \my, \my, #0x7F
        ldr     q6,  [x13]
        cmp     \h,  #4
        csel    \my, \my, w11, le
        sub     \src, \src, \s_strd, lsl #1     // src - 3 * s_strd
        add     \xmy, x12, \xmy, lsl #3         // subpel V filter address
        ldp     q28, q29, [x13, #16]
        ld1sb   {z7.h}, p0/z, [\xmy]
.ifc \type, prep
        clz     \bdmax, \bdmax
        sub     \bdmax, \bdmax, #24
        dup     v5.4s, \bdmax
.endif
        cmp     \w,  #8
        b.lt    40f

        // .align JUMP_ALIGN // fallthrough
80:     // V - 8xN+
        ldp     q30, q31, [x13, #48]
.ifc \type, prep
        add     \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        .align LOOP_ALIGN
81:
        add     \lsrc, \src, \s_strd, lsl #1

        ldr     q16, [\src]
        ldr     q17, [\src, \s_strd]
        ldr     q18, [\lsrc]
        ldr     q19, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        mov     \ldst, \dst

        ldr     q20, [\lsrc]
        ldr     q21, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        ldr     q22, [\lsrc]
        ldr     q23, [\lsrc, \s_strd]
        add     \lsrc, \lsrc, \s_strd, lsl #1
        sub     w8,  \h, #1

        zip1    v0.8h,  v16.8h, v17.8h
        zip2    v1.8h,  v16.8h, v17.8h
        zip1    v2.8h,  v18.8h, v19.8h
        zip2    v3.8h,  v18.8h, v19.8h

        zip1    v18.8h, v20.8h, v21.8h
        zip2    v21.8h, v20.8h, v21.8h
        zip1    v24.8h, v22.8h, v23.8h
        zip2    v27.8h, v22.8h, v23.8h

        zip1    v16.4s, v0.4s,  v2.4s
        zip2    v19.4s, v0.4s,  v2.4s
        zip1    v22.4s, v1.4s,  v3.4s
        zip2    v25.4s, v1.4s,  v3.4s

        zip1    v17.4s, v18.4s, v24.4s
        zip2    v20.4s, v18.4s, v24.4s
        zip1    v23.4s, v21.4s, v27.4s
        zip2    v26.4s, v21.4s, v27.4s

        .align LOOP_ALIGN
8:
        ld1     {v18.16b}, [\lsrc], \s_strd

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
        mov     v21.16b, v18.16b
        mov     v24.16b, v18.16b
        mov     v27.16b, v18.16b

        sdot    z0.d, z16.h, z7.h[0]
        tbl     v16.16b, {v16.16b, v17.16b}, v6.16b
        sdot    z1.d, z19.h, z7.h[0]
        tbl     v19.16b, {v19.16b, v20.16b}, v6.16b
        sdot    z2.d, z22.h, z7.h[0]
        tbl     v22.16b, {v22.16b, v23.16b}, v6.16b
        subs    w8,  w8, #1
        sdot    z3.d, z25.h, z7.h[0]
        tbl     v25.16b, {v25.16b, v26.16b}, v6.16b

        sdot    z0.d, z17.h, z7.h[1]
        tbl     v17.16b, {v17.16b, v18.16b}, v28.16b
        sdot    z1.d, z20.h, z7.h[1]
        tbl     v20.16b, {v20.16b, v21.16b}, v29.16b
        sdot    z2.d, z23.h, z7.h[1]
        tbl     v23.16b, {v23.16b, v24.16b}, v30.16b
        sdot    z3.d, z26.h, z7.h[1]
        tbl     v26.16b, {v26.16b, v27.16b}, v31.16b

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun2 v0.8h, v1.4s, #6
        umin    v0.8h, v0.8h, v5.8h
.endif
        st1     {v0.16b}, [\ldst], \d_strd
        b.gt    8b

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0

        sdot    z0.d, z16.h, z7.h[0]
        sdot    z1.d, z19.h, z7.h[0]
        sdot    z2.d, z22.h, z7.h[0]
        sdot    z3.d, z25.h, z7.h[0]

        sdot    z0.d, z17.h, z7.h[1]
        sdot    z1.d, z20.h, z7.h[1]
        sdot    z2.d, z23.h, z7.h[1]
        sdot    z3.d, z26.h, z7.h[1]
        subs    \w,  \w, #8

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun2 v0.8h, v1.4s, #6
        umin    v0.8h, v0.8h, v5.8h
.endif
        str     q0, [\ldst]

        add     \dst, \dst, #16
        add     \src, \src, #16
        b.gt    81b
        ret

        .align JUMP_ALIGN
40:     // V - 4xN, put only: 2xN
.ifc \type, put
        lsr     \d_strd, \d_strd, #1    // hword index for `st1h`
        whilelt p1.h, wzr, \w           // masking for writes
.endif
        cmp     \h,  #4
        b.le    44f

        ldr     d16, [\src]
        ldr     d17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d18, [\src]
        ldr     d19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        ldr     d20, [\src]
        ldr     d21, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d22, [\src]
        ldr     d23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        sub     \h,  \h, #2

        zip1    v0.8h,  v16.8h, v17.8h
        zip1    v2.8h,  v18.8h, v19.8h
        zip1    v18.8h, v20.8h, v21.8h
        zip1    v24.8h, v22.8h, v23.8h

        zip1    v16.4s, v0.4s,  v2.4s
        zip2    v19.4s, v0.4s,  v2.4s
        zip1    v17.4s, v18.4s, v24.4s
        zip2    v20.4s, v18.4s, v24.4s

        .align LOOP_ALIGN
4:
        ldr     d18, [\src]
        ldr     d24, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
        mov     v21.16b, v18.16b
        mov     v27.16b, v24.16b

        sdot    z0.d, z16.h, z7.h[0]
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        sdot    z1.d, z19.h, z7.h[0]
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        sdot    z0.d, z17.h, z7.h[1]
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        sdot    z1.d, z20.h, z7.h[1]
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b
        subs    \h,  \h, #2

        sdot    z2.d, z22.h, z7.h[0]
        tbl     v16.16b, {v22.16b, v23.16b}, v6.16b
        sdot    z3.d, z25.h, z7.h[0]
        tbl     v19.16b, {v25.16b, v26.16b}, v6.16b
        sdot    z2.d, z23.h, z7.h[1]
        tbl     v17.16b, {v23.16b, v24.16b}, v28.16b
        sdot    z3.d, z26.h, z7.h[1]
        tbl     v20.16b, {v26.16b, v27.16b}, v29.16b
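// Note: the intermediate sums here fit in the low 32 bits of the 64-bit SDOT
// accumulators, so the 32-bit UZP1 below packs each accumulator pair into a
// single vector before the rounding/narrowing step.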

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
        str     q0, [\dst], #16
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun  v1.4h, v1.4s, #6
        umin    v0.4h, v0.4h, v5.4h
        umin    v1.4h, v1.4h, v5.4h
        st1h    {z0.h}, p1, [\dst]
        st1h    {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add     \dst, \dst, \d_strd, lsl #2
.endif
        b.gt    4b

        ldr     d18, [\src]

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
        mov     v21.16b, v18.16b

        sdot    z0.d, z16.h, z7.h[0]
        tbl     v22.16b, {v16.16b, v17.16b}, v6.16b
        sdot    z1.d, z19.h, z7.h[0]
        tbl     v25.16b, {v19.16b, v20.16b}, v6.16b
        sdot    z0.d, z17.h, z7.h[1]
        tbl     v23.16b, {v17.16b, v18.16b}, v28.16b
        sdot    z1.d, z20.h, z7.h[1]
        tbl     v26.16b, {v20.16b, v21.16b}, v29.16b

        sdot    z2.d, z22.h, z7.h[0]
        sdot    z3.d, z25.h, z7.h[0]
        sdot    z2.d, z23.h, z7.h[1]
        sdot    z3.d, z26.h, z7.h[1]

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
        str     q0, [\dst]
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun  v1.4h, v1.4s, #6
        umin    v0.4h, v0.4h, v5.4h
        umin    v1.4h, v1.4h, v5.4h
        st1h    {z0.h}, p1, [\dst]
        st1h    {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
        ret

        .align JUMP_ALIGN
44:     // V - 4x4, put only: 4x2, 2x4, 2x2
        add     \src, \src, \s_strd, lsl #1     // src - s_strd
        subs    \h,  \h, #2

        ldr     d16, [\src]
        ldr     d17, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1
        ldr     d18, [\src]
        ldr     d19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        ext     v7.16b, v7.16b, v7.16b, #4      // [\xmy + 2 * 2]

        zip1    v0.8h,  v16.8h, v17.8h
        zip1    v2.8h,  v18.8h, v19.8h
        zip1    v16.4s, v0.4s,  v2.4s
        zip2    v19.4s, v0.4s,  v2.4s

.ifc \type, put
        b.eq    42f
.endif
        ldr     d17, [\src]
        ldr     d23, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
        mov     v20.16b, v17.16b
        mov     v26.16b, v23.16b

        sdot    z0.d, z16.h, z7.h[0]
        tbl     v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot    z1.d, z19.h, z7.h[0]
        tbl     v25.16b, {v19.16b, v20.16b}, v29.16b
        sdot    z2.d, z22.h, z7.h[0]
        tbl     v16.16b, {v22.16b, v23.16b}, v28.16b
        sdot    z3.d, z25.h, z7.h[0]
        tbl     v19.16b, {v25.16b, v26.16b}, v29.16b

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
        str     q0, [\dst], #16
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun  v1.4h, v1.4s, #6
        umin    v0.4h, v0.4h, v5.4h
        umin    v1.4h, v1.4h, v5.4h
        st1h    {z0.h}, p1, [\dst]
        st1h    {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add     \dst, \dst, \d_strd, lsl #2
.endif

.ifc \type, put
        .align JUMP_ALIGN
42:
.endif
        ldr     d17, [\src]

        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
        mov     v20.16b, v17.16b

        sdot    z0.d, z16.h, z7.h[0]
        tbl     v22.16b, {v16.16b, v17.16b}, v28.16b
        sdot    z1.d, z19.h, z7.h[0]
        tbl     v25.16b, {v19.16b, v20.16b}, v29.16b

        sdot    z2.d, z22.h, z7.h[0]
        sdot    z3.d, z25.h, z7.h[0]

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
        str     q0, [\dst]
.else  // put
        sqrshrun  v0.4h, v0.4s, #6
        sqrshrun  v1.4h, v1.4s, #6
        umin    v0.4h, v0.4h, v5.4h
        umin    v1.4h, v1.4h, v5.4h
        st1h    {z0.h}, p1, [\dst]
        st1h    {z1.h}, p1, [\dst, \d_strd, lsl #1]
.endif
        ret

        .align JUMP_ALIGN
L(\type\()_8tap_h_hv_\isa):
        madd    \mx, \mx, w11, w9
        movrel  x13, h_tbl_sve
        sub     \src, \src, #6          // src - 3 * 2
        ubfx    w9,  \mx, #7, #7
        and     \mx, \mx, #0x7F
        cmp     \w,  #4
        csel    \mx, \mx, w9, le
        ldp     q30, q31, [x13]
        add     \xmx, x12, \xmx, lsl #3 // subpel H filter address
        cbz     \my, L(\type\()_8tap_h_\isa)

        // HV cases
        madd    w14, \my, w11, w10
.ifc \bdmax, w8
        ldr     \bdmax, [sp]
.endif
        ubfx    w11, w14, #7, #7
        and     w14, w14, #0x7F
        ld1sb   {z4.h}, p0/z, [\xmx]
        cmp     \h,  #4
        csel    w14, w14, w11, le
.ifc \type, put
        dup     v29.8h, \bdmax
.endif
        clz     \bdmax, \bdmax
        add     \xmy, x12, x14, lsl #3  // subpel V filter address
        ld1sb   {z7.h}, p0/z, [\xmy]
.ifc \type, put
        mov     w9,  #12
        sub     w9,  w9, \bdmax
        dup     v6.4s, w9
.endif
        sub     \bdmax, \bdmax, #24
        mov     x15, x30
        sub     \src, \src, \s_strd     // src - s_strd - 3 * 2
        dup     v5.4s, \bdmax
        cmp     w10, SHARP1
        b.ne    L(\type\()_6tap_hv_\isa)        // vertical != SHARP1

        // HV 8-tap cases
        cmp     \w,  #4
        b.le    40f

        // .align JUMP_ALIGN // fallthrough
80:     // HV8 - 8xN+
.ifc \type, prep
        add     \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        cmp     \h,  #4
        b.le    84f
        sub     \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8,  \h

        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v16.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v17.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v18.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v19.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v20.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v21.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v22.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
8:
        ldp     q24, q28, [\lsrc]
        smull   v0.4s, v16.4h, v7.h[0]
        smull2  v1.4s, v16.8h, v7.h[0]
        mov     v16.16b, v17.16b

        movi    v2.2d, #0
        movi    v3.2d, #0
        tbl     v23.16b, {v24.16b}, v30.16b
        tbl     v24.16b, {v24.16b}, v31.16b

        ldur    q26, [\lsrc, #8]
        smlal   v0.4s, v17.4h, v7.h[1]
        smlal2  v1.4s, v17.8h, v7.h[1]
        mov     v17.16b, v18.16b
        add     \lsrc, \lsrc, \s_strd

        sdot    z2.d, z23.h, z4.h[0]
        sdot    z3.d, z24.h, z4.h[0]
        movi    v23.2d, #0
        movi    v24.2d, #0
        tbl     v25.16b, {v26.16b}, v30.16b
        tbl     v26.16b, {v26.16b}, v31.16b
        smlal   v0.4s, v18.4h, v7.h[2]
        smlal2  v1.4s, v18.8h, v7.h[2]
        mov     v18.16b, v19.16b

        sdot    z23.d, z25.h, z4.h[0]
        sdot    z24.d, z26.h, z4.h[0]
        tbl     v27.16b, {v28.16b}, v30.16b
        tbl     v28.16b, {v28.16b}, v31.16b
        smlal   v0.4s, v19.4h, v7.h[3]
        smlal2  v1.4s, v19.8h, v7.h[3]
        mov     v19.16b, v20.16b

        subs    w8,  w8, #1
        sdot    z2.d,  z25.h, z4.h[1]
        sdot    z3.d,  z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]
        sdot    z24.d, z28.h, z4.h[1]

        smlal   v0.4s, v20.4h, v7.h[4]
        smlal2  v1.4s, v20.8h, v7.h[4]
        mov     v20.16b, v21.16b

        uzp1    v3.4s,  v2.4s,  v3.4s
        uzp1    v24.4s, v23.4s, v24.4s
        smlal   v0.4s, v21.4h, v7.h[5]
        smlal2  v1.4s, v21.8h, v7.h[5]
        mov     v21.16b, v22.16b

        srshl   v23.4s, v3.4s, v5.4s
        srshl   v24.4s, v24.4s, v5.4s
        smlal   v0.4s, v22.4h, v7.h[6]
        smlal2  v1.4s, v22.8h, v7.h[6]

        uzp1    v22.8h, v23.8h, v24.8h
        smlal   v0.4s, v22.4h, v7.h[7]
        smlal2  v1.4s, v22.8h, v7.h[7]

.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
.else  // put
        srshl   v0.4s, v0.4s, v6.4s
        srshl   v1.4s, v1.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        sqxtun2 v0.8h, v1.4s
        umin    v0.8h, v0.8h, v29.8h
.endif
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b

        subs    \w,  \w, #8
        add     \src, \src, #16
        add     \dst, \dst, #16
        b.gt    81b
        ret     x15

        .align JUMP_ALIGN
40:     // HV8 - 4xN, put only: 2xN
.ifc \type, put
        lsr     \d_strd, \d_strd, #1    // hword index for `st1h`
        whilelt p1.h, wzr, \w           // masking for writes
.endif
        ext     v4.16b, v4.16b, v4.16b, #4      // [\xmy + 2 * 2]
        add     \src, \src, #4

        cmp     \h,  #4
        b.le    44f

        sub     \src, \src, \s_strd, lsl #1     // src - 3 * s_strd - 3 * 2
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v16.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v17.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v18.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v19.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v20.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v21.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v22.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1     {v3.16b}, [\src], \s_strd

        smull   v24.4s, v16.4h, v7.h[0]
        smlal   v24.4s, v17.4h, v7.h[1]
        tbl     v2.16b, {v3.16b}, v30.16b
        tbl     v3.16b, {v3.16b}, v31.16b
        movi    v0.2d, #0
        movi    v1.2d, #0
        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b

        smlal   v24.4s, v18.4h, v7.h[2]
        smlal   v24.4s, v19.4h, v7.h[3]
        sdot    z0.d, z2.h, z4.h[0]
        sdot    z1.d, z3.h, z4.h[0]
        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b
        uzp1    v0.4s, v0.4s, v1.4s

        smlal   v24.4s, v20.4h, v7.h[4]
        smlal   v24.4s, v21.4h, v7.h[5]
        srshl   v0.4s, v0.4s, v5.4s
        mov     v20.16b, v21.16b
        mov     v21.16b, v22.16b

        subs    \h,  \h, #1
        smlal   v24.4s, v22.4h, v7.h[6]
        xtn     v22.4h, v0.4s
        smlal   v24.4s, v22.4h, v7.h[7]

.ifc \type, prep
        rshrn   v0.4h, v24.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
        str     d0, [\dst], #8
.else  // put
        srshl   v0.4s, v24.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        umin    v0.4h, v0.4h, v29.4h
        st1h    {z0.h}, p1, [\dst]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b
        ret     x15

        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        cmp     \w,  #4
        b.le    46f

        // .align JUMP_ALIGN // fallthrough
80:     // HV6 - 8xN+
.ifc \type, prep
        add     \wd_strd, \w, \w        // d_strd = 2 * w
.endif
        cmp     \h,  #4
        b.le    84f
        sub     \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2

        .align LOOP_ALIGN
81:
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8,  \h

        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v16.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v17.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v18.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v19.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v20.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
8:
        ldp     q24, q28, [\lsrc]

        smull   v0.4s, v16.4h, v7.h[1]
        smull2  v1.4s, v16.8h, v7.h[1]
        mov     v16.16b, v17.16b

        tbl     v23.16b, {v24.16b}, v30.16b
        tbl     v24.16b, {v24.16b}, v31.16b
        movi    v2.2d, #0
        movi    v3.2d, #0

        ldur    q26, [\lsrc, #8]
        add     \lsrc, \lsrc, \s_strd

        sdot    z2.d, z23.h, z4.h[0]
        sdot    z3.d, z24.h, z4.h[0]
        tbl     v25.16b, {v26.16b}, v30.16b
        tbl     v26.16b, {v26.16b}, v31.16b
        movi    v23.2d, #0
        movi    v24.2d, #0

        sdot    z23.d, z25.h, z4.h[0]
        sdot    z24.d, z26.h, z4.h[0]
        tbl     v27.16b, {v28.16b}, v30.16b
        tbl     v28.16b, {v28.16b}, v31.16b
        smlal   v0.4s, v17.4h, v7.h[2]
        smlal2  v1.4s, v17.8h, v7.h[2]
        mov     v17.16b, v18.16b

        sdot    z2.d,  z25.h, z4.h[1]
        sdot    z3.d,  z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]
        sdot    z24.d, z28.h, z4.h[1]

        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b

        uzp1    v3.4s,  v2.4s,  v3.4s
        uzp1    v24.4s, v23.4s, v24.4s
        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]
        mov     v19.16b, v20.16b

        srshl   v23.4s, v3.4s,  v5.4s
        srshl   v24.4s, v24.4s, v5.4s
        smlal   v0.4s, v20.4h, v7.h[5]
        smlal2  v1.4s, v20.8h, v7.h[5]

        subs    w8,  w8, #1
        uzp1    v20.8h, v23.8h, v24.8h
        smlal   v0.4s, v20.4h, v7.h[6]
        smlal2  v1.4s, v20.8h, v7.h[6]

.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
.else  // put
        srshl   v0.4s, v0.4s, v6.4s
        srshl   v1.4s, v1.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        sqxtun2 v0.8h, v1.4s
        umin    v0.8h, v0.8h, v29.8h
.endif
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    8b

        add     \dst, \dst, #16
        subs    \w,  \w, #8
        add     \src, \src, #16
        b.gt    81b
        ret     x15

        .align LOOP_ALIGN
84:     // HV4 - 8x4, 8x2
        mov     \lsrc, \src
        mov     \ldst, \dst
        mov     w8,  \h

        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v17.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v18.8h, v23.8h, v24.8h
        bl      L(\type\()_hv_filter8_\isa)
        uzp1    v19.8h, v23.8h, v24.8h

        .align LOOP_ALIGN
81:
        ldp     q24, q28, [\lsrc]
        ldur    q26, [\lsrc, #8]
        add     \lsrc, \lsrc, \s_strd

        tbl     v23.16b, {v24.16b}, v30.16b
        tbl     v24.16b, {v24.16b}, v31.16b
        movi    v2.2d, #0
        movi    v3.2d, #0
        sdot    z2.d, z23.h, z4.h[0]
        sdot    z3.d, z24.h, z4.h[0]

        tbl     v25.16b, {v26.16b}, v30.16b
        tbl     v26.16b, {v26.16b}, v31.16b
        movi    v23.2d, #0
        movi    v24.2d, #0
        sdot    z23.d, z25.h, z4.h[0]
        sdot    z24.d, z26.h, z4.h[0]

        tbl     v27.16b, {v28.16b}, v30.16b
        tbl     v28.16b, {v28.16b}, v31.16b
        sdot    z2.d,  z25.h, z4.h[1]
        sdot    z3.d,  z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]
        sdot    z24.d, z28.h, z4.h[1]

        smull   v0.4s, v17.4h, v7.h[2]
        smull2  v1.4s, v17.8h, v7.h[2]
        mov     v17.16b, v18.16b

        subs    w8,  w8, #1
        uzp1    v3.4s,  v2.4s,  v3.4s
        uzp1    v24.4s, v23.4s, v24.4s
        smlal   v0.4s, v18.4h, v7.h[3]
        smlal2  v1.4s, v18.8h, v7.h[3]
        mov     v18.16b, v19.16b

        srshl   v23.4s, v3.4s,  v5.4s
        srshl   v24.4s, v24.4s, v5.4s
        smlal   v0.4s, v19.4h, v7.h[4]
        smlal2  v1.4s, v19.8h, v7.h[4]

        uzp1    v19.8h, v23.8h, v24.8h
        smlal   v0.4s, v19.4h, v7.h[5]
        smlal2  v1.4s, v19.8h, v7.h[5]

.ifc \type, prep
        rshrn   v0.4h, v0.4s, #6
        rshrn2  v0.8h, v1.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
.else  // put
        srshl   v0.4s, v0.4s, v6.4s
        srshl   v1.4s, v1.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        sqxtun2 v0.8h, v1.4s
        umin    v0.8h, v0.8h, v29.8h
.endif
        st1     {v0.8h}, [\ldst], \d_strd
        b.gt    81b

        subs    \w,  \w, #8
        add     \dst, \dst, #16
        add     \src, \src, #16
        b.gt    84b
        ret     x15

        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        ldp     q24, q28, [\lsrc]
        ldur    q26, [\lsrc, #8]
        add     \lsrc, \lsrc, \s_strd

        tbl     v23.16b, {v24.16b}, v30.16b
        tbl     v24.16b, {v24.16b}, v31.16b
        movi    v2.2d, #0
        movi    v3.2d, #0
        sdot    z2.d, z23.h, z4.h[0]
        sdot    z3.d, z24.h, z4.h[0]

        tbl     v25.16b, {v26.16b}, v30.16b
        tbl     v26.16b, {v26.16b}, v31.16b
        movi    v23.2d, #0
        movi    v24.2d, #0
        sdot    z23.d, z25.h, z4.h[0]
        sdot    z24.d, z26.h, z4.h[0]

        tbl     v27.16b, {v28.16b}, v30.16b
        tbl     v28.16b, {v28.16b}, v31.16b
        sdot    z2.d,  z25.h, z4.h[1]
        sdot    z3.d,  z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]
        sdot    z24.d, z28.h, z4.h[1]

        uzp1    v3.4s,  v2.4s,  v3.4s
        uzp1    v24.4s, v23.4s, v24.4s
        srshl   v23.4s, v3.4s,  v5.4s
        srshl   v24.4s, v24.4s, v5.4s
        ret

        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        ld1     {v3.16b}, [\src], \s_strd

        tbl     v2.16b, {v3.16b}, v30.16b
        tbl     v3.16b, {v3.16b}, v31.16b
        movi    v0.2d, #0
        movi    v1.2d, #0
        sdot    z0.d, z2.h, z4.h[0]
        sdot    z1.d, z3.h, z4.h[0]

        uzp1    v0.4s, v0.4s, v1.4s
        srshl   v0.4s, v0.4s, v5.4s
        ret

        .align JUMP_ALIGN
46:     // H4V6 - 4xN, put only: 2xN
.ifc \type, put
        lsr     \d_strd, \d_strd, #1    // hword index for `st1h`
        whilelt p1.h, wzr, \w           // masking for writes
.endif
        ext     v4.16b, v4.16b, v4.16b, #4      // [\xmy + 2 * 2]
        add     \src, \src, #4

        cmp     \h,  #4
        b.le    44f

        sub     \src, \src, \s_strd     // src - 2 * s_strd - 3 * 2
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v16.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v17.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v18.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v19.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v20.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1     {v3.16b}, [\src], \s_strd
        smull   v24.4s, v16.4h, v7.h[1]
        smlal   v24.4s, v17.4h, v7.h[2]

        tbl     v2.16b, {v3.16b}, v30.16b
        tbl     v3.16b, {v3.16b}, v31.16b
        movi    v0.2d, #0
        movi    v1.2d, #0
        sdot    z0.d, z2.h, z4.h[0]
        sdot    z1.d, z3.h, z4.h[0]

        mov     v16.16b, v17.16b
        mov     v17.16b, v18.16b
        smlal   v24.4s, v18.4h, v7.h[3]
        smlal   v24.4s, v19.4h, v7.h[4]
        uzp1    v0.4s, v0.4s, v1.4s

        mov     v18.16b, v19.16b
        mov     v19.16b, v20.16b
        subs    \h,  \h, #1
        srshl   v0.4s, v0.4s, v5.4s
        smlal   v24.4s, v20.4h, v7.h[5]
        xtn     v20.4h, v0.4s
        smlal   v24.4s, v20.4h, v7.h[6]

.ifc \type, prep
        rshrn   v0.4h, v24.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
        str     d0, [\dst], #8
.else  // put
        srshl   v0.4s, v24.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        umin    v0.4h, v0.4h, v29.4h
        st1h    {z0.h}, p1, [\dst]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b
        ret     x15

        .align JUMP_ALIGN
44:     // H4V4 - 4x4, put only: 4x2, 2x4, 2x2
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v17.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v18.4h, v0.4s
        bl      L(\type\()_hv_filter4_\isa)
        xtn     v19.4h, v0.4s

        .align LOOP_ALIGN
4:
        ld1     {v3.16b}, [\src], \s_strd
        smull   v24.4s, v17.4h, v7.h[2]
        smlal   v24.4s, v18.4h, v7.h[3]

        tbl     v2.16b, {v3.16b}, v30.16b
        tbl     v3.16b, {v3.16b}, v31.16b
        movi    v0.2d, #0
        movi    v1.2d, #0
        sdot    z0.d, z2.h, z4.h[0]
        sdot    z1.d, z3.h, z4.h[0]
        uzp1    v0.4s, v0.4s, v1.4s

        mov     v17.16b, v18.16b
        mov     v18.16b, v19.16b
        subs    \h,  \h, #1
        srshl   v0.4s, v0.4s, v5.4s
        smlal   v24.4s, v19.4h, v7.h[4]
        xtn     v19.4h, v0.4s
        smlal   v24.4s, v19.4h, v7.h[5]

.ifc \type, prep
        rshrn   v0.4h, v24.4s, #6
        sub     z0.h, z0.h, #PREP_BIAS
        str     d0, [\dst], #8
.else  // put
        srshl   v0.4s, v24.4s, v6.4s
        sqxtun  v0.4h, v0.4s
        umin    v0.4h, v0.4h, v29.4h
        st1h    {z0.h}, p1, [\dst]
        add     \dst, \dst, \d_strd, lsl #1
.endif
        b.gt    4b
        ret     x15

        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        movrel  x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw   x12, [x11, x8, lsl #2]
.ifc \bdmax, w8
        ldr     \bdmax, [sp]
.endif
.ifc \type, prep
        clz     \bdmax, \bdmax
        sub     \bdmax, \bdmax, #24
        dup     v5.4s, \bdmax
.else  // put
        mov     w9,  #34                // rounding for 10-bit case
        mov     w10, #40                // rounding for 12-bit case
        cmp     \bdmax, #0xFFF
        csel    w9,  w9, w10, ne        // select rounding based on \bdmax
        dup     v5.8h, \bdmax
        dup     v6.2d, x9
.endif
        add     x11, x11, x12
        ld1sb   {z4.h}, p0/z, [\xmx]
        br      x11

        .align JUMP_ALIGN
20:     // H - 4xN, put only: 2xN
40:
        AARCH64_VALID_JUMP_TARGET
        add     \src, \src, #4                  // src - 1 * 2
        ext     v4.16b, v4.16b, v4.16b, #4      // [\xmy + 2 * 2]
.ifc \type, put
        lsr     \d_strd, \d_strd, #1    // hword index for `st1h`
        whilelt p1.h, wzr, \w           // masking for writes
.endif
        .align LOOP_ALIGN
4:
        ldr     q17, [\src]
        ldr     q19, [\src, \s_strd]
        add     \src, \src, \s_strd, lsl #1

.ifc \type, prep
        movi    v0.2d, #0
        movi    v1.2d, #0
        movi    v2.2d, #0
        movi    v3.2d, #0
.else
        mov     v0.16b, v6.16b
        mov     v1.16b, v6.16b
        mov     v2.16b, v6.16b
        mov     v3.16b, v6.16b
.endif
        tbl     v16.16b, {v17.16b}, v30.16b
        tbl     v17.16b, {v17.16b}, v31.16b
        sdot    z0.d, z16.h, z4.h[0]
        sdot    z1.d, z17.h, z4.h[0]
        subs    \h,  \h, #2
        tbl     v18.16b, {v19.16b}, v30.16b
        tbl     v19.16b, {v19.16b}, v31.16b
        sdot    z2.d, z18.h, z4.h[0]
        sdot    z3.d, z19.h, z4.h[0]

        uzp1    v0.4s, v0.4s, v1.4s
        uzp1    v1.4s, v2.4s, v3.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        uzp1    v0.8h, v0.8h, v1.8h
        sub     z0.h, z0.h, #PREP_BIAS
        str     q0, [\dst], #16
.else  // put
        sqshrun v0.4h, v0.4s, #6
        sqshrun v1.4h, v1.4s, #6
        umin    v0.4h, v0.4h, v5.4h
        umin    v1.4h, v1.4h, v5.4h
        st1h    {z0.h}, p1, [\dst]
        st1h    {z1.h}, p1, [\dst, \d_strd, lsl #1]
        add     \dst, \dst, \d_strd, lsl #2
.endif
        b.gt    4b
        ret

        .align JUMP_ALIGN
80:     // H - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:
        ldp     q17, q21, [\src]
        ldur    q19, [\src, #8]

.ifc \type, prep
        movi    v0.2d, #0
        movi    v2.2d, #0
.else
        mov     v0.16b, v6.16b
        mov     v2.16b, v6.16b
.endif
        tbl     v16.16b, {v17.16b}, v30.16b
        tbl     v17.16b, {v17.16b}, v31.16b
        add     \src, \src, \s_strd
        sdot    z0.d, z16.h, z4.h[0]
        sdot    z2.d, z17.h, z4.h[0]

        tbl     v18.16b, {v19.16b}, v30.16b
        tbl     v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi    v16.2d, #0
        movi    v17.2d, #0
.else
        mov     v16.16b, v6.16b
        mov     v17.16b, v6.16b
.endif
        ldp     q23, q27, [\src]
        ldur    q25, [\src, #8]

        sdot    z16.d, z18.h, z4.h[0]
        sdot    z17.d, z19.h, z4.h[0]

        tbl     v22.16b, {v23.16b}, v30.16b
        tbl     v23.16b, {v23.16b}, v31.16b
.ifc \type, prep
        movi    v1.2d, #0
        movi    v3.2d, #0
.else
        mov     v1.16b, v6.16b
        mov     v3.16b, v6.16b
.endif
        add     \src, \src, \s_strd
        sdot    z1.d, z22.h, z4.h[0]
        sdot    z3.d, z23.h, z4.h[0]

        tbl     v24.16b, {v25.16b}, v30.16b
        tbl     v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi    v22.2d, #0
        movi    v23.2d, #0
.else
        mov     v22.16b, v6.16b
        mov     v23.16b, v6.16b
.endif
        sdot    z22.d, z24.h, z4.h[0]
        sdot    z23.d, z25.h, z4.h[0]

        tbl     v20.16b, {v21.16b}, v30.16b
        tbl     v21.16b, {v21.16b}, v31.16b
        sdot    z0.d, z18.h, z4.h[1]
        sdot    z2.d, z19.h, z4.h[1]
        tbl     v26.16b, {v27.16b}, v30.16b
        tbl     v27.16b, {v27.16b}, v31.16b
        sdot    z16.d, z20.h, z4.h[1]
        sdot    z17.d, z21.h, z4.h[1]

        sdot    z1.d, z24.h, z4.h[1]
        sdot    z3.d, z25.h, z4.h[1]

        sdot    z22.d, z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]

        subs    \h,  \h, #2
        uzp1    v0.4s, v0.4s,  v2.4s
        uzp1    v2.4s, v16.4s, v17.4s
        uzp1    v1.4s, v1.4s,  v3.4s
        uzp1    v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v2.4s, v2.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        srshl   v3.4s, v3.4s, v5.4s
        uzp1    v0.8h, v0.8h, v2.8h
        uzp1    v1.8h, v1.8h, v3.8h
        sub     z0.h, z0.h, #PREP_BIAS
        sub     z1.h, z1.h, #PREP_BIAS
        stp     q0, q1, [\dst], #32
.else  // put
        sqshrun  v0.4h, v0.4s, #6
        sqshrun2 v0.8h, v2.4s, #6
        sqshrun  v1.4h, v1.4s, #6
        sqshrun2 v1.8h, v3.4s, #6
        umin    v0.8h, v0.8h, v5.8h
        umin    v1.8h, v1.8h, v5.8h
        st1     {v0.16b}, [\dst], \d_strd
        st1     {v1.16b}, [\dst], \d_strd
.endif
        b.gt    8b
        ret

        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:
        ldp     q17, q21, [\src]
        ldur    q19, [\src, #8]

.ifc \type, prep
        movi    v0.2d, #0
        movi    v2.2d, #0
.else
        mov     v0.16b, v6.16b
        mov     v2.16b, v6.16b
.endif
        tbl     v16.16b, {v17.16b}, v30.16b
        tbl     v17.16b, {v17.16b}, v31.16b
        sdot    z0.d, z16.h, z4.h[0]
        sdot    z2.d, z17.h, z4.h[0]

        tbl     v18.16b, {v19.16b}, v30.16b
        tbl     v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi    v16.2d, #0
        movi    v17.2d, #0
.else
        mov     v16.16b, v6.16b
        mov     v17.16b, v6.16b
.endif
        ldur    q25, [\src, #24]
        ldr     q27, [\src, #32]

        sdot    z16.d, z18.h, z4.h[0]
        sdot    z17.d, z19.h, z4.h[0]

        tbl     v22.16b, {v21.16b}, v30.16b
        tbl     v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi    v1.2d, #0
        movi    v3.2d, #0
.else
        mov     v1.16b, v6.16b
        mov     v3.16b, v6.16b
.endif
        add     \src, \src, \s_strd
        sdot    z1.d, z22.h, z4.h[0]
        sdot    z3.d, z23.h, z4.h[0]

        tbl     v24.16b, {v25.16b}, v30.16b
        tbl     v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi    v22.2d, #0
        movi    v23.2d, #0
.else
        mov     v22.16b, v6.16b
        mov     v23.16b, v6.16b
.endif
        sdot    z22.d, z24.h, z4.h[0]
        sdot    z23.d, z25.h, z4.h[0]

        tbl     v20.16b, {v21.16b}, v30.16b
        tbl     v21.16b, {v21.16b}, v31.16b
        sdot    z0.d, z18.h, z4.h[1]
        sdot    z2.d, z19.h, z4.h[1]
        tbl     v26.16b, {v27.16b}, v30.16b
        tbl     v27.16b, {v27.16b}, v31.16b
        sdot    z16.d, z20.h, z4.h[1]
        sdot    z17.d, z21.h, z4.h[1]

        sdot    z1.d, z24.h, z4.h[1]
        sdot    z3.d, z25.h, z4.h[1]

        sdot    z22.d, z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]

        subs    \h,  \h, #1
        uzp1    v0.4s, v0.4s,  v2.4s
        uzp1    v2.4s, v16.4s, v17.4s
        uzp1    v1.4s, v1.4s,  v3.4s
        uzp1    v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v2.4s, v2.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        srshl   v3.4s, v3.4s, v5.4s
        uzp1    v0.8h, v0.8h, v2.8h
        uzp1    v1.8h, v1.8h, v3.8h
        sub     z0.h, z0.h, #PREP_BIAS
        sub     z1.h, z1.h, #PREP_BIAS
        stp     q0, q1, [\dst], #32
.else  // put
        sqshrun  v0.4h, v0.4s, #6
        sqshrun2 v0.8h, v2.4s, #6
        sqshrun  v1.4h, v1.4s, #6
        sqshrun2 v1.8h, v3.4s, #6
        umin    v0.8h, v0.8h, v5.8h
        umin    v1.8h, v1.8h, v5.8h
        st1     {v0.16b, v1.16b}, [\dst], \d_strd
.endif
        b.gt    16b
        ret

        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        sub     \d_strd, \d_strd, \w, uxtw #1
.endif
        sub     \s_strd, \s_strd, \w, uxtw #1
        mov     w8,  \w

        .align LOOP_ALIGN
32:
        ldp     q17, q21, [\src]
        ldur    q19, [\src, #8]

.ifc \type, prep
        movi    v0.2d, #0
        movi    v2.2d, #0
.else
        mov     v0.16b, v6.16b
        mov     v2.16b, v6.16b
.endif
        tbl     v16.16b, {v17.16b}, v30.16b
        tbl     v17.16b, {v17.16b}, v31.16b
        sdot    z0.d, z16.h, z4.h[0]
        sdot    z2.d, z17.h, z4.h[0]

        tbl     v18.16b, {v19.16b}, v30.16b
        tbl     v19.16b, {v19.16b}, v31.16b
.ifc \type, prep
        movi    v16.2d, #0
        movi    v17.2d, #0
.else
        mov     v16.16b, v6.16b
        mov     v17.16b, v6.16b
.endif
        ldur    q25, [\src, #24]

        sdot    z16.d, z18.h, z4.h[0]
        sdot    z17.d, z19.h, z4.h[0]

        ldr     q27, [\src, #32]!

        tbl     v22.16b, {v21.16b}, v30.16b
        tbl     v23.16b, {v21.16b}, v31.16b
.ifc \type, prep
        movi    v1.2d, #0
        movi    v3.2d, #0
.else
        mov     v1.16b, v6.16b
        mov     v3.16b, v6.16b
.endif
        sdot    z1.d, z22.h, z4.h[0]
        sdot    z3.d, z23.h, z4.h[0]

        tbl     v24.16b, {v25.16b}, v30.16b
        tbl     v25.16b, {v25.16b}, v31.16b
.ifc \type, prep
        movi    v22.2d, #0
        movi    v23.2d, #0
.else
        mov     v22.16b, v6.16b
        mov     v23.16b, v6.16b
.endif
        sdot    z22.d, z24.h, z4.h[0]
        sdot    z23.d, z25.h, z4.h[0]

        tbl     v20.16b, {v21.16b}, v30.16b
        tbl     v21.16b, {v21.16b}, v31.16b
        sdot    z0.d, z18.h, z4.h[1]
        sdot    z2.d, z19.h, z4.h[1]
        tbl     v26.16b, {v27.16b}, v30.16b
        tbl     v27.16b, {v27.16b}, v31.16b
        sdot    z16.d, z20.h, z4.h[1]
        sdot    z17.d, z21.h, z4.h[1]

        sdot    z1.d, z24.h, z4.h[1]
        sdot    z3.d, z25.h, z4.h[1]

        sdot    z22.d, z26.h, z4.h[1]
        sdot    z23.d, z27.h, z4.h[1]

        subs    w8,  w8, #16
        uzp1    v0.4s, v0.4s,  v2.4s
        uzp1    v2.4s, v16.4s, v17.4s
        uzp1    v1.4s, v1.4s,  v3.4s
        uzp1    v3.4s, v22.4s, v23.4s
.ifc \type, prep
        srshl   v0.4s, v0.4s, v5.4s
        srshl   v2.4s, v2.4s, v5.4s
        srshl   v1.4s, v1.4s, v5.4s
        srshl   v3.4s, v3.4s, v5.4s
        uzp1    v0.8h, v0.8h, v2.8h
        uzp1    v1.8h, v1.8h, v3.8h
        sub     z0.h, z0.h, #PREP_BIAS
        sub     z1.h, z1.h, #PREP_BIAS
.else  // put
        sqshrun  v0.4h, v0.4s, #6
        sqshrun2 v0.8h, v2.4s, #6
        sqshrun  v1.4h, v1.4s, #6
        sqshrun2 v1.8h, v3.4s, #6
        umin    v0.8h, v0.8h, v5.8h
        umin    v1.8h, v1.8h, v5.8h
.endif
        stp     q0, q1, [\dst], #32
        b.gt    32b

        add     \src, \src, \s_strd
.ifc \type, put
        add     \dst, \dst, \d_strd
.endif
        subs    \h,  \h, #1
        mov     w8,  \w
        b.gt    32b
        ret
endfunc

jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word  640b - \type\()_8tap_h_\isa\()_tbl
        .word  320b - \type\()_8tap_h_\isa\()_tbl
        .word  160b - \type\()_8tap_h_\isa\()_tbl
        .word   80b - \type\()_8tap_h_\isa\()_tbl
        .word   40b - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word   20b - \type\()_8tap_h_\isa\()_tbl
.endif
endjumptable
.endm


function prep_sve
        movrel  x9,  prep_tbl
        mov     w6,  #19
        ldrsw   x8,  [x9, x8, lsl #2]
        sub     w6,  w6, w7, lsr #8     // 19 - bdmax / 256
        add     x9,  x9, x8
        movi    v30.8h, #PREP_BIAS_NEG
        dup     v29.8h, w6              // 10b: 1 << 4, 12b: 1 << 2
        br      x9

        .align JUMP_ALIGN
40:     // prep - 4xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
4:
        ldr     d0, [x1]
        ldr     d1, [x1, x2]
        add     x1, x1, x2, lsl #1
        subs    w4, w4, #2
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        stp     d0, d1, [x0], #16
        b.gt    4b
        ret

        .align JUMP_ALIGN
80:     // prep - 8xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
8:
        ld1     {v0.8h}, [x1], x2
        ld1     {v1.8h}, [x1], x2
        subs    w4, w4, #2
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        stp     q0, q1, [x0], #32
        b.gt    8b
        ret

        .align JUMP_ALIGN
160:    // prep - 16xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
16:
        ld1     {v0.8h, v1.8h}, [x1], x2
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        subs    w4, w4, #2
        ld1     {v2.8h, v3.8h}, [x1], x2
        mad     z2.h, p0/m, z29.h, z30.h
        mad     z3.h, p0/m, z29.h, z30.h
        stp     q0, q1, [x0]
        stp     q2, q3, [x0, #32]
        add     x0, x0, #64
        b.gt    16b
        ret

        .align JUMP_ALIGN
320:    // prep - 32xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
32:
        ldp     q0, q1, [x1]
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        ldp     q2, q3, [x1, #32]
        subs    w4, w4, #1
        mad     z2.h, p0/m, z29.h, z30.h
        mad     z3.h, p0/m, z29.h, z30.h
        add     x1, x1, x2
        stp     q0, q1, [x0]
        stp     q2, q3, [x0, #32]
        add     x0, x0, #64
        b.gt    32b
        ret

        .align JUMP_ALIGN
640:    // prep - 64xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
64:
        ldp     q0, q1, [x1]
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        ldp     q2, q3, [x1, #32]
        mad     z2.h, p0/m, z29.h, z30.h
        mad     z3.h, p0/m, z29.h, z30.h
        ldp     q4, q5, [x1, #64]
        mad     z4.h, p0/m, z29.h, z30.h
        mad     z5.h, p0/m, z29.h, z30.h
        ldp     q6, q7, [x1, #96]
        add     x1, x1, x2
        subs    w4, w4, #1
        mad     z6.h, p0/m, z29.h, z30.h
        mad     z7.h, p0/m, z29.h, z30.h
        stp     q0, q1, [x0]
        stp     q2, q3, [x0, #32]
        stp     q4, q5, [x0, #64]
        stp     q6, q7, [x0, #96]
        add     x0, x0, #128
        b.gt    64b
        ret

        .align JUMP_ALIGN
1280:   // prep - 128xN
        AARCH64_VALID_JUMP_TARGET

        .align LOOP_ALIGN
128:
        ldp     q0, q1, [x1]
        mad     z0.h, p0/m, z29.h, z30.h
        mad     z1.h, p0/m, z29.h, z30.h
        ldp     q2, q3, [x1, #32]
        mad     z2.h, p0/m, z29.h, z30.h
        mad     z3.h, p0/m, z29.h, z30.h
        ldp     q4, q5, [x1, #64]
        mad     z4.h, p0/m, z29.h, z30.h
        mad     z5.h, p0/m, z29.h, z30.h
        ldp     q6, q7, [x1, #96]
        mad     z6.h, p0/m, z29.h, z30.h
        mad     z7.h, p0/m, z29.h, z30.h
        ldp     q16, q17, [x1, #128]
        mad     z16.h, p0/m, z29.h, z30.h
        mad     z17.h, p0/m, z29.h, z30.h
        ldp     q18, q19, [x1, #160]
        mad     z18.h, p0/m, z29.h, z30.h
        mad     z19.h, p0/m, z29.h, z30.h
        ldp     q20, q21, [x1, #192]
        mad     z20.h, p0/m, z29.h, z30.h
        mad     z21.h, p0/m, z29.h, z30.h
        ldp     q22, q23, [x1, #224]
        add     x1, x1, x2
        mad     z22.h, p0/m, z29.h, z30.h
        mad     z23.h, p0/m, z29.h, z30.h
        subs    w4, w4, #1
        stp     q0, q1, [x0]
        stp     q2, q3, [x0, #32]
        stp     q4, q5, [x0, #64]
        stp     q6, q7, [x0, #96]
        stp     q16, q17, [x0, #128]
        stp     q18, q19, [x0, #160]
        stp     q20, q21, [x0, #192]
        stp     q22, q23, [x0, #224]
        add     x0, x0, #256
        b.gt    128b
        ret
endfunc

jumptable prep_tbl
        .word 1280b - prep_tbl
        .word  640b - prep_tbl
        .word  320b - prep_tbl
        .word  160b - prep_tbl
        .word   80b - prep_tbl
        .word   40b - prep_tbl
endjumptable


// dst(x0), d_strd(x9), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6), bdmax(w7)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w9), ws_strd(w2)
filter_8tap_fn prep, sve2, x0, x9, x1, x2, w3, w4, w5, w6, w7, x5, x6, x5, x6, w9, w2

// dst(x0), d_strd(x1), src(x2), s_strd(x3), w(w4), h(w5), mx(w6), my(w7), bdmax(w8)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1), ws_strd(w3)
filter_8tap_fn put, sve2, x0, x1, x2, x3, w4, w5, w6, w7, w8, x6, x7, x6, x7, w1, w3

DISABLE_SVE2
DISABLE_SVE
#endif // HAVE_SVE2