1/* 2 * Copyright © 2023, VideoLAN and dav1d authors 3 * Copyright © 2023, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "src/loongarch/loongson_asm.S" 29#include "src/loongarch/loongson_util.S" 30 31.macro PUSH_REG 32 addi.d sp, sp, -64 33 fst.d f24, sp, 0 34 fst.d f25, sp, 8 35 fst.d f26, sp, 16 36 fst.d f27, sp, 24 37 fst.d f28, sp, 32 38 fst.d f29, sp, 40 39 fst.d f30, sp, 48 40 fst.d f31, sp, 56 41.endm 42 43.macro POP_REG 44 fld.d f24, sp, 0 45 fld.d f25, sp, 8 46 fld.d f26, sp, 16 47 fld.d f27, sp, 24 48 fld.d f28, sp, 32 49 fld.d f29, sp, 40 50 fld.d f30, sp, 48 51 fld.d f31, sp, 56 52 addi.d sp, sp, 64 53.endm 54 55.macro malloc_space number 56 li.w t0, \number 57 sub.d sp, sp, t0 58 addi.d sp, sp, -64 59 PUSH_REG 60.endm 61 62.macro free_space number 63 POP_REG 64 li.w t0, \number 65 add.d sp, sp, t0 66 addi.d sp, sp, 64 67.endm 68 69.macro iwht4 70 vadd.h vr0, vr0, vr1 71 vsub.h vr4, vr2, vr3 72 vsub.h vr5, vr0, vr4 73 vsrai.h vr5, vr5, 1 74 vsub.h vr2, vr5, vr1 75 vsub.h vr1, vr5, vr3 76 vadd.h vr3, vr4, vr2 77 vsub.h vr0, vr0, vr1 78.endm 79 80.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5 81 vilvl.w \in0, \in1, \in0 // 0 1 2 3 4 5 6 7 x ... 82 vilvl.w \in2, \in3, \in2 // 8 9 10 11 12 13 14 15 x ... 83 vsllwil.hu.bu \in0, \in0, 0 84 vsllwil.hu.bu \in2, \in2, 0 85 vadd.h \in0, \in4, \in0 86 vadd.h \in2, \in5, \in2 87 vssrani.bu.h \in2, \in0, 0 88 vstelm.w \in2, a0, 0, 0 89 vstelmx.w \in2, a0, a1, 1 90 vstelmx.w \in2, a0, a1, 2 91 vstelmx.w \in2, a0, a1, 3 92.endm 93 94.macro VLD_DST_ADD_W4 in0, in1 95 vld vr0, a0, 0 96 vldx vr1, a0, a1 97 vld vr2, t2, 0 98 vldx vr3, t2, a1 99 100 DST_ADD_W4 vr0, vr1, vr2, vr3, \in0, \in1 101.endm 102 103function inv_txfm_add_wht_wht_4x4_8bpc_lsx 104 vld vr0, a2, 0 105 vld vr2, a2, 16 106 107 vxor.v vr20, vr20, vr20 108 vsrai.h vr0, vr0, 2 109 vsrai.h vr2, vr2, 2 110 vst vr20, a2, 0 111 vpickod.d vr1, vr0, vr0 112 vpickod.d vr3, vr2, vr2 113 vst vr20, a2, 16 114 115 iwht4 116 117 LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 118 119 iwht4 120 121 vilvl.d vr4, vr1, vr0 122 vilvl.d vr5, vr3, vr2 123 alsl.d t2, a1, a0, 1 124 VLD_DST_ADD_W4 vr4, vr5 125endfunc 126 127const idct_coeffs, align=4 128 .word 2896, 2896*8, 1567, 3784 129 .word 799, 4017, 3406, 2276 130 .word 401, 4076, 3166, 2598 131 .word 1931, 3612, 3920, 1189 132 .word 201, 4091, 3035, 2751 133 .word 1751, 3703, 3857, 1380 134 .word 995, 3973, 3513, 2106 135 .word 2440, 3290, 4052, 601 136endconst 137 138.macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift 139 vsrari.h \out0, \in0, \shift 140 vsrari.h \out1, \in1, \shift 141 vsrari.h \out2, \in2, \shift 142 vsrari.h \out3, \in3, \shift 143.endm 144 145.macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 146 out1, out2, out3, out4, out5, out6, out7, shift 147 vsrari.h \out0, \in0, \shift 148 vsrari.h \out1, \in1, \shift 149 vsrari.h \out2, \in2, \shift 150 vsrari.h \out3, \in3, \shift 151 vsrari.h \out4, \in4, \shift 152 vsrari.h \out5, \in5, \shift 153 vsrari.h \out6, \in6, \shift 154 vsrari.h \out7, \in7, \shift 155.endm 156 157.macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz 158 vmulwev.w.h \out0, \in0, \in2 159 vmulwod.w.h \out1, \in0, \in2 160 vmaddwev.w.h \out0, \in1, \in3 161 vmaddwod.w.h \out1, \in1, \in3 162.ifc \sz, .4h 163 vilvl.w \out0, \out1, \out0 164.else 165 vilvl.w vr22, \out1, \out0 166 vilvh.w \out1, \out1, \out0 167 vor.v \out0, vr22, vr22 168.endif 169.endm 170 171const idct_coeffs_h, align=4 172 .short 2896, 2896*8, 1567, 3784 173 .short 799, 4017, 3406, 2276 174 .short 401, 4076, 3166, 2598 175 .short 1931, 3612, 3920, 1189 176 .short 201, 4091, 3035, 2751 177 .short 1751, 3703, 3857, 1380 178 .short 995, 3973, 3513, 2106 179 .short 2440, 3290, 4052, 601 180endconst 181 182const iadst4_coeffs, align=4 183 .word 1321, 3803, 2482, 3344 184endconst 185 186.macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz 187 la.local t0, idct_coeffs_h 188 189 vldrepl.h vr20, t0, 0 // 2896 190 vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz 191 vneg.h vr21, vr20 192 vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz 193 vssrarni.h.w vr18, vr16, 12 // t0 194 vssrarni.h.w vr19, vr17, 12 // t1 195 196 vldrepl.h vr20, t0, 4 // 1567 197 vldrepl.h vr21, t0, 6 // 3784 198 vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz 199 vneg.h vr21, vr21 200 vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz 201 vssrarni.h.w vr16, \in0, 12 // t3 202 vssrarni.h.w vr17, \in2, 12 // t2 203 204 vsadd.h \out0, vr18, vr16 205 vsadd.h \out1, vr19, vr17 206 vssub.h \out2, vr19, vr17 207 vssub.h \out3, vr18, vr16 208.endm 209 210functionl inv_dct_4h_x4_lsx 211 inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h 212endfuncl 213 214functionl inv_dct_8h_x4_lsx 215 inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h 216endfuncl 217 218.macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3 219 vsub.w vr16, \in0, \in2 // in0-in2 220 vmul.w vr17, \in0, vr20 // in0*1321 221 vmul.w vr19, \in0, vr22 // in0*2482 222 vmul.w vr18, \in1, vr23 // in1*3344 223 vmadd.w vr17, \in2, vr21 // in0*1321+in2*3803 224 vmsub.w vr19, \in2, vr20 // in2*1321 225 vadd.w vr16, vr16, \in3 // in0-in2+in3 226 vmadd.w vr17, \in3, vr22 // in0*1321+in2*3803+in3*2482 227 vmsub.w vr19, \in3, vr21 // in0*2482-in2*1321-in3*3803 228 vadd.w vr15, vr17, vr19 229 vmul.w \out2, vr16, vr23 // out[2] 8 9 10 11 230 vadd.w \out0, vr17, vr18 // out[0] 0 1 2 3 231 vadd.w \out1, vr19, vr18 // out[1] 4 5 6 7 232 vsub.w \out3, vr15, vr18 // out[3] 12 13 14 15 233.endm 234 235.macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3 236 la.local t0, iadst4_coeffs 237 238 vldrepl.w vr20, t0, 0 // 1321 239 vldrepl.w vr21, t0, 4 // 3803 240 vldrepl.w vr22, t0, 8 // 2482 241 vldrepl.w vr23, t0, 12 // 3344 242 243 vsllwil.w.h vr0, \in0, 0 244 vsllwil.w.h vr1, \in1, 0 245 vsllwil.w.h vr2, \in2, 0 246 vsllwil.w.h vr3, \in3, 0 247 inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3 248 vssrarni.h.w \out0, \out0, 12 249 vssrarni.h.w \out1, \out1, 12 250 vssrarni.h.w \out2, \out2, 12 251 vssrarni.h.w \out3, \out3, 12 252.endm 253 254functionl inv_adst_4h_x4_lsx 255 inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3 256endfuncl 257 258functionl inv_flipadst_4h_x4_lsx 259 inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0 260endfuncl 261 262.macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3 263 la.local t0, iadst4_coeffs 264 vldrepl.w vr20, t0, 0 // 1321 265 vldrepl.w vr21, t0, 4 // 3803 266 vldrepl.w vr22, t0, 8 // 2482 267 vldrepl.w vr23, t0, 12 // 3344 268 269 vsllwil.w.h vr10, \in0, 0 // in0 270 vsllwil.w.h vr11, \in1, 0 // in1 271 vsllwil.w.h vr12, \in2, 0 // in2 272 vsllwil.w.h vr13, \in3, 0 // in3 273 inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13 274 275 vexth.w.h \in0, \in0 // in0 276 vexth.w.h \in1, \in1 // in1 277 vexth.w.h \in2, \in2 // in2 278 vexth.w.h \in3, \in3 // in3 279 inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3 280 281 vssrarni.h.w \out0, vr10, 12 282 vssrarni.h.w \out1, vr11, 12 283 vssrarni.h.w \out2, vr12, 12 284 vssrarni.h.w \out3, vr13, 12 285.endm 286 287functionl inv_adst_8h_x4_lsx 288 inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3 289endfuncl 290 291functionl inv_flipadst_8h_x4_lsx 292 inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0 293endfuncl 294 295functionl inv_identity_4h_x4_lsx 296 li.w t0, 1697 297 vreplgr2vr.h vr20, t0 298 299 vilvl.d vr0, vr1, vr0 300 vilvl.d vr2, vr3, vr2 301 vmulwev.w.h vr16, vr0, vr20 302 vmulwod.w.h vr17, vr0, vr20 303 vmulwev.w.h vr18, vr2, vr20 304 vmulwod.w.h vr19, vr2, vr20 305 vilvl.w vr1, vr17, vr16 306 vilvh.w vr3, vr17, vr16 307 vilvl.w vr22, vr19, vr18 308 vilvh.w vr23, vr19, vr18 309 vssrarni.h.w vr3, vr1, 12 310 vssrarni.h.w vr23, vr22, 12 311 vsadd.h vr0, vr3, vr0 // t0 312 vsadd.h vr2, vr23, vr2 // t2 313 vilvh.d vr1, vr0, vr0 // t1 314 vilvh.d vr3, vr2, vr2 // t3 315endfuncl 316 317.macro inv_identity4_lsx1 in0, in1, in2, out0, out1 318 vsllwil.w.h vr16, \in0, 0 319 vexth.w.h vr17, \in1 320 vmul.w vr18, vr16, \in2 321 vmul.w vr19, vr17, \in2 322 vsrari.w vr18, vr18, 12 323 vsrari.w vr19, vr19, 12 324 vadd.w \out0, vr18, vr16 325 vadd.w \out1, vr19, vr17 326 vssrarni.h.w \out1, \out0, 1 327.endm 328 329functionl inv_identity_8h_x4_lsx 330 li.w t0, 1697 331 vreplgr2vr.h vr20, t0 332 vmulwev.w.h vr16, vr0, vr20 333 vmulwod.w.h vr17, vr0, vr20 334 vmulwev.w.h vr18, vr1, vr20 335 vmulwod.w.h vr19, vr1, vr20 336 vilvl.w vr21, vr17, vr16 337 vilvh.w vr22, vr17, vr16 338 vilvl.w vr23, vr19, vr18 339 vilvh.w vr16, vr19, vr18 340 vssrarni.h.w vr22, vr21, 12 341 vssrarni.h.w vr16, vr23, 12 342 vsadd.h vr0, vr22, vr0 // t0 343 vsadd.h vr1, vr16, vr1 // t1 344 vmulwev.w.h vr16, vr2, vr20 345 vmulwod.w.h vr17, vr2, vr20 346 vmulwev.w.h vr18, vr3, vr20 347 vmulwod.w.h vr19, vr3, vr20 348 vilvl.w vr21, vr17, vr16 349 vilvh.w vr22, vr17, vr16 350 vilvl.w vr23, vr19, vr18 351 vilvh.w vr16, vr19, vr18 352 vssrarni.h.w vr22, vr21, 12 353 vssrarni.h.w vr16, vr23, 12 354 vsadd.h vr2, vr22, vr2 // t2 355 vsadd.h vr3, vr16, vr3 // t3 356endfuncl 357 358functionl inv_identity_8h_x4_lsx1 359 li.w t0, 1697 360 vreplgr2vr.w vr20, t0 361.irp i, vr0, vr1, vr2, vr3 362 inv_identity4_lsx1 \i, \i vr20, vr21, \i 363.endr 364endfuncl 365 366functionl inv_txfm_add_4x4_lsx 367 vxor.v vr23, vr23, vr23 368 vld vr0, a2, 0 369 vld vr2, a2, 16 370 vilvh.d vr1, vr0, vr0 371 vilvh.d vr3, vr2, vr2 372 vst vr23, a2, 0 373 vst vr23, a2, 16 374 375 move t6, ra 376 jirl ra, t7, 0 377 move ra, t6 378 379 LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5 380 381 move t6, ra 382 jirl ra, t8, 0 383 move ra, t6 384 385 vilvl.d vr4, vr1, vr0 386 vilvl.d vr5, vr3, vr2 387 vsrari.h vr4, vr4, 4 388 vsrari.h vr5, vr5, 4 389 alsl.d t2, a1, a0, 1 390 VLD_DST_ADD_W4 vr4, vr5 391endfuncl 392 393.macro idct_dc w, h, shift 394 ld.h t2, a2, 0 // dc 395 vldi vr0, 0x8b5 // 181 396 vreplgr2vr.w vr1, t2 397 vldi vr20, 0x880 // 128 398 vmul.w vr2, vr0, vr1 // dc * 181 399 st.h zero, a2, 0 400 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 401 vld vr10, a0, 0 // 0 1 2 3 4 5 6 7 402 403.if (2*\w == \h) || (2*\h == \w) 404 vmul.w vr2, vr0, vr2 405 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 406.endif 407.if \shift>0 408 vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift 409.endif 410 vldx vr11, a0, a1 // 8 9 10 11 12 13 14 15 411 alsl.d t2, a1, a0, 1 412 vmadd.w vr20, vr2, vr0 413 vld vr12, t2, 0 // 16 17 18 19 20 21 22 23 414 vssrarni.h.w vr20, vr20, 12 415 vldx vr13, t2, a1 // 24 25 26 27 28 29 30 31 416.endm 417 418.macro fun4x4 txfm1, txfm2 419function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx 420.ifc \txfm1\()_\txfm2, dct_dct 421 bnez a3, 1f 422 423 idct_dc 4, 4, 0 424 425 DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20 426 b .\txfm1\()_\txfm2\()_4X4_END 4271: 428.endif 429 430 la.local t7, inv_\txfm1\()_4h_x4_lsx 431 la.local t8, inv_\txfm2\()_4h_x4_lsx 432 433 b inv_txfm_add_4x4_lsx 434.\txfm1\()_\txfm2\()_4X4_END: 435endfunc 436.endm 437 438fun4x4 dct, dct 439fun4x4 identity, identity 440fun4x4 adst, dct 441fun4x4 dct, adst 442fun4x4 adst, adst 443fun4x4 dct, flipadst 444fun4x4 flipadst, adst 445fun4x4 adst, flipadst 446fun4x4 flipadst, dct 447fun4x4 flipadst, flipadst 448fun4x4 dct, identity 449fun4x4 identity, dct 450fun4x4 flipadst, identity 451fun4x4 identity, flipadst 452fun4x4 identity, adst 453fun4x4 adst, identity 454 455const iadst8_coeffs_h, align=4 456 .short 4076, 401, 3612, 1931 457 .short 2598, 3166, 1189, 3920 458 .short 2896, 0, 1567, 3784, 0, 0, 0, 0 459endconst 460 461.macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz 462 la.local t0, iadst8_coeffs_h 463 464 vldrepl.h vr20, t0, 0 // 4076 465 vldrepl.h vr21, t0, 2 // 401 466 vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz 467 vneg.h vr20, vr20 468 vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz 469 vssrarni.h.w vr17, vr16, 12 // t0a 470 vssrarni.h.w vr19, vr18, 12 // t1a 471 472 vldrepl.h vr20, t0, 4 // 3612 473 vldrepl.h vr21, t0, 6 // 1931 474 vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz 475 vneg.h vr20, vr20 476 vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz 477 vssrarni.h.w vr16, vr0, 12 // t2a 478 vssrarni.h.w vr18, vr7, 12 // t3a 479 480 vldrepl.h vr20, t0, 8 // 2598 481 vldrepl.h vr21, t0, 10 // 3166 482 vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz 483 vneg.h vr20, vr20 484 vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz 485 vssrarni.h.w vr0, vr2, 12 // t4a 486 vssrarni.h.w vr7, vr5, 12 // t5a 487 488 vldrepl.h vr20, t0, 12 // 1189 489 vldrepl.h vr21, t0, 14 // 3920 490 vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz 491 vneg.h vr20, vr20 492 vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz 493 vssrarni.h.w vr2, vr3, 12 // t6a 494 vssrarni.h.w vr5, vr4, 12 // t7a 495 496 vsadd.h vr3, vr17, vr0 // t0 497 vssub.h vr4, vr17, vr0 // t4 498 vsadd.h vr1, vr19, vr7 // t1 499 vssub.h vr6, vr19, vr7 // t5 500 vsadd.h vr17, vr16, vr2 // t2 501 vssub.h vr19, vr16, vr2 // t6 502 vsadd.h vr0, vr18, vr5 // t3 503 vssub.h vr7, vr18, vr5 // t7 504 505 la.local t0, idct_coeffs_h 506 507 vldrepl.h vr20, t0, 4 // 1567 508 vldrepl.h vr21, t0, 6 // 3784 509 vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz 510 vneg.h vr21, vr21 511 vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz 512 vssrarni.h.w vr5, vr16, 12 // t4a 513 vssrarni.h.w vr2, vr18, 12 // t5a 514 515 vneg.h vr21, vr21 516 vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz 517 vneg.h vr20, vr20 518 vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz 519 vssrarni.h.w vr16, vr4, 12 // t7a 520 vssrarni.h.w vr18, vr6, 12 // t6a 521 522 vsadd.h vr4, vr5, vr18 // out1 523 vssub.h vr19, vr5, vr18 // t6 524 vsadd.h vr20, vr1, vr0 // out7 525 vssub.h vr18, vr1, vr0 // t3 526 vsadd.h \out0, vr3, vr17 // out0 527 vssub.h vr5, vr3, vr17 // t2 528 vsadd.h \out6, vr2, vr16 // out6 529 vssub.h vr23, vr2, vr16 // t7 530 531 vsllwil.w.h vr3, vr20, 0 // out7 532 vexth.w.h \out7, vr20 // out7 533 vsllwil.w.h vr21, vr4, 0 // out1 534 vexth.w.h \out1, vr4 // out1 535 vneg.w vr3, vr3 536 vneg.w \out7, \out7 537 vneg.w vr21, vr21 538 vneg.w \out1, \out1 539 vssrarni.h.w \out7, vr3, 0 540 vssrarni.h.w \out1, vr21, 0 541 542 la.local t0, idct_coeffs_h 543 544 vldrepl.h vr20, t0, 0 // 2896 545 vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz 546 vneg.h vr21, vr20 547 vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz 548 vsrari.w vr16, vr16, 12 549 vsrari.w \out3, \out3, 12 550 vneg.w vr16, vr16 551 vneg.w \out3, \out3 552 vssrarni.h.w \out3, vr16, 0 // out3 553 vssrarni.h.w \out4, vr17, 12 // out4 554 555 vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz 556 vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz 557 vssrarni.h.w \out2, vr16, 12 // out2 558 vsrari.w vr17, vr17, 12 559 vsrari.w \out5, \out5, 12 560 vneg.w vr17, vr17 561 vneg.w \out5, \out5 562 vssrarni.h.w \out5, vr17, 0 // out5 563.endm 564 565functionl inv_adst_8h_x8_lsx 566 inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h 567endfuncl 568 569functionl inv_flipadst_8h_x8_lsx 570 inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h 571endfuncl 572 573functionl inv_adst_4h_x8_lsx 574 inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h 575endfuncl 576 577functionl inv_flipadst_4h_x8_lsx 578 inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h 579endfuncl 580 581.macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz 582 inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz 583 584 la.local t0, idct_coeffs_h 585 586 vldrepl.h vr20, t0, 8 // 799 587 vldrepl.h vr21, t0, 10 // 4017 588 vmulev_vmaddod_lsx \in1, \in7, vr21, vr20, vr16, vr17, \sz 589 vneg.h vr21, vr21 590 vmulev_vmaddod_lsx \in1, \in7, vr20, vr21, vr18, vr19, \sz 591 vssrarni.h.w vr17, vr16, 12 // t7a 592 vssrarni.h.w vr19, vr18, 12 // t4a 593 594 vldrepl.h vr20, t0, 12 // 3406 595 vldrepl.h vr21, t0, 14 // 2276 596 vmulev_vmaddod_lsx \in5, \in3, vr21, vr20, \in1, vr16, \sz 597 vneg.h vr21, vr21 598 vmulev_vmaddod_lsx \in5, \in3, vr20, vr21, \in7, vr18, \sz 599 vssrarni.h.w vr16, \in1, 12 // t6a 600 vssrarni.h.w vr18, \in7, 12 // t5a 601 602 vssub.h \in7, vr19, vr18 // t5a 603 vsadd.h vr18, vr19, vr18 // t4 604 vssub.h \in5, vr17, vr16 // t6a 605 vsadd.h vr16, vr17, vr16 // t7 606 607 vldrepl.h vr20, t0, 0 // 2896 608 vmulev_vmaddod_lsx \in5, \in7, vr20, vr20, \in1, vr17, \sz 609 vneg.h vr21, vr20 610 vmulev_vmaddod_lsx \in5, \in7, vr20, vr21, vr23, vr19, \sz 611 vssrarni.h.w vr17, \in1, 12 // t6 612 vssrarni.h.w vr19, vr23, 12 // t5 613 614 vssub.h \in7, \in0, vr16 //c[7] 615 vsadd.h \in0, \in0, vr16 //c[0] 616 vssub.h \in5, \in4, vr19 //c[5] 617 vsadd.h vr23, \in4, vr19 //c[2] 618 vssub.h \in4, \in6, vr18 //c[4] 619 vsadd.h \in3, \in6, vr18 //c[3] 620 vssub.h \in6, \in2, vr17 //c[6] 621 vsadd.h \in1, \in2, vr17 //c[1] 622 vor.v \in2, vr23, vr23 623.endm 624 625functionl inv_dct_8h_x8_lsx 626 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h 627endfuncl 628 629functionl inv_dct_4h_x8_lsx 630 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h 631endfuncl 632 633.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7 634 vsllwil.hu.bu vr0, \in0, 0 635 vsllwil.hu.bu vr1, \in1, 0 636 vsllwil.hu.bu vr2, \in2, 0 637 vsllwil.hu.bu vr3, \in3, 0 638 vadd.h vr0, \in4, vr0 639 vadd.h vr1, \in5, vr1 640 vadd.h vr2, \in6, vr2 641 vadd.h vr3, \in7, vr3 642 vssrani.bu.h vr1, vr0, 0 643 vssrani.bu.h vr3, vr2, 0 644 vstelm.d vr1, a0, 0, 0 645 vstelmx.d vr1, a0, a1, 1 646 vstelmx.d vr3, a0, a1, 0 647 vstelmx.d vr3, a0, a1, 1 648.endm 649 650.macro VLD_DST_ADD_W8 in0, in1, in2, in3 651 vld vr0, a0, 0 652 vldx vr1, a0, a1 653 vld vr2, t2, 0 654 vldx vr3, t2, a1 655 656 DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3 657.endm 658 659functionl inv_identity_8h_x8_lsx 660.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 661 vsadd.h \i, \i, \i 662.endr 663endfuncl 664 665functionl inv_identity_4h_x8_lsx 666.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 667 vsadd.h \i, \i, \i 668.endr 669endfuncl 670 671.macro def_fn_8x8_base variant 672functionl inv_txfm_\variant\()add_8x8_lsx 673 vxor.v vr23, vr23, vr23 674 vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 675.irp i, 0, 16, 32, 48, 64, 80, 96, 112 676 vst vr23, a2, \i 677.endr 678 679.ifc \variant, identity_ 680 // The identity shl #1 and downshift srshr #1 cancel out 681 b .itx_8x8_epilog 682.else 683 684 move t6, ra 685 jirl ra, t7, 0 686 move ra, t6 687 688.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 689 vsrari.h \i, \i, 1 690.endr 691 692.itx_8x8_epilog: 693 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 694 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 695 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 696 697 move t6, ra 698 jirl ra, t8, 0 699 move ra, t6 700 701 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 702 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 703 704 alsl.d t2, a1, a0, 1 705 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 706 add.d a0, a0, a1 707 alsl.d t2, a1, a0, 1 708 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 709.endif 710endfuncl 711.endm 712 713def_fn_8x8_base identity_ 714def_fn_8x8_base 715 716.macro fn8x8 txfm1, txfm2 717function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx 718.ifc \txfm1\()_\txfm2, dct_dct 719 bnez a3, .NO_HAS_DCONLY_8x8 720 721 idct_dc 8, 8, 1 722 723 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 724 725 add.d a0, a1, a0 726 alsl.d t2, a1, a0, 1 727 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 728 729 b .\txfm1\()_\txfm2\()_8X8_END 730.NO_HAS_DCONLY_8x8: 731.endif 732 la.local t8, inv_\txfm2\()_8h_x8_lsx 733.ifc \txfm1, identity 734 b inv_txfm_identity_add_8x8_lsx 735.else 736 la.local t7, inv_\txfm1\()_8h_x8_lsx 737 b inv_txfm_add_8x8_lsx 738.endif 739.\txfm1\()_\txfm2\()_8X8_END: 740endfunc 741.endm 742 743fn8x8 dct, dct 744fn8x8 identity, identity 745fn8x8 dct, adst 746fn8x8 dct, flipadst 747fn8x8 dct, identity 748fn8x8 adst, dct 749fn8x8 adst, adst 750fn8x8 adst, flipadst 751fn8x8 flipadst, dct 752fn8x8 flipadst, adst 753fn8x8 flipadst, flipadst 754fn8x8 identity, dct 755fn8x8 adst, identity 756fn8x8 flipadst, identity 757fn8x8 identity, adst 758fn8x8 identity, flipadst 759 760.macro rect2_lsx in0, in1, out0 761 vsllwil.w.h vr22, \in0, 0 // in1 762 vexth.w.h \in0, \in0 // in1 763 vmul.w vr22, vr22, \in1 764 vmul.w \out0, \in0, \in1 765 vssrarni.h.w \out0, vr22, 12 766.endm 767 768.macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ 769 out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5 770 vilvl.h \tmp0, \in1, \in0 771 vilvl.h \tmp1, \in3, \in2 772 vilvl.w \tmp2, \tmp1, \tmp0 773 vilvh.w \tmp3, \tmp1, \tmp0 774 vilvl.h \tmp0, \in5, \in4 775 vilvl.h \tmp1, \in7, \in6 776 vilvl.w \tmp4, \tmp1, \tmp0 777 vilvh.w \tmp5, \tmp1, \tmp0 778 vilvl.d \out0, \tmp4, \tmp2 779 vilvh.d \out1, \tmp4, \tmp2 780 vilvl.d \out2, \tmp5, \tmp3 781 vilvh.d \out3, \tmp5, \tmp3 782.endm 783 784functionl inv_txfm_add_8x4_lsx 785 vxor.v vr23, vr23, vr23 786 vld vr0, a2, 0 787 vld vr2, a2, 16 788 vld vr4, a2, 32 789 vld vr6, a2, 48 790.irp i, 0, 16, 32, 48 791 vst vr23, a2, \i 792.endr 793 794 li.w t0, 2896 795 vreplgr2vr.w vr23, t0 796 rect2_lsx vr0, vr23, vr0 797 rect2_lsx vr2, vr23, vr2 798 rect2_lsx vr4, vr23, vr4 799 rect2_lsx vr6, vr23, vr6 800 801 vilvh.d vr1, vr0, vr0 802 vilvh.d vr3, vr2, vr2 803 vilvh.d vr5, vr4, vr4 804 vilvh.d vr7, vr6, vr6 805 806 move t6, ra 807 jirl ra, t7, 0 808 move ra, t6 809 810 LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \ 811 vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21 812 813 move t6, ra 814 jirl ra, t8, 0 815 move ra, t6 816 817 vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4 818 819 alsl.d t2, a1, a0, 1 820 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 821endfuncl 822 823.macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \ 824 out5, out6, out7, tmp0, tmp1, tmp2, tmp3 825 vilvl.h \tmp0, \in1, \in0 826 vilvl.h \tmp1, \in3, \in2 827 vilvh.h \tmp2, \in1, \in0 828 vilvh.h \tmp3, \in3, \in2 829 vilvl.w \out0, \tmp1, \tmp0 830 vilvh.w \out2, \tmp1, \tmp0 831 vilvl.w \out4, \tmp3, \tmp2 832 vilvh.w \out6, \tmp3, \tmp2 833 834 vbsrl.v \out1, \out0, 8 835 vbsrl.v \out3, \out2, 8 836 vbsrl.v \out5, \out4, 8 837 vbsrl.v \out7, \out6, 8 838 vinsgr2vr.d \out0, zero, 1 839 vinsgr2vr.d \out2, zero, 1 840 vinsgr2vr.d \out4, zero, 1 841 vinsgr2vr.d \out6, zero, 1 842.endm 843 844functionl inv_txfm_add_4x8_lsx 845 vxor.v vr23, vr23, vr23 846 vld vr0, a2, 0 847 vld vr1, a2, 16 848 vld vr2, a2, 32 849 vld vr3, a2, 48 850.irp i, 0, 16, 32, 48 851 vst vr23, a2, \i 852.endr 853 854 li.w t0, 2896 855 vreplgr2vr.w vr23, t0 856 rect2_lsx vr0, vr23, vr0 857 rect2_lsx vr1, vr23, vr1 858 rect2_lsx vr2, vr23, vr2 859 rect2_lsx vr3, vr23, vr3 860 861 move t6, ra 862 jirl ra, t7, 0 863 move ra, t6 864 865 LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \ 866 vr6, vr7, vr16, vr17, vr18, vr19 867 868 move t6, ra 869 jirl ra, t8, 0 870 move ra, t6 871 872 vilvl.d vr0, vr1, vr0 873 vilvl.d vr1, vr3, vr2 874 vilvl.d vr2, vr5, vr4 875 vilvl.d vr3, vr7, vr6 876 877 vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4 878 879 alsl.d t2, a1, a0, 1 880 VLD_DST_ADD_W4 vr16, vr17 881 add.d a0, a1, a0 882 alsl.d t2, a1, a0, 1 883 VLD_DST_ADD_W4 vr18, vr19 884endfuncl 885 886.macro fn8x4 txfm1, txfm2 887function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx 888.ifc \txfm1()_\txfm2, dct_dct 889 bnez a3, .NO_HAS_DCONLY_8x4 890 891 idct_dc 8, 4, 0 892 893 DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5 894 895 b .\txfm1\()_\txfm2\()_8X4_END 896.NO_HAS_DCONLY_8x4: 897.endif 898 la.local t7, inv_\txfm1\()_4h_x8_lsx 899 la.local t8, inv_\txfm2\()_8h_x4_lsx 900 b inv_txfm_add_8x4_lsx 901.\txfm1\()_\txfm2\()_8X4_END: 902endfunc 903.endm 904 905fn8x4 dct, dct 906fn8x4 identity, identity 907fn8x4 dct, adst 908fn8x4 dct, flipadst 909fn8x4 dct, identity 910fn8x4 adst, dct 911fn8x4 adst, adst 912fn8x4 adst, flipadst 913fn8x4 flipadst, dct 914fn8x4 flipadst, adst 915fn8x4 flipadst, flipadst 916fn8x4 identity, dct 917fn8x4 adst, identity 918fn8x4 flipadst, identity 919fn8x4 identity, adst 920fn8x4 identity, flipadst 921 922.macro fn4x8 txfm1, txfm2 923function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx 924.ifc \txfm1()_\txfm2, dct_dct 925 bnez a3, .NO_HAS_DCONLY_4x8 926 927 idct_dc 4, 8, 0 928 929 DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20 930 931 add.d a0, a0, a1 932 alsl.d t2, a1, a0, 1 933 VLD_DST_ADD_W4 vr5, vr5 934 b .\txfm1\()_\txfm2\()_4X8_END 935.NO_HAS_DCONLY_4x8: 936.endif 937 la.local t7, inv_\txfm1\()_8h_x4_lsx 938 la.local t8, inv_\txfm2\()_4h_x8_lsx 939 b inv_txfm_add_4x8_lsx 940.\txfm1\()_\txfm2\()_4X8_END: 941endfunc 942.endm 943 944fn4x8 dct, dct 945fn4x8 identity, identity 946fn4x8 dct, adst 947fn4x8 dct, flipadst 948fn4x8 dct, identity 949fn4x8 adst, dct 950fn4x8 adst, adst 951fn4x8 adst, flipadst 952fn4x8 flipadst, dct 953fn4x8 flipadst, adst 954fn4x8 flipadst, flipadst 955fn4x8 identity, dct 956fn4x8 adst, identity 957fn4x8 flipadst, identity 958fn4x8 identity, adst 959fn4x8 identity, flipadst 960 961.macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1 962 vsllwil.w.h vr4, \in0, 0 963 vexth.w.h vr5, \in0 964 vsllwil.w.h vr6, \in1, 0 965 vexth.w.h vr7, \in1 966 vmul.w vr4, vr4, \in2 967 vmul.w vr5, vr5, \in2 968 vmul.w vr6, vr6, \in2 969 vmul.w vr7, vr7, \in2 970 vssrarni.h.w vr5, vr4, 12 971 vssrarni.h.w vr7, vr6, 12 972 vsadd.h \out0, vr5, \in3 973 vsadd.h \out1, vr7, \in4 974.endm 975 976.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1 977 vsllwil.w.h vr22, \in0, 0 978 vexth.w.h vr23, \in0 979 vmul.w \out0, vr22, \in2 980 vmul.w \out1, vr23, \in2 981 vsllwil.w.h vr22, \in1, 0 982 vexth.w.h vr23, \in1 983 vmadd.w \out0, vr22, \in3 984 vmadd.w \out1, vr23, \in3 985.endm 986 987.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1 988 vsllwil.w.h vr22, \in0, 0 989 vexth.w.h vr23, \in0 990 vmul.w \out0, vr22, \in2 991 vmul.w \out1, vr23, \in2 992 vsllwil.w.h vr22, \in1, 0 993 vexth.w.h vr23, \in1 994 vmsub.w \out0, vr22, \in3 995 vmsub.w \out1, vr23, \in3 996.endm 997 998.macro inv_dct16_lsx sz 999 inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz 1000 1001 la.local t0, idct_coeffs_h 1002 vldrepl.h vr20, t0, 16 // 401 1003 vldrepl.h vr21, t0, 18 // 4076 1004 vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz 1005 vneg.h vr21, vr21 1006 vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz 1007 vssrarni.h.w vr17, vr16, 12 // t15a 1008 vssrarni.h.w vr19, vr18, 12 // t8a 1009 vldrepl.h vr20, t0, 20 // 3166 -> 1583 1010 vldrepl.h vr21, t0, 22 // 2598 -> 1299 1011 vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz 1012 vneg.h vr21, vr21 1013 vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz 1014 vssrarni.h.w vr16, vr1, 12 // t14a 1015 vssrarni.h.w vr18, vr15, 12 // t9a 1016 vldrepl.h vr20, t0, 24 // 1931 1017 vldrepl.h vr21, t0, 26 // 3612 1018 vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz 1019 vneg.h vr21, vr21 1020 vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz 1021 vssrarni.h.w vr1, vr7, 12 // t13a 1022 vssrarni.h.w vr15, vr9, 12 // t10a 1023 vldrepl.h vr20, t0, 28 // 3920 1024 vldrepl.h vr21, t0, 30 // 1189 1025 vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz 1026 vneg.h vr21, vr21 1027 vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz 1028 vssrarni.h.w vr7, vr5, 12 // t12a 1029 vssrarni.h.w vr9, vr11, 12 // t11a 1030 1031 vsadd.h vr5, vr19, vr18 // t8 1032 vssub.h vr11, vr19, vr18 // t9 1033 vssub.h vr3, vr9, vr15 // t10 1034 vsadd.h vr13, vr9, vr15 // t11 1035 vsadd.h vr18, vr7, vr1 // t12 1036 vssub.h vr19, vr7, vr1 // t13 1037 vssub.h vr9, vr17, vr16 // t14 1038 vsadd.h vr15, vr17, vr16 // t15 1039 1040 vldrepl.h vr20, t0, 4 // 1567 1041 vldrepl.h vr21, t0, 6 // 3784 1042 vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz 1043 vneg.h vr21, vr21 1044 vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz 1045 vssrarni.h.w vr16, vr1, 12 // t14a 1046 vssrarni.h.w vr17, vr7, 12 // t9a 1047 1048 vneg.h vr21, vr21 1049 vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz 1050 vneg.h vr21, vr21 1051 vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz 1052 vneg.w vr1, vr1 1053 vneg.w vr9, vr9 1054 vssrarni.h.w vr7, vr11, 12 // t13a 1055 vssrarni.h.w vr1, vr9, 12 // t10a 1056 vsadd.h vr9, vr5, vr13 // t8a 1057 vssub.h vr11, vr5, vr13 // t11a 1058 vssub.h vr3, vr15, vr18 // t12a 1059 vsadd.h vr19, vr15, vr18 // t15a 1060 vsadd.h vr5, vr17, vr1 // t9 1061 vssub.h vr13, vr17, vr1 // t10 1062 vssub.h vr15, vr16, vr7 // t13 1063 vsadd.h vr18, vr16, vr7 // t14 1064 1065 vldrepl.h vr20, t0, 0 // 2896 1066 vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz 1067 vneg.h vr21, vr20 1068 vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz 1069 vssrarni.h.w vr7, vr1, 12 // t13a 1070 vssrarni.h.w vr16, vr17, 12 // t10a 1071 1072 vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz 1073 vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz 1074 vssrarni.h.w vr23, vr13, 12 // t12 1075 vssrarni.h.w vr17, vr15, 12 // t11 1076 1077 vssub.h vr15, vr0, vr19 // c[15] 1078 vsadd.h vr0, vr0, vr19 // c[0] 1079 vsadd.h vr1, vr2, vr18 // c[1] 1080 vssub.h vr20, vr2, vr18 // c[14] 1081 vsadd.h vr2, vr4, vr7 // c[2] 1082 vssub.h vr13, vr4, vr7 // c[13] 1083 vsadd.h vr3, vr6, vr23 // c[3] 1084 vssub.h vr21, vr6, vr23 // c[12] 1085 vsadd.h vr4, vr8, vr17 // c[4] 1086 vssub.h vr11, vr8, vr17 // c[11] 1087 vsadd.h vr7, vr14, vr9 // c[7] 1088 vssub.h vr8, vr14, vr9 // c[8] 1089 vsadd.h vr6, vr12, vr5 // c[6] 1090 vssub.h vr9, vr12, vr5 // c[9] 1091 vsadd.h vr5, vr10, vr16 // c[5] 1092 vssub.h vr10, vr10, vr16 // c[10] 1093 vor.v vr14, vr20, vr20 1094 vor.v vr12, vr21, vr21 1095.endm 1096 1097functionl inv_dct_8h_x16_lsx 1098 inv_dct16_lsx .8h 1099endfuncl 1100 1101functionl inv_dct_4h_x16_lsx 1102 inv_dct16_lsx .4h 1103endfuncl 1104 1105.macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6 ,in7 1106 alsl.d t2, a1, a0, 1 1107 1108 VLD_DST_ADD_W4 \in0, \in1 1109 1110 add.d a0, a1, a0 1111 alsl.d t2, a1, a0, 1 1112 VLD_DST_ADD_W4 \in2, \in3 1113 1114 add.d a0, a1, a0 1115 alsl.d t2, a1, a0, 1 1116 VLD_DST_ADD_W4 \in4, \in5 1117 1118 add.d a0, a1, a0 1119 alsl.d t2, a1, a0, 1 1120 VLD_DST_ADD_W4 \in6, \in7 1121.endm 1122 1123.macro def_fn_4x16_base txfm 1124functionl inv_txfm_\txfm\()add_4x16_lsx 1125 PUSH_REG 1126 blt a3, t5, 416f 1127 vld vr0, a2, 16 1128 vld vr1, a2, 48 1129 vld vr2, a2, 80 1130 vld vr3, a2, 112 1131 vxor.v vr23, vr23, vr23 1132.irp i, 16, 48, 80, 112 1133 vst vr23, a2, \i 1134.endr 1135 1136 move t6, ra 1137 jirl ra, t7, 0 1138 move ra, t6 1139 1140.ifnc \txfm, identity_ 1141 vsrari.h vr0, vr0, 1 1142 vsrari.h vr1, vr1, 1 1143 vsrari.h vr2, vr2, 1 1144 vsrari.h vr3, vr3, 1 1145.endif 1146 1147 LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \ 1148 vr27, vr14, vr28, vr10, vr11, vr12, vr13 1149 1150416: 1151 ble t5, a3, 416416f 1152.irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28 1153 vxor.v \i, \i, \i 1154.endr 1155 1156416416: 1157 vld vr0, a2, 0 1158 vld vr1, a2, 32 1159 vld vr2, a2, 64 1160 vld vr3, a2, 96 1161 vxor.v vr23, vr23, vr23 1162.irp i, 0, 32, 64, 96 1163 vst vr23, a2, \i 1164.endr 1165 1166 move t6, ra 1167 jirl ra, t7, 0 1168 move ra, t6 1169 1170.ifnc \txfm, identity_ 1171 vsrari.h vr0, vr0, 1 1172 vsrari.h vr1, vr1, 1 1173 vsrari.h vr2, vr2, 1 1174 vsrari.h vr3, vr3, 1 1175.endif 1176 1177 LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \ 1178 vr6, vr7, vr16, vr17, vr18, vr19 1179 1180 vor.v vr10, vr24, vr24 1181 vor.v vr11, vr25, vr25 1182 vor.v vr12, vr26, vr26 1183 vor.v vr13, vr27, vr27 1184 vor.v vr15, vr28, vr28 1185 1186 move t6, ra 1187 jirl ra, t8, 0 1188 move ra, t6 1189 1190 vilvl.d vr16, vr1, vr0 1191 vilvl.d vr17, vr3, vr2 1192 vilvl.d vr18, vr5, vr4 1193 vilvl.d vr19, vr7, vr6 1194 vilvl.d vr20, vr9, vr8 1195 vilvl.d vr21, vr11, vr10 1196 vilvl.d vr22, vr13, vr12 1197 vilvl.d vr23, vr15, vr14 1198 1199.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1200 vsrari.h \i, \i, 4 1201.endr 1202 1203 VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1204 POP_REG 1205endfuncl 1206.endm 1207 1208def_fn_4x16_base identity_ 1209def_fn_4x16_base 1210 1211.macro fn4x16 txfm1, txfm2, eob_half 1212function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx 1213.ifc \txfm1()_\txfm2, dct_dct 1214 bnez a3, .NO_HAS_DCONLY_4x16 1215 1216 idct_dc 4, 16, 1 1217 1218 DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5 1219 1220.rept 3 1221 add.d a0, a1, a0 1222 alsl.d t2, a1, a0, 1 1223 1224 VLD_DST_ADD_W4 vr5, vr5 1225.endr 1226 b .\txfm1\()_\txfm2\()_4X16_END 1227 1228.NO_HAS_DCONLY_4x16: 1229.endif 1230 li.w t5, \eob_half 1231 la.local t7, inv_\txfm1\()_8h_x4_lsx 1232.ifc \txfm1, identity 1233 la.local t7, inv_\txfm1\()_8h_x4_lsx1 1234.endif 1235 la.local t8, inv_\txfm2\()_4h_x16_lsx 1236 1237.ifc \txfm1, identity 1238 b inv_txfm_identity_add_4x16_lsx 1239.else 1240 b inv_txfm_add_4x16_lsx 1241.endif 1242.\txfm1\()_\txfm2\()_4X16_END: 1243endfunc 1244.endm 1245 1246fn4x16 dct, dct, 29 1247fn4x16 identity, identity, 29 1248fn4x16 dct, adst, 29 1249fn4x16 dct, flipadst, 29 1250fn4x16 dct, identity, 8 1251fn4x16 adst, dct, 29 1252fn4x16 adst, adst, 29 1253fn4x16 adst, flipadst, 29 1254fn4x16 flipadst, dct, 29 1255fn4x16 flipadst, adst, 29 1256fn4x16 flipadst, flipadst, 29 1257fn4x16 identity, dct, 32 1258fn4x16 adst, identity, 8 1259fn4x16 flipadst, identity, 8 1260fn4x16 identity, adst, 32 1261fn4x16 identity, flipadst, 32 1262 1263.macro inv_identity16_lsx in0, in1, in2, out0, sz 1264.ifc \sz, .8h 1265 vsllwil.w.h vr16, \in0, 0 1266 vexth.w.h vr17, \in0 1267 vmul.w vr16, vr16, \in1 1268 vmul.w vr17, vr17, \in1 1269 vsadd.h \in2, \in2, \in2 1270 vssrarni.h.w vr17, vr16, 11 1271 vsadd.h \out0, vr17, \in2 1272.else 1273 vsllwil.w.h vr16, \in0, 0 1274 vmul.w vr16, vr16, \in1 1275 vsadd.h \in2, \in2, \in2 1276 vssrarni.h.w vr16, vr16, 11 1277 vsadd.h \out0, vr16, \in2 1278.endif 1279.endm 1280 1281.macro inv_identity16_lsx1 in0, in1, in2, out0 1282 vsllwil.w.h vr16, \in0, 0 1283 vexth.w.h vr17, \in1 1284 vmul.w vr18, vr16, \in2 1285 vmul.w vr19, vr17, \in2 1286 vsrari.w vr18, vr18, 11 1287 vsrari.w vr19, vr19, 11 1288 vslli.w vr16, vr16, 1 1289 vslli.w vr17, vr17, 1 1290 vadd.w vr16, vr18, vr16 1291 vadd.w \out0, vr19, vr17 1292 vssrarni.h.w \out0, vr16, 1 1293.endm 1294 1295functionl inv_identity_8h_x16_lsx 1296 li.w t0, 1697 1297 vreplgr2vr.w vr20, t0 1298.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ 1299 vr9, vr10, vr11, vr12, vr13, vr14, vr15 1300 inv_identity16_lsx \i, vr20, \i, \i, .8h 1301.endr 1302endfuncl 1303 1304functionl inv_identity_4h_x16_lsx 1305 li.w t0, 1697 1306 vreplgr2vr.w vr20, t0 1307.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ 1308 vr9, vr10, vr11, vr12, vr13, vr14, vr15 1309 inv_identity16_lsx \i, vr20, \i, \i, .4h 1310.endr 1311endfuncl 1312 1313functionl inv_identity_8h_x16_lsx1 1314 li.w t0, 1697 1315 vreplgr2vr.w vr20, t0 1316.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \ 1317 vr9, vr10, vr11, vr12, vr13, vr14, vr15 1318 inv_identity16_lsx1 \i, \i, vr20, \i 1319.endr 1320endfuncl 1321 1322const iadst16_coeffs_h, align=4 1323 .short 4091, 201, 3973, 995 1324 .short 3703, 1751, 3290, 2440 1325 .short 2751, 3035, 2106, 3513 1326 .short 1380, 3857, 601, 4052 1327endconst 1328 1329.macro inv_adst16_lsx txfm, sz 1330 la.local t0, iadst16_coeffs_h 1331 vldrepl.h vr20, t0, 0 // 4091 1332 vldrepl.h vr21, t0, 2 // 201 1333 vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz 1334 vneg.h vr20, vr20 1335 vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz 1336 vssrarni.h.w vr18, vr16, 12 // t0 1337 vssrarni.h.w vr19, vr17, 12 // t1 1338 vldrepl.h vr20, t0, 4 // 3973 1339 vldrepl.h vr21, t0, 6 // 995 1340 vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz 1341 vneg.h vr20, vr20 1342 vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz 1343 vssrarni.h.w vr0, vr16, 12 // t2 1344 vssrarni.h.w vr15, vr17, 12 // t3 1345 vldrepl.h vr20, t0, 8 // 3703 1346 vldrepl.h vr21, t0, 10 // 1751 1347 vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz 1348 vneg.h vr20, vr20 1349 vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz 1350 vssrarni.h.w vr2, vr16, 12 // t4 1351 vssrarni.h.w vr13, vr17, 12 // t5 1352 vldrepl.h vr20, t0, 12 // 3290 -> 1645 1353 vldrepl.h vr21, t0, 14 // 2440 -> 1220 1354 vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz 1355 vneg.h vr20, vr20 1356 vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz 1357 vssrarni.h.w vr4, vr16, 12 // t6 1358 vssrarni.h.w vr11, vr17, 12 // t7 1359 vldrepl.h vr20, t0, 16 // 2751 1360 vldrepl.h vr21, t0, 18 // 3035 1361 vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz 1362 vneg.h vr20, vr20 1363 vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz 1364 vssrarni.h.w vr6, vr16, 12 // t8 1365 vssrarni.h.w vr9, vr17, 12 // t9 1366 vldrepl.h vr20, t0, 20 // 2106 1367 vldrepl.h vr21, t0, 22 // 3513 1368 vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz 1369 vneg.h vr20, vr20 1370 vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz 1371 vssrarni.h.w vr7, vr16, 12 // t10 1372 vssrarni.h.w vr8, vr17, 12 // t11 1373 vldrepl.h vr20, t0, 24 // 1380 1374 vldrepl.h vr21, t0, 26 // 3857 1375 vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz 1376 vneg.h vr20, vr20 1377 vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz 1378 vssrarni.h.w vr5, vr16, 12 // t12 1379 vssrarni.h.w vr10, vr17, 12 // t13 1380 vldrepl.h vr20, t0, 28 // 601 1381 vldrepl.h vr21, t0, 30 // 4052 1382 vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz 1383 vneg.h vr20, vr20 1384 vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz 1385 vssrarni.h.w vr3, vr16, 12 // t14 1386 vssrarni.h.w vr12, vr17, 12 // t15 1387 1388 vsadd.h vr1, vr18, vr6 // t0a 1389 vssub.h vr14, vr18, vr6 // t8a 1390 vsadd.h vr16, vr19, vr9 // t1a 1391 vssub.h vr17, vr19, vr9 // t9a 1392 vsadd.h vr6, vr0, vr7 // t2a 1393 vssub.h vr18, vr0, vr7 // t10a 1394 vsadd.h vr9, vr15, vr8 // t3a 1395 vssub.h vr19, vr15, vr8 // t11a 1396 vsadd.h vr0, vr2, vr5 // t4a 1397 vssub.h vr7, vr2, vr5 // t12a 1398 vsadd.h vr8, vr13, vr10 // t5a 1399 vssub.h vr15, vr13, vr10 // t13a 1400 vsadd.h vr2, vr4, vr3 // t6a 1401 vssub.h vr5, vr4, vr3 // t14a 1402 vsadd.h vr10, vr11, vr12 // t7a 1403 vssub.h vr13, vr11, vr12 // t15a 1404 1405 la.local t0, idct_coeffs_h 1406 1407 vldrepl.h vr20, t0, 8 // 799 1408 vldrepl.h vr21, t0, 10 // 4017 1409 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz 1410 vneg.h vr21, vr21 1411 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz 1412 vssrarni.h.w vr11, vr3, 12 // t8 1413 vssrarni.h.w vr12, vr4, 12 // t9 1414 vneg.h vr21, vr21 1415 vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz 1416 vneg.h vr20, vr20 1417 vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz 1418 vssrarni.h.w vr14, vr3, 12 // t13 1419 vssrarni.h.w vr17, vr4, 12 // t12 1420 vldrepl.h vr20, t0, 12 // 3406 1421 vldrepl.h vr21, t0, 14 // 2276 1422 vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz 1423 vneg.h vr21, vr21 1424 vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz 1425 vssrarni.h.w vr7, vr3, 12 // t10 1426 vssrarni.h.w vr15, vr4, 12 // t11 1427 vneg.h vr21, vr21 1428 vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz 1429 vneg.h vr20, vr20 1430 vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz 1431 vssrarni.h.w vr18, vr3, 12 // t15 1432 vssrarni.h.w vr19, vr4, 12 // t14 1433 1434 vsadd.h vr5, vr1, vr0 // t0 1435 vssub.h vr13, vr1, vr0 // t4 1436 vsadd.h vr3, vr16, vr8 // t1 1437 vssub.h vr4, vr16, vr8 // t5 1438 vsadd.h vr0, vr6, vr2 // t2 1439 vssub.h vr1, vr6, vr2 // t6 1440 vsadd.h vr8, vr9, vr10 // t3 1441 vssub.h vr16, vr9, vr10 // t7 1442 vsadd.h vr2, vr11, vr17 // t8a 1443 vssub.h vr6, vr11, vr17 // t12a 1444 vsadd.h vr9, vr12, vr14 // t9a 1445 vssub.h vr10, vr12, vr14 // t13a 1446 vsadd.h vr11, vr7, vr19 // t10a 1447 vssub.h vr17, vr7, vr19 // t14a 1448 vsadd.h vr12, vr15, vr18 // t11a 1449 vssub.h vr14, vr15, vr18 // t15a 1450 1451 vldrepl.h vr20, t0, 4 // 1567 1452 vldrepl.h vr21, t0, 6 // 3784 1453 vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz 1454 vneg.h vr21, vr21 1455 vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz 1456 vssrarni.h.w vr18, vr7, 12 // t4a 1457 vssrarni.h.w vr19, vr15, 12 // t5a 1458 vneg.h vr21, vr21 1459 vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz 1460 vneg.h vr20, vr20 1461 vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz 1462 vssrarni.h.w vr4, vr7, 12 // t7a 1463 vssrarni.h.w vr13, vr15, 12 // t6a 1464 vneg.h vr20, vr20 1465 vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz 1466 vneg.h vr21, vr21 1467 vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz 1468 vssrarni.h.w vr1, vr7, 12 // t12 1469 vssrarni.h.w vr16, vr15, 12 // t13 1470 vneg.h vr21, vr21 1471 vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz 1472 vneg.h vr20, vr20 1473 vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz 1474 vssrarni.h.w vr6, vr7, 12 // t15 1475 vssrarni.h.w vr10, vr15, 12 // t14 1476 1477 vssub.h vr17, vr5, vr0 // t2a 1478 vsadd.h vr14, vr5, vr0 // out[0] 1479 vssub.h vr7, vr3, vr8 // t3a 1480 vsadd.h vr15, vr3, vr8 // out[15] 1481 vsllwil.w.h vr22, vr15, 0 1482 vexth.w.h vr15, vr15 1483 vneg.w vr22, vr22 1484 vneg.w vr15, vr15 1485 vssrarni.h.w vr15, vr22, 0 // out[15] 1486 1487 vsadd.h vr3, vr19, vr4 // out[12] 1488 vssub.h vr8, vr19, vr4 // t7 1489 vssub.h vr0, vr18, vr13 // t6 1490 vsadd.h vr5, vr18, vr13 // out[3] 1491 vsllwil.w.h vr22, vr5, 0 1492 vexth.w.h vr5, vr5 1493 vneg.w vr22, vr22 1494 vneg.w vr5, vr5 1495 vssrarni.h.w vr5, vr22, 0 // out[3] 1496 1497 vsadd.h vr13, vr9, vr12 // out[14] 1498 vssub.h vr19, vr9, vr12 // t11 1499 vssub.h vr4, vr2, vr11 // t10 1500 vsadd.h vr18, vr2, vr11 // out[1] 1501 vsllwil.w.h vr22, vr18, 0 1502 vexth.w.h vr18, vr18 1503 vneg.w vr22, vr22 1504 vneg.w vr18, vr18 1505 vssrarni.h.w vr18, vr22, 0 // out[1] 1506 1507 vsadd.h vr2, vr1, vr10 // out[2] 1508 vssub.h vr11, vr1, vr10 // t14a 1509 vssub.h vr12, vr16, vr6 // t15a 1510 vsadd.h vr9, vr16, vr6 // out[13] 1511 vsllwil.w.h vr22, vr9, 0 1512 vexth.w.h vr9, vr9 1513 vneg.w vr22, vr22 1514 vneg.w vr9, vr9 1515 vssrarni.h.w vr9, vr22, 0 // out[13] 1516 1517 vldrepl.h vr20, t0, 0 // 2896 1518 vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz 1519 vneg.h vr21, vr20 1520 vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz 1521 vssrarni.h.w vr1, vr16, 12 // out[8] 1522 vsrari.w vr6, vr6, 12 1523 vsrari.w vr10, vr10, 12 1524 vneg.w vr6, vr6 1525 vneg.w vr10, vr10 1526 vssrarni.h.w vr10, vr6, 0 // out[7] 1527 vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz 1528 vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz 1529 vssrarni.h.w vr7, vr6, 12 // out[4] 1530 vsrari.w vr16, vr16, 12 1531 vsrari.w vr17, vr17, 12 1532 vneg.w vr16, vr16 1533 vneg.w vr17, vr17 1534 vssrarni.h.w vr17, vr16, 0 // out[11] 1535 1536 vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz 1537 vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz 1538 vssrarni.h.w vr8, vr6, 12 // out[6] 1539 vsrari.w vr16, vr16, 12 1540 vsrari.w vr0, vr0, 12 1541 vneg.w vr16, vr16 1542 vneg.w vr0, vr0 1543 vssrarni.h.w vr0, vr16, 0 // out[9] 1544 1545 vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz 1546 vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz 1547 vssrarni.h.w vr19, vr16, 12 // out[10] 1548 vsrari.w vr6, vr6, 12 1549 vsrari.w vr4, vr4, 12 1550 vneg.w vr6, vr6 1551 vneg.w vr4, vr4 1552 vssrarni.h.w vr4, vr6, 0 // out[5] 1553 1554.ifc \txfm, adst 1555 vor.v vr12, vr3, vr3 1556 vor.v vr3, vr5, vr5 1557 vor.v vr5, vr4, vr4 1558 vor.v vr4, vr7, vr7 1559 vor.v vr7, vr10, vr10 1560 vor.v vr10, vr19, vr19 1561 vor.v vr6, vr8, vr8 1562 vor.v vr8, vr1, vr1 1563 vor.v vr11, vr17, vr17 1564 vor.v vr20, vr13, vr13 1565 vor.v vr13, vr9, vr9 1566 vor.v vr9, vr0, vr0 1567 vor.v vr0, vr14, vr14 1568 vor.v vr14, vr20, vr20 1569 vor.v vr1, vr18, vr18 1570.else 1571 vor.v vr6, vr0, vr0 1572 vor.v vr0, vr15, vr15 1573 vor.v vr15, vr14, vr14 1574 vor.v vr14, vr18, vr18 1575 vor.v vr11, vr7, vr7 1576 vor.v vr7, vr1, vr1 1577 vor.v vr1, vr13, vr13 1578 vor.v vr13, vr2, vr2 1579 vor.v vr2, vr9, vr9 1580 vor.v vr9, vr8, vr8 1581 vor.v vr8, vr10, vr10 1582 vor.v vr10, vr4, vr4 1583 vor.v vr4, vr17, vr17 1584 vor.v vr12, vr5, vr5 1585 vor.v vr5, vr19, vr19 1586.endif 1587.endm // inv_adst16_lsx 1588 1589functionl inv_adst_8h_x16_lsx 1590 inv_adst16_lsx adst, 8h 1591endfuncl 1592 1593functionl inv_flipadst_8h_x16_lsx 1594 inv_adst16_lsx flipadst, 8h 1595endfuncl 1596 1597functionl inv_adst_4h_x16_lsx 1598 inv_adst16_lsx adst, 4h 1599endfuncl 1600 1601functionl inv_flipadst_4h_x16_lsx 1602 inv_adst16_lsx flipadst, 4h 1603endfuncl 1604 1605.macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \ 1606 in9, in10, in11, in12, in13, in14, in15 1607 1608 alsl.d t2, a1, a0, 1 1609 VLD_DST_ADD_W8 \in0, \in1, \in2, \in3 1610 1611 add.d a0, a1, a0 1612 alsl.d t2, a1, a0, 1 1613 VLD_DST_ADD_W8 \in4, \in5, \in6, \in7 1614 1615 add.d a0, a1, a0 1616 alsl.d t2, a1, a0, 1 1617 VLD_DST_ADD_W8 \in8, \in9, \in10, \in11 1618 1619 add.d a0, a1, a0 1620 alsl.d t2, a1, a0, 1 1621 VLD_DST_ADD_W8 \in12, \in13, \in14, \in15 1622.endm 1623 1624.macro def_base_8x16 txfm1 1625functionl inv_txfm_\txfm1\()add_8x16_lsx 1626 blt a3, t5, 816f 1627 vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1628 vxor.v vr23, vr23, vr23 1629.irp i, 16, 48, 80, 112, 144, 176, 208, 240 1630 vst vr23, a2, \i 1631.endr 1632 1633 li.w t0, 2896 1634 vreplgr2vr.w vr23, t0 1635.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1636 rect2_lsx \i, vr23, \i 1637.endr 1638 1639.ifc \txfm1, identity_ 1640 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1641 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1642 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1643.else 1644 move t6, ra 1645 jirl ra, t7, 0 1646 move ra, t6 1647 1648 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1649 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1 1650 1651 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1652 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1653 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1654.endif 1655 1656816: 1657 ble t5, a3, 816816f 1658.irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1659 vxor.v \i, \i, \i 1660.endr 1661 1662816816: 1663 vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1664 vxor.v vr23, vr23, vr23 1665.irp i, 0, 32, 64, 96, 128, 160, 192, 224 1666 vst vr23, a2, \i 1667.endr 1668 1669 li.w t0, 2896 1670 vreplgr2vr.w vr23, t0 1671.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1672 rect2_lsx \i, vr23, \i 1673.endr 1674 1675.ifc \txfm1, identity_ 1676 1677.else 1678 move t6, ra 1679 jirl ra, t7, 0 1680 move ra, t6 1681 1682.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1683 vsrari.h \i, \i, 1 1684.endr 1685.endif 1686 1687 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1688 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1689 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1690 1691 move t6, ra 1692 jirl ra, t8, 0 1693 move ra, t6 1694 1695 vor.v vr0, vr0, vr0 1696 vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1697 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4 1698 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1699 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 1700 1701 VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1702 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1703endfuncl 1704.endm 1705 1706def_base_8x16 identity_ 1707def_base_8x16 1708 1709.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11 1710 vsllwil.hu.bu vr4, \in0, 0 1711 vexth.hu.bu vr0, \in0 1712 vsllwil.hu.bu vr5, \in1, 0 1713 vexth.hu.bu vr1, \in1 1714 vsllwil.hu.bu vr6, \in2, 0 1715 vexth.hu.bu vr2, \in2 1716 vsllwil.hu.bu vr7, \in3, 0 1717 vexth.hu.bu vr3, \in3 1718 vadd.h vr4, vr4, \in4 1719 vadd.h vr0, vr0, \in5 1720 vadd.h vr5, vr5, \in6 1721 vadd.h vr1, vr1, \in7 1722 vadd.h vr6, vr6, \in8 1723 vadd.h vr2, vr2, \in9 1724 vadd.h vr7, vr7, \in10 1725 vadd.h vr3, vr3, \in11 1726 vssrani.bu.h vr0, vr4, 0 1727 vssrani.bu.h vr1, vr5, 0 1728 vssrani.bu.h vr2, vr6, 0 1729 vssrani.bu.h vr3, vr7, 0 1730 vst vr0, a0, 0 1731 vstx vr1, a0, a1 1732 vst vr2, t2, 0 1733 vstx vr3, t2, a1 1734.endm 1735 1736.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7 1737 vld vr0, a0, 0 1738 vldx vr1, a0, a1 1739 vld vr2, t2, 0 1740 vldx vr3, t2, a1 1741 DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \ 1742 \in4, \in5, \in6, \in7 1743.endm 1744 1745.macro def_fn_16x8 txfm1 1746functionl inv_txfm_\txfm1\()add_16x8_lsx 1747 PUSH_REG 1748 1749 vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1750 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1751 vxor.v vr23, vr23, vr23 1752.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \ 1753 176, 192, 208, 224, 240 1754 vst vr23, a2, \i 1755.endr 1756 1757 li.w t0, 2896 1758 vreplgr2vr.w vr23, t0 1759.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1760 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1761 rect2_lsx \i, vr23, \i 1762.endr 1763 1764 move t6, ra 1765 jirl ra, t7, 0 1766 move ra, t6 1767 1768.ifnc \txfm1, identity_ 1769.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1770 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1771 vsrari.h \i, \i, 1 1772.endr 1773.endif 1774 1775 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1776 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1777 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1778 1779 move t6, ra 1780 jirl ra, t8, 0 1781 move ra, t6 1782 1783 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1784 vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4 1785 1786 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1787 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1788 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1789 move t6, ra 1790 jirl ra, t8, 0 1791 move ra, t6 1792 1793 vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1794 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4 1795 1796 alsl.d t2, a1, a0, 1 1797 VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11 1798 1799 alsl.d a0, a1, a0, 2 1800 alsl.d t2, a1, a0, 1 1801 VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15 1802 1803 POP_REG 1804endfuncl 1805.endm 1806 1807def_fn_16x8 identity_ 1808def_fn_16x8 1809 1810.macro fun16x8 txfm1, txfm2 1811function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx 1812.ifc \txfm1\()_\txfm2, dct_dct 1813 bnez a3, .NO_HAS_DCONLY_16x8 1814 1815 idct_dc 16, 8, 1 1816 1817 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ 1818 vr20, vr20, vr20, vr20, vr20 1819 1820 alsl.d a0, a1, a0, 2 1821 alsl.d t2, a1, a0, 1 1822 VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20, 1823 b .\txfm1\()_\txfm2\()_16x8_END 1824.NO_HAS_DCONLY_16x8: 1825.endif 1826 1827 la.local t7, inv_\txfm1\()_8h_x16_lsx 1828.ifc \txfm1, identity 1829 la.local t7, inv_identity_8h_x16_lsx1 1830.endif 1831 1832 la.local t8, inv_\txfm2\()_8h_x8_lsx 1833 1834.ifc \txfm1, identity 1835 b inv_txfm_identity_add_16x8_lsx 1836.else 1837 b inv_txfm_add_16x8_lsx 1838.endif 1839 1840.\txfm1\()_\txfm2\()_16x8_END: 1841endfunc 1842.endm 1843 1844fun16x8 dct, dct 1845fun16x8 identity, identity 1846fun16x8 dct, adst 1847fun16x8 dct, flipadst 1848fun16x8 dct, identity 1849fun16x8 adst, dct 1850fun16x8 adst, adst 1851fun16x8 adst, flipadst 1852fun16x8 flipadst, dct 1853fun16x8 flipadst, adst 1854fun16x8 flipadst, flipadst 1855fun16x8 identity, dct 1856fun16x8 adst, identity 1857fun16x8 flipadst, identity 1858fun16x8 identity, adst 1859fun16x8 identity, flipadst 1860 1861.macro fun8x16 txfm1, txfm2, eob_half 1862function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx 1863.ifc \txfm1\()_\txfm2, dct_dct 1864 bnez a3, .NO_HAS_DCONLY_8x16 1865 1866 idct_dc 8, 16, 1 1867 1868 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 1869.rept 3 1870 add.d a0, a1, a0 1871 alsl.d t2, a1, a0, 1 1872 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 1873.endr 1874 1875 b .\txfm1\()_\txfm2\()_8x16_END 1876.NO_HAS_DCONLY_8x16: 1877.endif 1878 li.w t5, \eob_half 1879.ifnc \txfm1, identity 1880 la.local t7, inv_\txfm1\()_8h_x8_lsx 1881.endif 1882 1883 la.local t8, inv_\txfm2\()_8h_x16_lsx 1884.ifc \txfm1, identity 1885 b inv_txfm_identity_add_8x16_lsx 1886.else 1887 b inv_txfm_add_8x16_lsx 1888.endif 1889.\txfm1\()_\txfm2\()_8x16_END: 1890endfunc 1891.endm 1892 1893fun8x16 dct, dct, 43 1894fun8x16 identity, identity, 43 1895fun8x16 dct, adst, 43 1896fun8x16 dct, flipadst, 43 1897fun8x16 dct, identity, 8 1898fun8x16 adst, dct, 43 1899fun8x16 adst, adst, 43 1900fun8x16 adst, flipadst, 43 1901fun8x16 flipadst, dct, 43 1902fun8x16 flipadst, adst, 43 1903fun8x16 flipadst, flipadst, 43 1904fun8x16 identity, dct, 64 1905fun8x16 adst, identity, 8 1906fun8x16 flipadst, identity, 8 1907fun8x16 identity, adst, 64 1908fun8x16 identity, flipadst, 64 1909 1910functionl inv_txfm_add_16x16_lsx 1911 malloc_space 512 1912 1913 addi.d t1, sp, 64 1914 addi.d t2, a2, 0 1915.rept 2 1916 vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1917 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1918 1919 vxor.v vr23, vr23, vr23 1920.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \ 1921 384, 416, 448, 480 1922 vst vr23, a2, \i 1923.endr 1924 1925 move t6, ra 1926 jirl ra, t7, 0 1927 move ra, t6 1928 1929 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1930 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1931 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1932 1933 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1934 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 1935 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1936 1937.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1938 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1939 vsrari.h \i, \i, 2 1940.endr 1941 vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 1942 vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1943 addi.d t1, t1, 256 1944 addi.d a2, a2, 16 1945 blt a3, t5, 1616f 1946.endr 1947 19481616: 1949 ble t5, a3, 16161616f 1950 addi.d t1, sp, 320 1951 vxor.v vr23, vr23, vr23 1952.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \ 1953 240 1954 vst vr23, t1, \i 1955.endr 1956 195716161616: 1958 addi.d t1, sp, 64 1959.rept 2 1960 vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1961 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1962 1963 move t6, ra 1964 jirl ra, t8, 0 1965 move ra, t6 1966 1967 vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 1968 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 1969 1970 addi.d t1, t1, 16 1971.endr 1972 alsl.d t2, a1, a0, 1 1973 addi.d t1, sp, 64 1974.rept 4 1975 vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1976 vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ 1977 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4 1978 VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 1979 alsl.d a0, a1, a0, 2 1980 alsl.d t2, a1, a0, 1 1981 addi.d t1, t1, 128 1982.endr 1983 free_space 512 1984endfuncl 1985 1986.macro fun16x16 txfm1, txfm2, eob_half 1987function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx 1988.ifc \txfm1\()_\txfm2, dct_dct 1989 bnez a3, .NO_HAS_DCONLY_16x16 1990 1991 idct_dc 16, 16, 2 1992 1993 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ 1994 vr20, vr20, vr20, vr20, vr20 1995.rept 3 1996 alsl.d a0, a1, a0, 2 1997 alsl.d t2, a1, a0, 1 1998 1999 VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2000.endr 2001 b .\txfm1\()_\txfm2\()_16x16_END 2002.NO_HAS_DCONLY_16x16: 2003.endif 2004 li.w t5, \eob_half 2005 la.local t7, inv_\txfm1\()_8h_x16_lsx 2006 la.local t8, inv_\txfm2\()_8h_x16_lsx 2007 2008 b inv_txfm_add_16x16_lsx 2009.\txfm1\()_\txfm2\()_16x16_END: 2010endfunc 2011.endm 2012 2013fun16x16 dct, dct, 36 2014fun16x16 adst, adst, 36 2015fun16x16 adst, dct, 36 2016fun16x16 dct, adst, 36 2017fun16x16 flipadst, dct, 36 2018fun16x16 dct, flipadst, 36 2019fun16x16 adst, flipadst, 36 2020fun16x16 flipadst, adst, 36 2021 2022.macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \ 2023 vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \ 2024 transpose8x8, shift 2025 la.local t0, idct_coeffs 2026 vldrepl.w vr20, t0, 64 // 201 2027 vldrepl.w vr21, t0, 68 // 4091 2028 vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9 2029 vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10 2030 vssrarni.h.w vr9, vr8, 12 // t31a 2031 vssrarni.h.w vr10, vr11, 12 // t16a 2032 vldrepl.w vr20, t0, 72 // 3035 2033 vldrepl.w vr21, t0, 76 // 2751 2034 vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0 2035 vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30 2036 vssrarni.h.w vr0, vr8, 12 // t30a 2037 vssrarni.h.w vr30, vr11, 12 // t17a 2038 vldrepl.w vr20, t0, 80 // 1751 2039 vldrepl.w vr21, t0, 84 // 3703 2040 vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7 2041 vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19 2042 vssrarni.h.w vr7, vr8, 12 // t29a 2043 vssrarni.h.w vr19, vr11, 12 // t18a 2044 vldrepl.w vr20, t0, 88 // 3857 2045 vldrepl.w vr21, t0, 92 // 1380 2046 vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4 2047 vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26 2048 vssrarni.h.w vr4, vr8, 12 // t28a 2049 vssrarni.h.w vr26, vr11, 12 // t19a 2050 vldrepl.w vr20, t0, 96 // 995 2051 vldrepl.w vr21, t0, 100 // 3973 2052 vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3 2053 vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27 2054 vssrarni.h.w vr3, vr8, 12 // t27a 2055 vssrarni.h.w vr27, vr11, 12 // t20a 2056 vldrepl.w vr20, t0, 104 // 3513 2057 vldrepl.w vr21, t0, 108 // 2106 2058 vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2 2059 vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28 2060 vssrarni.h.w vr2, vr8, 12 // t26a 2061 vssrarni.h.w vr28, vr11, 12 // t21a 2062 vldrepl.w vr20, t0, 112 // 2440 -> 1220 2063 vldrepl.w vr21, t0, 116 // 3290 -> 1645 2064 vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5 2065 vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25 2066 vssrarni.h.w vr5, vr8, 12 // t25a 2067 vssrarni.h.w vr25, vr11, 12 // t22a 2068 vldrepl.w vr20, t0, 120 // 4052 2069 vldrepl.w vr21, t0, 124 // 601 2070 vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6 2071 vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24 2072 vssrarni.h.w vr6, vr8, 12 // t24a 2073 vssrarni.h.w vr24, vr11, 12 // t23a 2074 2075 vsadd.h vr1, vr10, vr30 // t16 2076 vssub.h vr29, vr10, vr30 // t17 2077 vssub.h vr8, vr26, vr19 // t18 2078 vsadd.h vr31, vr26, vr19 // t19 2079 vsadd.h vr10, vr27, vr28 // t20 2080 vssub.h vr30, vr27, vr28 // t21 2081 vssub.h vr19, vr24, vr25 // t22 2082 vsadd.h vr26, vr24, vr25 // t23 2083 vsadd.h vr27, vr6, vr5 // t24 2084 vssub.h vr28, vr6, vr5 // t25 2085 vssub.h vr24, vr3, vr2 // t26 2086 vsadd.h vr25, vr3, vr2 // t27 2087 vsadd.h vr5, vr4, vr7 // t28 2088 vssub.h vr6, vr4, vr7 // t29 2089 vssub.h vr2, vr9, vr0 // t30 2090 vsadd.h vr3, vr9, vr0 // t31 2091 2092 vldrepl.w vr20, t0, 16 // 799 2093 vldrepl.w vr21, t0, 20 // 4017 2094 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 2095 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 2096 vssrarni.h.w vr7, vr4, 12 // t30a 2097 vssrarni.h.w vr0, vr11, 12 // t17a 2098 vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 2099 vneg.w vr4, vr4 2100 vneg.w vr9, vr9 2101 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 2102 vssrarni.h.w vr9, vr4, 12 // t18a 2103 vssrarni.h.w vr2, vr11, 12 // t29a 2104 vldrepl.w vr20, t0, 24 // 3406 -> 1703 2105 vldrepl.w vr21, t0, 28 // 2276 -> 1138 2106 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 2107 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 2108 vssrarni.h.w vr29, vr4, 12 // t26a 2109 vssrarni.h.w vr6, vr11, 12 // t21a 2110 vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 2111 vneg.w vr4, vr4 2112 vneg.w vr8, vr8 2113 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 2114 vssrarni.h.w vr8, vr4, 12 // t22a 2115 vssrarni.h.w vr24, vr11, 12 // t25a 2116 2117 vsadd.h vr4, vr1, vr31 // t16a 2118 vssub.h vr30, vr1, vr31 // t19a 2119 vsadd.h vr19, vr0, vr9 // t17 2120 vssub.h vr28, vr0, vr9 // t18 2121 vssub.h vr1, vr26, vr10 // t20a 2122 vsadd.h vr31, vr26, vr10 // t23a 2123 vssub.h vr0, vr8, vr6 // t21 2124 vsadd.h vr9, vr8, vr6 // t22 2125 vsadd.h vr10, vr27, vr25 // t24a 2126 vssub.h vr26, vr27, vr25 // t27a 2127 vsadd.h vr6, vr24, vr29 // t25 2128 vssub.h vr8, vr24, vr29 // t26 2129 vssub.h vr25, vr3, vr5 // t28a 2130 vsadd.h vr27, vr3, vr5 // t31a 2131 vssub.h vr24, vr7, vr2 // t29 2132 vsadd.h vr29, vr7, vr2 // t30 2133 2134 vldrepl.w vr20, t0, 8 // 1567 2135 vldrepl.w vr21, t0, 12 // 3784 2136 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 2137 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 2138 vssrarni.h.w vr5, vr3, 12 // t29a 2139 vssrarni.h.w vr2, vr11, 12 // 18a 2140 vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 2141 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 2142 vssrarni.h.w vr7, vr3, 12 // t28 2143 vssrarni.h.w vr24, vr11, 12 // t19 2144 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 2145 vneg.w vr3, vr3 2146 vneg.w vr28, vr28 2147 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 2148 vssrarni.h.w vr28, vr3, 12 // t20 2149 vssrarni.h.w vr25, vr11, 12 // t27 2150 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 2151 vneg.w vr3, vr3 2152 vneg.w vr30, vr30 2153 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 2154 vssrarni.h.w vr30, vr3, 12 // t21a 2155 vssrarni.h.w vr1, vr11, 12 // t26a 2156 2157 vsadd.h vr3, vr4, vr31 // t16 2158 vssub.h vr26, vr4, vr31 // t23 2159 vsadd.h vr0, vr19, vr9 // t17a 2160 vssub.h vr8, vr19, vr9 // t22a 2161 vsadd.h vr4, vr2, vr30 // t18 2162 vssub.h vr31, vr2, vr30 // t21 2163 vsadd.h vr9, vr24, vr28 // t19a 2164 vssub.h vr19, vr24, vr28 // t20a 2165 vssub.h vr2, vr27, vr10 // t24 2166 vsadd.h vr30, vr27, vr10 // t31 2167 vssub.h vr24, vr29, vr6 // t25a 2168 vsadd.h vr28, vr29, vr6 // t30a 2169 vssub.h vr10, vr5, vr1 // t26 2170 vsadd.h vr27, vr5, vr1 // t29 2171 vssub.h vr6, vr7, vr25 // t27a 2172 vsadd.h vr29, vr7, vr25 // t28a 2173 2174 vldrepl.w vr20, t0, 0 // 2896 2175 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 2176 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 2177 vssrarni.h.w vr5, vr1, 12 // t20 2178 vssrarni.h.w vr7, vr11, 12 // t27 2179 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 2180 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 2181 vssrarni.h.w vr25, vr1, 12 // t21a 2182 vssrarni.h.w vr6, vr11, 12 // t26a 2183 vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 2184 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 2185 vssrarni.h.w vr19, vr1, 12 // t22 2186 vssrarni.h.w vr10, vr11, 12 // t25 2187 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 2188 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 2189 vssrarni.h.w vr31, vr1, 12 // t23a 2190 vssrarni.h.w vr8, vr11, 12 // t24a 2191 2192 // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 2193 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 2194 vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 2195 2196 vsadd.h vr1, vr11, vr30 // c[0] 2197 vssub.h vr2, vr11, vr30 // c[31] 2198 vsadd.h vr24, vr12, vr28 // c[1] 2199 vssub.h vr26, vr12, vr28 // c[30] 2200 vsadd.h vr11, vr13, vr27 // c[2] 2201 vssub.h vr30, vr13, vr27 // c[29] 2202 vsadd.h vr12, vr14, vr29 // c[3] 2203 vssub.h vr28, vr14, vr29 // c[28] 2204 vsadd.h vr13, vr15, vr7 // c[4] 2205 vssub.h vr27, vr15, vr7 // c[27] 2206 vsadd.h vr14, vr16, vr6 // c[5] 2207 vssub.h vr29, vr16, vr6 // c[26] 2208 vsadd.h vr7, vr17, vr10 // c[6] 2209 vssub.h vr15, vr17, vr10 // c[25] 2210 vsadd.h vr6, vr18, vr8 // c[7] 2211 vssub.h vr16, vr18, vr8 // c[24] 2212 2213.ifnb \transpose8x8 2214 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 2215 vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 2216 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 2217.endif 2218 2219.ifnb \shift 2220.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 2221 vsrari.h \i, \i, \shift 2222.endr 2223.endif 2224 2225 vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 2226 2227.ifnb \transpose8x8 2228 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 2229 vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 2230 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 2231.endif 2232 2233.ifnb \shift 2234.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 2235 vsrari.h \i, \i, \shift 2236.endr 2237.endif 2238 2239 vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 2240 2241 vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 2242 2243 vsadd.h vr1, vr11, vr31 // c[8] 2244 vssub.h vr2, vr11, vr31 // c[23] 2245 vsadd.h vr24, vr12, vr19 // c[9] 2246 vssub.h vr26, vr12, vr19 // c[22] 2247 vsadd.h vr11, vr13, vr25 // c[10] 2248 vssub.h vr30, vr13, vr25 // c[21] 2249 vsadd.h vr12, vr14, vr5 // c[11] 2250 vssub.h vr28, vr14, vr5 // c[20] 2251 vsadd.h vr13, vr15, vr9 // c[12] 2252 vssub.h vr27, vr15, vr9 // c[19] 2253 vsadd.h vr14, vr16, vr4 // c[13] 2254 vssub.h vr29, vr16, vr4 // c[18] 2255 vsadd.h vr7, vr17, vr0 // c[14] 2256 vssub.h vr15, vr17, vr0 // c[17] 2257 vsadd.h vr6, vr18, vr3 // c[15] 2258 vssub.h vr16, vr18, vr3 // c[16] 2259 2260.ifnb \transpose8x8 2261 LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 2262 vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \ 2263 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 2264.endif 2265 2266.ifnb \shift 2267.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 2268 vsrari.h \i, \i, \shift 2269.endr 2270.endif 2271 2272 vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 2273 2274.ifnb \transpose8x8 2275 LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 2276 vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \ 2277 vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23 2278.endif 2279 2280.ifnb \shift 2281.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 2282 vsrari.h \i, \i, \shift 2283.endr 2284.endif 2285 2286 vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 2287.endm 2288 2289const eob_32x32 2290 .short 36, 136, 300, 1024 2291endconst 2292 2293const eob_8x32 2294 .short 43, 107, 171, 256 2295endconst 2296 2297const eob_16x32 2298 .short 36, 151, 279, 512 2299endconst 2300 2301.macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7 2302 vsllwil.hu.bu vr4, vr10, 0 2303 vsllwil.hu.bu vr5, vr11, 0 2304 vsllwil.hu.bu vr6, vr12, 0 2305 vsllwil.hu.bu vr7, vr13, 0 2306 vexth.hu.bu vr10, vr10 2307 vexth.hu.bu vr11, vr11 2308 vexth.hu.bu vr12, vr12 2309 vexth.hu.bu vr13, vr13 2310 vadd.h vr4, vr4, \in0 2311 vadd.h vr10, vr10, \in1 2312 vadd.h vr5, vr5, \in2 2313 vadd.h vr11, vr11, \in3 2314 vadd.h vr6, vr6, \in4 2315 vadd.h vr12, vr12, \in5 2316 vadd.h vr7, vr7, \in6 2317 vadd.h vr13, vr13, \in7 2318 vssrani.bu.h vr10, vr4, 0 2319 vssrani.bu.h vr11, vr5, 0 2320 vssrani.bu.h vr12, vr6, 0 2321 vssrani.bu.h vr13, vr7, 0 2322 vst vr10, a0, 0 2323 vst vr11, a0, 16 2324 vst vr12, t2, 0 2325 vst vr13, t2, 16 2326.endm 2327 2328.macro idct_dc_w32 w, h, shift 2329 ld.h t2, a2, 0 // dc 2330 vldi vr0, 0x8b5 // 181 2331 vreplgr2vr.w vr1, t2 2332 vldi vr20, 0x880 // 128 2333 vmul.w vr2, vr0, vr1 // dc * 181 2334 st.h zero, a2, 0 2335 add.d t2, a0, a1 2336 vsrari.w vr2, vr2, 8 // (dc * 181 + 128) >> 8 2337 vld vr13, t2, 16 2338 2339.if (2*\w == \h) || (2*\h == \w) 2340 vmul.w vr2, vr2, vr0 2341 vsrari.w vr2, vr2, 8 2342.endif 2343 2344.if \shift>0 2345 vsrari.w vr2, vr2, \shift // (dc + rnd) >> shift 2346.endif 2347 vld vr11, a0, 16 2348 vmadd.w vr20, vr2, vr0 2349 vld vr12, t2, 0 2350 vssrarni.h.w vr20, vr20, 12 2351 vld vr10, a0, 0 2352.endm 2353 2354function inv_txfm_add_dct_dct_32x8_8bpc_lsx 2355 bnez a3, .NO_HAS_DCONLY_32x8 2356 2357 idct_dc_w32 32, 8, 2 2358 2359 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2360 2361.rept 3 2362 alsl.d a0, a1, a0, 1 2363 add.d t2, a0, a1 2364 vld vr10, a0, 0 2365 vld vr11, a0, 16 2366 vld vr12, t2, 0 2367 vld vr13, t2, 16 2368 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2369.endr 2370 b .DCT_DCT_32X8_END 2371.NO_HAS_DCONLY_32x8: 2372 malloc_space 512+256 2373 2374 addi.d t1, sp, 64 2375 addi.d t2, a2, 0 2376 addi.d t3, sp, 64 2377 addi.d t3, t3, 512 2378 2379 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2380 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2381 2382 vxor.v vr31, vr31, vr31 2383 vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2384 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2385 2386 inv_dct16_lsx .8h 2387 2388 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2389 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2390 2391 vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2392 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2393 2394 vxor.v vr31, vr31, vr31 2395 2396 vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2397 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2398 2399 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2 2400 2401 addi.d t2, sp, 64 2402.rept 4 2403 vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 2404 2405 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h 2406 2407.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 2408 vsrari.h \i, \i, 4 2409.endr 2410 2411 vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 2412 2413 addi.d t2, t2, 16 2414.endr 2415 2416 addi.d t0, sp, 64 2417.rept 4 2418 add.d t2, a0, a1 2419 vld vr10, a0, 0 2420 vld vr11, a0, 16 2421 vld vr12, t2, 0 2422 vld vr13, t2, 16 2423 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2424 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2425 alsl.d a0, a1, a0, 1 2426 addi.d t0, t0, 128 2427.endr 2428 free_space 512+256 2429.DCT_DCT_32X8_END: 2430endfunc 2431 2432function inv_txfm_add_dct_dct_32x16_8bpc_lsx 2433 bnez a3, .NO_HAS_DCONLY_32x16 2434 2435 idct_dc_w32 32, 16, 1 2436 2437 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2438 2439.rept 7 2440 alsl.d a0, a1, a0, 1 2441 add.d t2, a0, a1 2442 vld vr10, a0, 0 2443 vld vr11, a0, 16 2444 vld vr12, t2, 0 2445 vld vr13, t2, 16 2446 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2447.endr 2448 b .DCT_DCT_32X16_END 2449.NO_HAS_DCONLY_32x16: 2450 malloc_space 1024+256 // 32*32*2+512 2451 addi.d t1, sp, 64 2452 addi.d t2, a2, 0 2453 addi.d t3, sp, 64 2454 addi.d t3, t3, 1024 2455.rept 2 2456 vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2457 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2458 2459 vxor.v vr31, vr31, vr31 2460 vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2461 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2462 2463 li.w t0, 2896 2464 vreplgr2vr.w vr23, t0 2465.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2466 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2467 rect2_lsx \i, vr23, \i 2468.endr 2469 2470 inv_dct16_lsx .8h 2471 2472 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2473 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2474 2475 vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2476 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2477 2478 la.local t0, idct_coeffs 2479 vldrepl.w vr23, t0, 0 // 2896 2480.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2481 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2482 rect2_lsx \i, vr23, \i 2483.endr 2484 vxor.v vr31, vr31, vr31 2485 vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2486 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2487 2488 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1 2489 2490 addi.d t2, t2, 16 2491 addi.d t1, t1, 512 2492.endr 2493 2494 addi.d t2, sp, 64 2495.rept 4 2496 vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2497 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2498 2499 inv_dct16_lsx .8h 2500 2501.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2502 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2503 vsrari.h \i, \i, 4 2504.endr 2505 2506 vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2507 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2508 2509 addi.d t2, t2, 16 2510.endr 2511 2512 addi.d t0, sp, 64 2513.rept 8 2514 add.d t2, a0, a1 2515 vld vr10, a0, 0 2516 vld vr11, a0, 16 2517 vld vr12, t2, 0 2518 vld vr13, t2, 16 2519 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2520 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2521 2522 alsl.d a0, a1, a0, 1 2523 addi.d t0, t0, 128 2524.endr 2525 free_space 1024+256 2526.DCT_DCT_32X16_END: 2527endfunc 2528 2529function inv_txfm_add_dct_dct_32x32_8bpc_lsx 2530 bnez a3, .NO_HAS_DCONLY_32x32 2531 2532 idct_dc_w32 32, 32, 2 2533 2534 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2535.rept 15 2536 alsl.d a0, a1, a0, 1 2537 add.d t2, a0, a1 2538 vld vr10, a0, 0 2539 vld vr11, a0, 16 2540 vld vr12, t2, 0 2541 vld vr13, t2, 16 2542 DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 2543.endr 2544 b .DCT_DCT_32X32_END 2545.NO_HAS_DCONLY_32x32: 2546 malloc_space 2560 // 32*32*2+512 2547 2548 addi.d t1, sp, 64 2549 addi.d t2, a2, 0 2550 addi.d t3, sp, 1024 2551 addi.d t3, t3, 1024 2552 addi.d t3, t3, 64 2553 2554 la.local t8, eob_32x32 2555.DCT_DCT_EOB_32x32: 2556 ld.h t7, t8, 0 2557 addi.d t8, t8, 2 2558 2559 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2560 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2561 2562 vxor.v vr31, vr31, vr31 2563 vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2564 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2565 2566 inv_dct16_lsx .8h 2567 2568 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2569 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2570 2571 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2572 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2573 2574 vxor.v vr31, vr31, vr31 2575 2576 vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2577 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2578 2579 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2 2580 2581 addi.d t2, t2, 16 2582 addi.d t1, t1, 512 2583 bge a3, t7, .DCT_DCT_EOB_32x32 2584 2585 la.local t8, eob_32x32 2586 vxor.v vr31, vr31, vr31 2587 ld.h t7, t8, 4 2588 bge a3, t7, .DCT_DCT_EOB_32x32_END // a3>=t7 2589 vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2590 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2591 addi.d t1, sp, 256+64 2592 vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2593 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2594 2595 ld.h t7, t8, 2 2596 bge a3, t7, .DCT_DCT_EOB_32x32_END 2597 vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2598 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2599 vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2600 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2601 2602 ld.h t7, t8, 0 2603 bge a3, t7, .DCT_DCT_EOB_32x32_END 2604 vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2605 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2606 2607 vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 2608 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 2609 2610.DCT_DCT_EOB_32x32_END: 2611 addi.d t2, sp, 64 2612 addi.d t1, sp, 64 2613.rept 4 2614 vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2615 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2616 2617 inv_dct16_lsx .8h 2618 2619 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2620 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 2621 2622 vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 2623 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2624 2625 dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4 2626 2627 addi.d t2, t2, 16 2628 addi.d t1, t1, 16 2629.endr 2630 2631 addi.d t0, sp, 64 2632.rept 16 2633 add.d t2, a0, a1 2634 vld vr10, a0, 0 2635 vld vr11, a0, 16 2636 vld vr12, t2, 0 2637 vld vr13, t2, 16 2638 vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2639 DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 2640 alsl.d a0, a1, a0, 1 2641 addi.d t0, t0, 128 2642.endr 2643 2644 free_space 2560 // 32*32*2+512 2645.DCT_DCT_32X32_END: 2646endfunc 2647 2648/* 2649 * temp: vr8, vr9, vr10, vr12, vr20, vr21, vr22, vr23 2650 */ 2651.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \ 2652 out1, out2, out3, out4, out5, out6, out7, rect2 2653 2654 la.local t0, idct_coeffs 2655 2656.ifc \rect2, rect2_lsx 2657 vldrepl.w vr23, t0, 0 // 2896 2658.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7 2659 rect2_lsx \i, vr23, \i 2660.endr 2661.endif 2662 2663 la.local t0, idct_coeffs 2664 2665 vldrepl.w vr20, t0, 8 // 1567 2666 vldrepl.w vr21, t0, 12 // 3784 2667 vsllwil.w.h vr22, \in2, 0 2668 vexth.w.h vr23, \in2 2669 vmul.w vr8, vr22, vr20 2670 vmul.w vr10, vr23, vr20 2671 vmul.w \in2, vr22, vr21 2672 vmul.w vr9, vr23, vr21 2673 vssrarni.h.w vr10, vr8, 12 // t2 2674 vssrarni.h.w vr9, \in2, 12 // t3 2675 2676 vldrepl.w vr20, t0, 0 // 2896 2677 vsllwil.w.h vr22, \in0, 0 2678 vexth.w.h vr23, \in0 2679 vmul.w vr8, vr22, vr20 2680 vmul.w \in2, vr23, vr20 2681 vssrarni.h.w \in2, vr8, 12 2682 2683 vsadd.h vr8, \in2, vr9 // c[0] 2684 vssub.h vr9, \in2, vr9 // c[3] 2685 vsadd.h \in0, \in2, vr10 // c[1] 2686 vssub.h vr10, \in2, vr10 // c[2] 2687 2688 // inv_dct8_1d_internal_c tx64 2689 // in1 in3 2690 vldrepl.w vr20, t0, 16 // 799 2691 vldrepl.w vr21, t0, 20 // 4017 2692 2693 vsllwil.w.h vr22, \in1, 0 2694 vexth.w.h vr23, \in1 2695 vmul.w \in2, vr22, vr21 2696 vmul.w \in4, vr23, vr21 2697 vmul.w \in1, vr22, vr20 2698 vmul.w \in6, vr23, vr20 2699 vssrarni.h.w \in4, \in2, 12 // t7a 2700 vssrarni.h.w \in6, \in1, 12 // t4a 2701 2702 vldrepl.w vr20, t0, 24 // 3406 2703 vldrepl.w vr21, t0, 28 // 2276 2704 2705 vsllwil.w.h vr22, \in3, 0 2706 vexth.w.h vr23, \in3 2707 vneg.w vr21, vr21 2708 vmul.w \in2, vr22, vr20 2709 vmul.w \in1, vr23, vr20 2710 vmul.w \in3, vr22, vr21 2711 vmul.w \in7, vr23, vr21 2712 vssrarni.h.w \in1, \in2, 12 // t6a 2713 vssrarni.h.w \in7, \in3, 12 // t5a 2714 2715 vsadd.h \in3, \in6, \in7 // t4 2716 vssub.h \in6, \in6, \in7 // t5a 2717 vsadd.h \in5, \in4, \in1 // t7 2718 vssub.h \in4, \in4, \in1 // t6a 2719 2720 vldrepl.w vr20, t0, 0 // 2896 2721 vmul_vmadd_w \in4, \in6, vr20, vr20, vr21, \in1 2722 vmul_vmsub_w \in4, \in6, vr20, vr20, \in2, \in7 2723 vssrarni.h.w \in1, vr21, 12 // t6 2724 vssrarni.h.w \in7, \in2, 12 // t5 2725 2726 vsadd.h \out0, vr8, \in5 // c[0] 2727 vssub.h \out7, vr8, \in5 // c[7] 2728 vsadd.h \out1, \in0, \in1 // c[1] 2729 vssub.h \out6, \in0, \in1 // c[6] 2730 vsadd.h \out2, vr10, \in7 // c[2] 2731 vssub.h \out5, vr10, \in7 // c[5] 2732 vsadd.h \out3, vr9, \in3 // c[3] 2733 vssub.h \out4, vr9, \in3 // c[4] 2734.endm 2735 2736/* 2737 * input: in0, in1, in2, in3, in4, in5, in6, in7 (fixed) 2738 * vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 2739 * in8, in9, in10, in11, in12, in13, in14, in15 2740 * vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 2741 * output: out0, out1, out2, out3, out4, out5, out6, out7 (fixed) 2742 * vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16 2743 * out8, out9, out10, out11, out12, out13, out14, out15 2744 * vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 2745 */ 2746.macro dct_8x16_tx64_core_lsx rect2 2747 dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \ 2748 vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2 2749 2750 // in1 in3 in5 in7 in9 in11 in13 in15 2751 // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30 2752 la.local t0, idct_coeffs 2753 2754.ifc \rect2, rect2_lsx 2755 vldrepl.w vr23, t0, 0 // 2896 2756.irp i, vr1, vr3, vr5, vr7, vr24, vr26, vr28, vr30 2757 rect2_lsx \i, vr23, \i 2758.endr 2759.endif 2760 2761 vldrepl.w vr20, t0, 32 // 401 2762 vldrepl.w vr21, t0, 36 // 4076 2763 vsllwil.w.h vr22, vr1, 0 2764 vexth.w.h vr23, vr1 2765 vmul.w vr0, vr22, vr21 2766 vmul.w vr10, vr23, vr21 2767 vmul.w vr1, vr22, vr20 2768 vmul.w vr29, vr23, vr20 2769 vssrarni.h.w vr10, vr0, 12 // t15a 2770 vssrarni.h.w vr29, vr1, 12 // t8a 2771 2772 vldrepl.w vr20, t0, 40 // 3166 -> 1583 2773 vldrepl.w vr21, t0, 44 // 2598 -> 1299 2774 vsllwil.w.h vr22, vr7, 0 2775 vexth.w.h vr23, vr7 2776 vneg.w vr21, vr21 2777 vmul.w vr0, vr22, vr20 2778 vmul.w vr30, vr23, vr20 2779 vmul.w vr7, vr22, vr21 2780 vmul.w vr31, vr23, vr21 2781 vssrarni.h.w vr30, vr0, 12 // t14a 2782 vssrarni.h.w vr31, vr7, 12 // t9a 2783 2784 vldrepl.w vr20, t0, 48 // 1931 2785 vldrepl.w vr21, t0, 52 // 3612 2786 vsllwil.w.h vr22, vr5, 0 2787 vexth.w.h vr23, vr5 2788 vmul.w vr0, vr22, vr21 2789 vmul.w vr24, vr23, vr21 2790 vmul.w vr5, vr22, vr20 2791 vmul.w vr25, vr23, vr20 2792 vssrarni.h.w vr24, vr0, 12 // t13a 2793 vssrarni.h.w vr25, vr5, 12 // t10a 2794 2795 vldrepl.w vr20, t0, 56 // 3920 2796 vldrepl.w vr21, t0, 60 // 1189 2797 vsllwil.w.h vr22, vr3, 0 2798 vexth.w.h vr23, vr3 2799 vneg.w vr21, vr21 2800 vmul.w vr0, vr22, vr20 2801 vmul.w vr26, vr23, vr20 2802 vmul.w vr3, vr22, vr21 2803 vmul.w vr27, vr23, vr21 2804 vssrarni.h.w vr26, vr0, 12 // t12a 2805 vssrarni.h.w vr27, vr3, 12 // t11a 2806 2807 // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27 2808 vsadd.h vr28, vr29, vr31 // t8 2809 vssub.h vr19, vr29, vr31 // t9 2810 vssub.h vr29, vr27, vr25 // t10 2811 vsadd.h vr9, vr27, vr25 // t11 2812 vsadd.h vr31, vr26, vr24 // t12 2813 vssub.h vr25, vr26, vr24 // t13 2814 vssub.h vr27, vr10, vr30 // t14 2815 vsadd.h vr24, vr10, vr30 // t15 2816 2817 vldrepl.w vr20, t0, 8 // 1567 2818 vldrepl.w vr21, t0, 12 // 3784 2819 vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26 2820 vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30 2821 vssrarni.h.w vr26, vr0, 12 // t14a 2822 vssrarni.h.w vr30, vr1, 12 // t9a 2823 2824 vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19 2825 vneg.w vr0, vr0 2826 vneg.w vr19, vr19 2827 vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27 2828 vssrarni.h.w vr19, vr0, 12 // t10a 2829 vssrarni.h.w vr27, vr1, 12 // t13a 2830 2831 vsadd.h vr25, vr28, vr9 // t8a 2832 vssub.h vr29, vr28, vr9 // t11a 2833 vssub.h vr28, vr24, vr31 // t12a 2834 vsadd.h vr10, vr24, vr31 // t15a 2835 vsadd.h vr9, vr30, vr19 // t9 2836 vssub.h vr31, vr30, vr19 // t10 2837 vssub.h vr30, vr26, vr27 // t13 2838 vsadd.h vr24, vr26, vr27 // t14 2839 2840 vldrepl.w vr20, t0, 0 // 2896 2841 vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26 2842 vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27 2843 vssrarni.h.w vr26, vr0, 12 // t13a 2844 vssrarni.h.w vr27, vr1, 12 // t10a 2845 2846 vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31 2847 vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30 2848 vssrarni.h.w vr31, vr0, 12 // t12 2849 vssrarni.h.w vr30, vr1, 12 // t11 2850 2851 // vr11 vr12 ... vr18 2852 vsadd.h vr28, vr14, vr31 // c[3] 2853 vssub.h vr29, vr14, vr31 // c[12] 2854 vsadd.h vr20, vr15, vr30 // c[4] 2855 vssub.h vr21, vr15, vr30 // c[11] 2856 vsadd.h vr14, vr16, vr27 // c[5] 2857 vssub.h vr23, vr16, vr27 // c[10] 2858 vsadd.h vr15, vr17, vr9 // c[6] 2859 vssub.h vr30, vr17, vr9 // c[9] 2860 vsadd.h vr16, vr18, vr25 // c[7] 2861 vssub.h vr27, vr18, vr25 // c[8] 2862 vsadd.h vr17, vr13, vr26 // c[2] 2863 vssub.h vr26, vr13, vr26 // c[13] 2864 vsadd.h vr18, vr12, vr24 // c[1] 2865 vssub.h vr25, vr12, vr24 // c[14] 2866 vsadd.h vr22, vr11, vr10 // c[0] 2867 vssub.h vr24, vr11, vr10 // c[15] 2868.endm // dct_8x16_tx64_core_lsx 2869 2870.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1 2871 vsllwil.w.h vr22, \in0, 0 2872 vexth.w.h vr23, \in0 2873 vmul.w \tmp0, vr22, \in1 2874 vmul.w \out0, vr23, \in1 2875 vmul.w \tmp1, vr22, \in2 2876 vmul.w \out1, vr23, \in2 2877 vssrarni.h.w \out0, \tmp0, 12 2878 vssrarni.h.w \out1, \tmp1, 12 2879.endm 2880 2881const idct64_coeffs, align=4 2882 .word 101, 4095, 2967, -2824 2883 .word 1660, 3745, 3822, -1474 2884 .word 4076, 401, 4017, 799 2885 .word 4036, -700, 2359, 3349 2886 .word 3461, -2191, 897, 3996 2887 .word -3166, -2598, -799, -4017 2888 .word 501, 4065, 3229, -2520 2889 .word 2019, 3564, 3948, -1092 2890 .word 3612, 1931, 2276, 3406 2891 .word 4085, -301, 2675, 3102 2892 .word 3659, -1842, 1285, 3889 2893 .word -3920, -1189, -3406, -2276 2894endconst 2895 2896.macro dct64_step1_lsx 2897 vldrepl.w vr20, t0, 0 // 101 2898 vldrepl.w vr21, t0, 4 // 4095 2899 vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9 // vr8 t32a vr9 t63a 2900 vldrepl.w vr20, t0, 8 // 2967 2901 vldrepl.w vr21, t0, 12 // -2824 2902 vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11 // vr10 t62a vr11 t33a 2903 vldrepl.w vr20, t0, 16 // 1660 2904 vldrepl.w vr21, t0, 20 // 3745 2905 vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13 // vr12 t34a vr13 t61a 2906 vldrepl.w vr20, t0, 24 // 3822 2907 vldrepl.w vr21, t0, 28 // -1474 2908 vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15 // vr14 t60a vr15 t35a 2909 2910 vsadd.h vr0, vr8, vr11 // t32 2911 vssub.h vr1, vr8, vr11 // t33 2912 vssub.h vr2, vr15, vr12 // t34 2913 vsadd.h vr3, vr15, vr12 // t35 2914 vsadd.h vr4, vr14, vr13 // t60 2915 vssub.h vr5, vr14, vr13 // t61 2916 vssub.h vr6, vr9, vr10 // t62 2917 vsadd.h vr7, vr9, vr10 // t63 2918 2919 vldrepl.w vr20, t0, 32 // 4076 2920 vldrepl.w vr21, t0, 36 // 401 2921 vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10 2922 vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11 2923 vssrarni.h.w vr10, vr9, 12 // t62a 2924 vssrarni.h.w vr11, vr13, 12 // t33a 2925 2926 vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1 2927 vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6 2928 vneg.w vr9, vr9 2929 vneg.w vr1, vr1 2930 vssrarni.h.w vr6, vr13, 12 // t61a 2931 vssrarni.h.w vr1, vr9, 12 // t34a 2932 2933 vsadd.h vr2, vr0, vr3 // t32a 2934 vssub.h vr5, vr0, vr3 // t35a 2935 vsadd.h vr9, vr11, vr1 // t33 2936 vssub.h vr13, vr11, vr1 // t34 2937 vssub.h vr0, vr7, vr4 // t60a 2938 vsadd.h vr3, vr7, vr4 // t63a 2939 vssub.h vr1, vr10, vr6 // t61 2940 vsadd.h vr11, vr10, vr6 // t62 2941 2942 vldrepl.w vr20, t0, 40 // 4017 2943 vldrepl.w vr21, t0, 44 // 799 2944 vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4 2945 vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7 2946 vssrarni.h.w vr4, vr8, 12 // t61a 2947 vssrarni.h.w vr7, vr12, 12 // t34a 2948 2949 vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6 2950 vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10 2951 vssrarni.h.w vr6, vr8, 12 // t60 2952 vssrarni.h.w vr10, vr12, 12 // t35 2953 2954 vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3 2955.endm // dct64_step1 2956 2957 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 2958 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 2959 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 2960 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 2961.macro dct64_step2_lsx 2962 vld vr0, t5, 0 // t32a 2963 vld vr2, t4, 0 // t63a 2964 vld vr3, t5, 16*8 // t56a 2965 vld vr1, t4, 16*8 // t39a 2966 vld vr4, t5, 16*16 // t40a 2967 vld vr6, t4, 16*16 // t55a 2968 vld vr7, t5, 16*24 // t48a 2969 vld vr5, t4, 16*24 // t47a 2970 2971 vsadd.h vr8, vr0, vr1 // t32 2972 vssub.h vr9, vr0, vr1 // t39 2973 vsadd.h vr10, vr2, vr3 // t63 2974 vssub.h vr11, vr2, vr3 // t56 2975 vssub.h vr12, vr5, vr4 // t40 2976 vsadd.h vr13, vr5, vr4 // t47 2977 vsadd.h vr14, vr7, vr6 // t48 2978 vssub.h vr15, vr7, vr6 // t55 2979 vldrepl.w vr20, t0, 8 // 1567 2980 vldrepl.w vr21, t0, 12 // 3784 2981 vmul_vmadd_w vr11, vr9, vr21, vr20, vr0, vr2 2982 vmul_vmsub_w vr11, vr9, vr20, vr21, vr1, vr3 2983 vssrarni.h.w vr2, vr0, 12 // t56a 2984 vssrarni.h.w vr3, vr1, 12 // t39a 2985 vmul_vmadd_w vr15, vr12, vr21, vr20, vr0, vr4 2986 vmul_vmsub_w vr15, vr12, vr20, vr21, vr1, vr5 2987 vneg.w vr0, vr0 2988 vneg.w vr4, vr4 2989 vssrarni.h.w vr5, vr1, 12 // t55a 2990 vssrarni.h.w vr4, vr0, 12 // t40a 2991 vsadd.h vr9, vr8, vr13 // t32a 2992 vssub.h vr11, vr8, vr13 // t47a 2993 vsadd.h vr6, vr3, vr4 // t39 2994 vssub.h vr7, vr3, vr4 // t40 2995 vssub.h vr12, vr10, vr14 // t48a 2996 vsadd.h vr15, vr10, vr14 // t63a 2997 vssub.h vr0, vr2, vr5 // t55 2998 vsadd.h vr1, vr2, vr5 // t56 2999 3000 vldrepl.w vr20, t0, 0 // 2896 3001 vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13 3002 vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4 3003 vssrarni.h.w vr13, vr8, 12 // t40a 3004 vssrarni.h.w vr4, vr3, 12 // t55a 3005 vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10 3006 vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14 3007 vssrarni.h.w vr10, vr8, 12 // t47 3008 vssrarni.h.w vr14, vr3, 12 // t48 3009 3010 // t32a t39 t40a t47 t48 t55a t56 t63a 3011 // vr9 vr6 vr13 vr10 vr14 vr4 vr1 vr15 3012 vst vr9, t5, 0 // t32a 3013 vst vr6, t4, 0 // t39 3014 vst vr13, t5, 16*8 // t40a 3015 vst vr10, t4, 16*8 // t47 3016 vst vr14, t5, 16*16 // t48 3017 vst vr4, t4, 16*16 // t55a 3018 vst vr1, t5, 16*24 // t56 3019 vst vr15, t4, 16*24 // t63a 3020.endm // dct64_step2_lsx 3021 3022.macro dct64_step3_lsx 3023 // t0 t1 t2 t3 t4 t5 t6 t7 3024 vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17 3025 vld vr9, t5, 16*24 // t56 3026 vld vr6, t5, 16*24+16 // t57a 3027 vld vr13, t5, 16*24+32 // t58 3028 vld vr10, t5, 16*24+48 // t59a 3029 vld vr14, t4, 16*24-48 // t60 3030 vld vr4, t4, 16*24-32 // t61a 3031 vld vr1, t4, 16*24-16 // t62 3032 vld vr15, t4, 16*24 // t63a 3033 vsadd.h vr20, vr2, vr15 // c[0] 3034 vssub.h vr21, vr2, vr15 // c[63] 3035 vsadd.h vr22, vr3, vr1 // c[1] 3036 vssub.h vr23, vr3, vr1 // c[62] 3037 vsadd.h vr24, vr7, vr4 // c[2] 3038 vssub.h vr25, vr7, vr4 // c[61] 3039 vsadd.h vr26, vr8, vr14 // c[3] 3040 vssub.h vr27, vr8, vr14 // c[60] 3041 vsadd.h vr28, vr11, vr10 // c[4] 3042 vssub.h vr29, vr11, vr10 // c[59] 3043 vsadd.h vr30, vr12, vr13 // c[5] 3044 vssub.h vr31, vr12, vr13 // c[58] 3045 vsadd.h vr2, vr16, vr6 // c[6] 3046 vssub.h vr15, vr16, vr6 // c[57] 3047 vsadd.h vr1, vr17, vr9 // c[7] 3048 vssub.h vr3, vr17, vr9 // c[56] 3049.endm // dct64_step3_lsx 3050 3051.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1 3052 dct64_step3_lsx 3053 3054.ifnb \transpose8x8 3055 LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 3056 vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 3057 vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 3058 3059 LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ 3060 vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \ 3061 vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13 3062.endif 3063 3064.ifnb \shift 3065.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \ 3066 vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3067 vsrari.h \i, \i, \shift 3068.endr 3069.endif 3070 3071 vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 3072 3073 vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3074.endm // dct64_step4_lsx 3075 3076.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7 3077 fld.d f4, t0, 0 3078 fldx.d f5, t0, a1 3079 fld.d f6, t6, 0 3080 fldx.d f7, t6, a1 3081 alsl.d t0, a1, t0, 2 3082 alsl.d t6, a1, t6, 2 3083 fld.d f8, t0, 0 3084 fldx.d f9, t0, a1 3085 fld.d f10, t6, 0 3086 fldx.d f11, t6, a1 3087.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11 3088 vsllwil.hu.bu \i, \i, 0 3089.endr 3090 vsrari.h vr20, \in0, 4 3091 vsrari.h vr22, \in1, 4 3092 vsrari.h vr24, \in2, 4 3093 vsrari.h vr26, \in3, 4 3094 vsrari.h vr28, \in4, 4 3095 vsrari.h vr30, \in5, 4 3096 vsrari.h vr2, \in6, 4 3097 vsrari.h vr1, \in7, 4 3098 vadd.h vr4, vr4, vr20 3099 vadd.h vr5, vr5, vr22 3100 vadd.h vr6, vr6, vr24 3101 vadd.h vr7, vr7, vr26 3102 vadd.h vr8, vr8, vr28 3103 vadd.h vr9, vr9, vr30 3104 vadd.h vr10, vr10, vr2 3105 vadd.h vr11, vr11, vr1 3106 vssrani.bu.h vr5, vr4, 0 3107 vssrani.bu.h vr7, vr6, 0 3108 vssrani.bu.h vr9, vr8, 0 3109 vssrani.bu.h vr11, vr10, 0 3110 3111 vstelm.d vr5, t1, 0, 0 3112 vstelm.d vr5, t2, 0, 1 3113 alsl.d t1, a1, t1, 1 3114 alsl.d t2, a1, t2, 1 3115 vstelm.d vr7, t1, 0, 0 3116 vstelm.d vr7, t2, 0, 1 3117 alsl.d t1, a1, t1, 1 3118 alsl.d t2, a1, t2, 1 3119 vstelm.d vr9, t1, 0, 0 3120 vstelm.d vr9, t2, 0, 1 3121 alsl.d t1, a1, t1, 1 3122 alsl.d t2, a1, t2, 1 3123 vstelm.d vr11, t1, 0, 0 3124 vstelm.d vr11, t2, 0, 1 3125.endm // dct64_step5_lsx 3126 3127.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1, rect2 3128 vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3129 3130 dct_8x16_tx64_core_lsx \rect2 3131 3132 vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \ 3133 vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 3134 3135 vxor.v vr31, vr31, vr31 3136 vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3137 3138 vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3139 3140 vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3141 3142 la.local t0, idct_coeffs 3143 3144.ifc \rect2, rect2_lsx 3145 vldrepl.w vr23, t0, 0 // 2896 3146.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3147 rect2_lsx \i, vr23, \i 3148.endr 3149.endif 3150 3151 vldrepl.w vr20, t0, 64 // 201 3152 vldrepl.w vr21, t0, 68 // 4091 3153 vsllwil.w.h vr22, vr0, 0 3154 vexth.w.h vr23, vr0 3155 vmul.w vr8, vr22, vr21 3156 vmul.w vr9, vr23, vr21 3157 vmul.w vr0, vr22, vr20 3158 vmul.w vr10, vr23, vr20 3159 vssrarni.h.w vr9, vr8, 12 // t31a 3160 vssrarni.h.w vr10, vr0, 12 // t16a 3161 3162 vldrepl.w vr20, t0, 72 // 3035 3163 vldrepl.w vr21, t0, 76 // 2751 3164 vsllwil.w.h vr22, vr7, 0 3165 vexth.w.h vr23, vr7 3166 vneg.w vr21, vr21 3167 vmul.w vr8, vr22, vr20 3168 vmul.w vr0, vr23, vr20 3169 vmul.w vr7, vr22, vr21 3170 vmul.w vr30, vr23, vr21 3171 vssrarni.h.w vr0, vr8, 12 // t30a 3172 vssrarni.h.w vr30, vr7, 12 // t17a 3173 3174 vldrepl.w vr20, t0, 80 // 1751 3175 vldrepl.w vr21, t0, 84 // 3703 3176 vsllwil.w.h vr22, vr4, 0 3177 vexth.w.h vr23, vr4 3178 vmul.w vr8, vr22, vr21 3179 vmul.w vr7, vr23, vr21 3180 vmul.w vr4, vr22, vr20 3181 vmul.w vr19, vr23, vr20 3182 vssrarni.h.w vr7, vr8, 12 // t29a 3183 vssrarni.h.w vr19, vr4, 12 // t18a 3184 3185 vldrepl.w vr20, t0, 88 // 3857 3186 vldrepl.w vr21, t0, 92 // 1380 3187 vsllwil.w.h vr22, vr3, 0 3188 vexth.w.h vr23, vr3 3189 vneg.w vr21, vr21 3190 vmul.w vr8, vr22, vr20 3191 vmul.w vr4, vr23, vr20 3192 vmul.w vr3, vr22, vr21 3193 vmul.w vr26, vr23, vr21 3194 vssrarni.h.w vr4, vr8, 12 // t28a 3195 vssrarni.h.w vr26, vr3, 12 // t19a 3196 3197 vldrepl.w vr20, t0, 96 // 995 3198 vldrepl.w vr21, t0, 100 // 3973 3199 vsllwil.w.h vr22, vr2, 0 3200 vexth.w.h vr23, vr2 3201 vmul.w vr8, vr22, vr21 3202 vmul.w vr3, vr23, vr21 3203 vmul.w vr2, vr22, vr20 3204 vmul.w vr27, vr23, vr20 3205 vssrarni.h.w vr3, vr8, 12 // t27a 3206 vssrarni.h.w vr27, vr2, 12 // t20a 3207 3208 vldrepl.w vr20, t0, 104 // 3513 3209 vldrepl.w vr21, t0, 108 // 2106 3210 vsllwil.w.h vr22, vr5, 0 3211 vexth.w.h vr23, vr5 3212 vneg.w vr21, vr21 3213 vmul.w vr8, vr22, vr20 3214 vmul.w vr2, vr23, vr20 3215 vmul.w vr5, vr22, vr21 3216 vmul.w vr28, vr23, vr21 3217 vssrarni.h.w vr2, vr8, 12 // t26a 3218 vssrarni.h.w vr28, vr5, 12 // t21a 3219 3220 vldrepl.w vr20, t0, 112 // 2440 -> 1220 3221 vldrepl.w vr21, t0, 116 // 3290 -> 1645 3222 vsllwil.w.h vr22, vr6, 0 3223 vexth.w.h vr23, vr6 3224 vmul.w vr8, vr22, vr21 3225 vmul.w vr5, vr23, vr21 3226 vmul.w vr6, vr22, vr20 3227 vmul.w vr25, vr23, vr20 3228 vssrarni.h.w vr5, vr8, 12 // t25a 3229 vssrarni.h.w vr25, vr6, 12 // t22a 3230 3231 vldrepl.w vr20, t0, 120 // 4052 3232 vldrepl.w vr21, t0, 124 // 601 3233 vsllwil.w.h vr22, vr1, 0 3234 vexth.w.h vr23, vr1 3235 vneg.w vr21, vr21 3236 vmul.w vr8, vr22, vr20 3237 vmul.w vr6, vr23, vr20 3238 vmul.w vr1, vr22, vr21 3239 vmul.w vr24, vr23, vr21 3240 vssrarni.h.w vr6, vr8, 12 // t24a 3241 vssrarni.h.w vr24, vr1, 12 // t23a 3242 3243 vsadd.h vr1, vr10, vr30 // t16 3244 vssub.h vr29, vr10, vr30 // t17 3245 vssub.h vr8, vr26, vr19 // t18 3246 vsadd.h vr31, vr26, vr19 // t19 3247 vsadd.h vr10, vr27, vr28 // t20 3248 vssub.h vr30, vr27, vr28 // t21 3249 vssub.h vr19, vr24, vr25 // t22 3250 vsadd.h vr26, vr24, vr25 // t23 3251 vsadd.h vr27, vr6, vr5 // t24 3252 vssub.h vr28, vr6, vr5 // t25 3253 vssub.h vr24, vr3, vr2 // t26 3254 vsadd.h vr25, vr3, vr2 // t27 3255 vsadd.h vr5, vr4, vr7 // t28 3256 vssub.h vr6, vr4, vr7 // t29 3257 vssub.h vr2, vr9, vr0 // t30 3258 vsadd.h vr3, vr9, vr0 // t31 3259 3260 vldrepl.w vr20, t0, 16 // 799 3261 vldrepl.w vr21, t0, 20 // 4017 3262 vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7 3263 vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0 3264 vssrarni.h.w vr7, vr4, 12 // t30a 3265 vssrarni.h.w vr0, vr11, 12 // t17a 3266 vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9 3267 vneg.w vr4, vr4 3268 vneg.w vr9, vr9 3269 vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2 3270 vssrarni.h.w vr9, vr4, 12 // t18a 3271 vssrarni.h.w vr2, vr11, 12 // t29a 3272 3273 vldrepl.w vr20, t0, 24 // 3406 -> 1703 3274 vldrepl.w vr21, t0, 28 // 2276 -> 1138 3275 vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29 3276 vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6 3277 vssrarni.h.w vr29, vr4, 12 // t26a 3278 vssrarni.h.w vr6, vr11, 12 // t21a 3279 3280 vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8 3281 vneg.w vr4, vr4 3282 vneg.w vr8, vr8 3283 vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24 3284 vssrarni.h.w vr8, vr4, 12 // t22a 3285 vssrarni.h.w vr24, vr11, 12 // t25a 3286 3287 vsadd.h vr4, vr1, vr31 // t16a 3288 vssub.h vr30, vr1, vr31 // t19a 3289 vsadd.h vr19, vr0, vr9 // t17 3290 vssub.h vr28, vr0, vr9 // t18 3291 vssub.h vr1, vr26, vr10 // t20a 3292 vsadd.h vr31, vr26, vr10 // t23a 3293 vssub.h vr0, vr8, vr6 // t21 3294 vsadd.h vr9, vr8, vr6 // t22 3295 vsadd.h vr10, vr27, vr25 // t24a 3296 vssub.h vr26, vr27, vr25 // t27a 3297 vsadd.h vr6, vr24, vr29 // t25 3298 vssub.h vr8, vr24, vr29 // t26 3299 vssub.h vr25, vr3, vr5 // t28a 3300 vsadd.h vr27, vr3, vr5 // t31a 3301 vssub.h vr24, vr7, vr2 // t29 3302 vsadd.h vr29, vr7, vr2 // t30 3303 3304 vldrepl.w vr20, t0, 8 // 1567 3305 vldrepl.w vr21, t0, 12 // 3784 3306 vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5 3307 vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2 3308 vssrarni.h.w vr5, vr3, 12 // t29a 3309 vssrarni.h.w vr2, vr11, 12 // 18a 3310 3311 vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7 3312 vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24 3313 vssrarni.h.w vr7, vr3, 12 // t28 3314 vssrarni.h.w vr24, vr11, 12 // t19 3315 3316 vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28 3317 vneg.w vr3, vr3 3318 vneg.w vr28, vr28 3319 vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25 3320 vssrarni.h.w vr28, vr3, 12 // t20 3321 vssrarni.h.w vr25, vr11, 12 // t27 3322 3323 vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30 3324 vneg.w vr3, vr3 3325 vneg.w vr30, vr30 3326 vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1 3327 vssrarni.h.w vr30, vr3, 12 // t21a 3328 vssrarni.h.w vr1, vr11, 12 // t26a 3329 3330 vsadd.h vr3, vr4, vr31 // t16 3331 vssub.h vr26, vr4, vr31 // t23 3332 vsadd.h vr0, vr19, vr9 // t17a 3333 vssub.h vr8, vr19, vr9 // t22a 3334 vsadd.h vr4, vr2, vr30 // t18 3335 vssub.h vr31, vr2, vr30 // t21 3336 vsadd.h vr9, vr24, vr28 // t19a 3337 vssub.h vr19, vr24, vr28 // t20a 3338 vssub.h vr2, vr27, vr10 // t24 3339 vsadd.h vr30, vr27, vr10 // t31 3340 vssub.h vr24, vr29, vr6 // t25a 3341 vsadd.h vr28, vr29, vr6 // t30a 3342 vssub.h vr10, vr5, vr1 // t26 3343 vsadd.h vr27, vr5, vr1 // t29 3344 vssub.h vr6, vr7, vr25 // t27a 3345 vsadd.h vr29, vr7, vr25 // t28a 3346 3347 vldrepl.w vr20, t0, 0 // 2896 3348 vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5 3349 vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7 3350 vssrarni.h.w vr5, vr1, 12 // t20 3351 vssrarni.h.w vr7, vr11, 12 // t27 3352 3353 vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25 3354 vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6 3355 vssrarni.h.w vr25, vr1, 12 // t21a 3356 vssrarni.h.w vr6, vr11, 12 // t26a 3357 3358 vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19 3359 vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10 3360 vssrarni.h.w vr19, vr1, 12 // t22 3361 vssrarni.h.w vr10, vr11, 12 // t25 3362 3363 vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31 3364 vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8 3365 vssrarni.h.w vr31, vr1, 12 // t23a 3366 vssrarni.h.w vr8, vr11, 12 // t24a 3367 3368 // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16 3369 // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3 3370 vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 3371 3372 vsadd.h vr1, vr11, vr30 // c[0] 3373 vssub.h vr2, vr11, vr30 // c[31] 3374 vsadd.h vr24, vr12, vr28 // c[1] 3375 vssub.h vr26, vr12, vr28 // c[30] 3376 vsadd.h vr11, vr13, vr27 // c[2] 3377 vssub.h vr30, vr13, vr27 // c[29] 3378 vsadd.h vr12, vr14, vr29 // c[3] 3379 vssub.h vr28, vr14, vr29 // c[28] 3380 vsadd.h vr13, vr15, vr7 // c[4] 3381 vssub.h vr27, vr15, vr7 // c[27] 3382 vsadd.h vr14, vr16, vr6 // c[5] 3383 vssub.h vr29, vr16, vr6 // c[26] 3384 vsadd.h vr7, vr17, vr10 // c[6] 3385 vssub.h vr15, vr17, vr10 // c[25] 3386 vsadd.h vr6, vr18, vr8 // c[7] 3387 vssub.h vr16, vr18, vr8 // c[24] 3388 3389 vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 3390 3391 vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 3392 3393 vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18 3394 3395 vsadd.h vr1, vr11, vr31 // c[8] 3396 vssub.h vr2, vr11, vr31 // c[23] 3397 vsadd.h vr24, vr12, vr19 // c[9] 3398 vssub.h vr26, vr12, vr19 // c[22] 3399 vsadd.h vr11, vr13, vr25 // c[10] 3400 vssub.h vr30, vr13, vr25 // c[21] 3401 vsadd.h vr12, vr14, vr5 // c[11] 3402 vssub.h vr28, vr14, vr5 // c[20] 3403 vsadd.h vr13, vr15, vr9 // c[12] 3404 vssub.h vr27, vr15, vr9 // c[19] 3405 vsadd.h vr14, vr16, vr4 // c[13] 3406 vssub.h vr29, vr16, vr4 // c[18] 3407 vsadd.h vr7, vr17, vr0 // c[14] 3408 vssub.h vr15, vr17, vr0 // c[17] 3409 vsadd.h vr6, vr18, vr3 // c[15] 3410 vssub.h vr16, vr18, vr3 // c[16] 3411 3412 vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6 3413 3414 vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2 3415.endm // dct_8x32_tx64_new_lsx 3416 3417.macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7 3418 vsllwil.hu.bu vr4, vr10, 0 3419 vsllwil.hu.bu vr5, vr11, 0 3420 vsllwil.hu.bu vr6, vr12, 0 3421 vsllwil.hu.bu vr7, vr13, 0 3422 vexth.hu.bu vr10, vr10 3423 vexth.hu.bu vr11, vr11 3424 vexth.hu.bu vr12, vr12 3425 vexth.hu.bu vr13, vr13 3426 vadd.h vr4, vr4, \in0 3427 vadd.h vr10, vr10, \in1 3428 vadd.h vr5, vr5, \in2 3429 vadd.h vr11, vr11, \in3 3430 vadd.h vr6, vr6, \in4 3431 vadd.h vr12, vr12, \in5 3432 vadd.h vr7, vr7, \in6 3433 vadd.h vr13, vr13, \in7 3434 vssrani.bu.h vr10, vr4, 0 3435 vssrani.bu.h vr11, vr5, 0 3436 vssrani.bu.h vr12, vr6, 0 3437 vssrani.bu.h vr13, vr7, 0 3438 vst vr10, a0, 0 3439 vst vr11, a0, 16 3440 vst vr12, a0, 32 3441 vst vr13, a0, 48 3442.endm 3443 3444.macro idct_dc_w64 w, h, shift 3445 ld.h t2, a2, 0 3446 vldi vr0, 0x8b5 3447 vreplgr2vr.w vr1, t2 3448 vldi vr20, 0x880 3449 vmul.w vr2, vr0, vr1 3450 st.h zero, a2, 0 3451 vsrari.w vr2, vr2, 8 3452 vld vr13, a0, 48 3453 3454.if (2*\w == \h) || (2*\h == \w) 3455 vmul.w vr2, vr2, vr0 3456 vsrari.w vr2, vr2, 8 3457.endif 3458 3459.if \shift>0 3460 vsrari.w vr2, vr2, \shift 3461.endif 3462 vld vr11, a0, 16 3463 vmadd.w vr20, vr2, vr0 3464 vld vr12, a0, 32 3465 vssrarni.h.w vr20, vr20, 12 3466 vld vr10, a0, 0 3467.endm 3468 3469function inv_txfm_add_dct_dct_64x64_8bpc_lsx 3470 bnez a3, .NO_HAS_DCONLY_64x64 3471 3472 idct_dc_w64 64, 64, 2 3473 3474 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 3475 3476 li.w t3, 63 3477.loop63: 3478 add.d a0, a0, a1 3479 vld vr10, a0, 0 3480 vld vr11, a0, 16 3481 vld vr12, a0, 32 3482 vld vr13, a0, 48 3483 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 3484 addi.d t3, t3, -1 3485 blt zero, t3, .loop63 3486 b .DCT_DCT_64X64_END 3487.NO_HAS_DCONLY_64x64: 3488 3489 malloc_space 64*32*2+512+512 3490 3491.macro dct64x64_core1_lsx shift, rect2 3492 //addi.d t2, a2, \in0 3493 //addi.d t7, t7, \in1 3494 li.w t4, 64*32*2+64 3495 add.d t3, sp, t4 3496 addi.d t6, t3, 512 3497 add.d t5, t6, zero 3498 3499 dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2 3500 3501 la.local t0, idct64_coeffs 3502 vxor.v vr31, vr31, vr31 3503 3504 //addi.d a4, a2, \in2 // 32 ... 3505 // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a 3506 vld vr0, a4, 128*0 // in1 3507 vld vr1, a4, 128*15 // in31 3508 vld vr2, a4, 128*8 // in17 3509 vld vr3, a4, 128*7 // in15 3510 la.local a6, idct_coeffs 3511.ifc \rect2, rect2_lsx 3512 vldrepl.w vr23, a6, 0 // 2896 3513.irp i, vr0, vr1, vr2, vr3 3514 rect2_lsx \i, vr23, \i 3515.endr 3516.endif 3517 vst vr31, a4, 128*0 3518 vst vr31, a4, 128*15 3519 vst vr31, a4, 128*8 3520 vst vr31, a4, 128*7 3521 dct64_step1_lsx 3522 3523 addi.d t0, t0, 48 3524 addi.d t6, t6, 128 3525 // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a 3526 vld vr0, a4, 128*3 // in7 3527 vld vr1, a4, 128*12 // in25 3528 vld vr2, a4, 128*11 // in23 3529 vld vr3, a4, 128*4 // in9 3530 la.local a6, idct_coeffs 3531.ifc \rect2, rect2_lsx 3532 vldrepl.w vr23, a6, 0 // 2896 3533.irp i, vr0, vr1, vr2, vr3 3534 rect2_lsx \i, vr23, \i 3535.endr 3536.endif 3537 vst vr31, a4, 128*3 3538 vst vr31, a4, 128*12 3539 vst vr31, a4, 128*11 3540 vst vr31, a4, 128*4 3541 dct64_step1_lsx 3542 3543 addi.d t0, t0, 48 3544 addi.d t6, t6, 128 3545 // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a 3546 vld vr0, a4, 128*2 // in5 3547 vld vr1, a4, 128*13 // in27 3548 vld vr2, a4, 128*10 // in21 3549 vld vr3, a4, 128*5 // in11 3550 la.local a6, idct_coeffs 3551.ifc \rect2, rect2_lsx 3552 vldrepl.w vr23, a6, 0 // 2896 3553.irp i, vr0, vr1, vr2, vr3 3554 rect2_lsx \i, vr23, \i 3555.endr 3556.endif 3557 vst vr31, a4, 128*2 3558 vst vr31, a4, 128*13 3559 vst vr31, a4, 128*10 3560 vst vr31, a4, 128*5 3561 dct64_step1_lsx 3562 3563 addi.d t0, t0, 48 3564 addi.d t6, t6, 128 3565 // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a 3566 vld vr0, a4, 128*1 // in3 3567 vld vr1, a4, 128*14 // in29 3568 vld vr2, a4, 128*9 // in19 3569 vld vr3, a4, 128*6 // in13 3570 la.local a6, idct_coeffs 3571.ifc \rect2, rect2_lsx 3572 vldrepl.w vr23, a6, 0 // 2896 3573.irp i, vr0, vr1, vr2, vr3 3574 rect2_lsx \i, vr23, \i 3575.endr 3576.endif 3577 vst vr31, a4, 128*1 3578 vst vr31, a4, 128*14 3579 vst vr31, a4, 128*9 3580 vst vr31, a4, 128*6 3581 dct64_step1_lsx 3582 3583 la.local t0, idct_coeffs 3584 addi.d t4, t5, 16*7 3585 // t32a/t39/t40a/t47/t48/t55a/t56/t63a 3586 dct64_step2_lsx 3587 3588 addi.d t5, t5, 16 3589 addi.d t4, t4, -16 3590 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 3591 dct64_step2_lsx 3592 3593 addi.d t5, t5, 16 3594 addi.d t4, t4, -16 3595 // t34a/t37/t42a/t45/t50/t53a/t58/t61a 3596 dct64_step2_lsx 3597 3598 addi.d t5, t5, 16 3599 addi.d t4, t4, -16 3600 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 3601 dct64_step2_lsx 3602 3603 li.w t4, 64*32*2+64+512 3604 add.d t5, t4, sp 3605 addi.d t4, t5, 16*7 3606 dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128 3607 3608 addi.d t3, t3, 128 3609 addi.d t4, t4, -16*8 3610 addi.d t5, t5, -16*8 3611 dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128 3612 3613 addi.d t5, t5, -16*8 3614 addi.d t4, t4, -16*8 3615 addi.d t3, t3, 128 3616 dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128 3617 3618 addi.d t5, t5, -16*8 3619 addi.d t4, t4, -16*8 3620 addi.d t3, t3, 128 3621 dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128 3622.endm 3623 la.local t8, eob_32x32 3624 addi.d t2, a2, 0 3625 addi.d t7, sp, 64 3626 addi.d t7, t7, 0 3627 addi.d a4, a2, 64 3628.DCT_DCT_EOB_64x64: 3629 ld.h a5, t8, 0 3630 addi.d t8, t8, 2 3631 dct64x64_core1_lsx 2, no_rect2 3632 addi.d t2, t2, 16 3633 addi.d t7, t7, 128*8 3634 addi.d a4, a4, 16 3635 bge a3, a5, .DCT_DCT_EOB_64x64 3636 3637 la.local t8, eob_32x32 3638 vxor.v vr31, vr31, vr31 3639 3640 ld.h t7, t8, 4 3641 bge a3, t7, .DCT_DCT_EOB_64x64_END 3642 li.d t1, 1024*3+64 3643 add.d t0, sp, t1 3644.rept 4 3645 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3646 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3647 addi.d t0, t0, 256 3648.endr 3649 3650 ld.h t7, t8, 2 3651 bge a3, t7, .DCT_DCT_EOB_64x64_END 3652 li.d t1, 1024*2+64 3653 add.d t0, sp, t1 3654.rept 4 3655 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3656 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3657 addi.d t0, t0, 256 3658.endr 3659 ld.h t7, t8, 0 3660 bge a3, t7, .DCT_DCT_EOB_64x64_END 3661 3662 li.d t1, 1024*1+64 3663 add.d t0, sp, t1 3664.rept 4 3665 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3666 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3667 addi.d t0, t0, 256 3668.endr 3669 3670.DCT_DCT_EOB_64x64_END: 3671 3672.macro dct64x64_core2_lsx in0, in1, rect2 3673 addi.d t2, sp, 64+\in0 3674 addi.d t7, sp, 64+\in0 3675 li.w t4, 64*32*2+64 3676 add.d t3, sp, t4 3677 addi.d t6, t3, 512 3678 add.d t5, t6, zero 3679 3680 addi.d t2, t2, 1024 3681 addi.d t2, t2, 1024 3682 dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2 3683 3684 la.local t0, idct64_coeffs 3685 addi.d t2, sp, 64+64*2+\in0 3686 addi.d t4, t2, 256*7 3687 addi.d t4, t4, 256 3688 3689 vld vr0, t2, 256*0 // in1 3690 vld vr1, t4, 256*7 // in31 3691 vld vr2, t4, 256*0 // in17 3692 vld vr3, t2, 256*7 // in15 3693 dct64_step1_lsx 3694 3695 addi.d t0, t0, 48 3696 addi.d t6, t6, 128 3697 vld vr0, t2, 256*3 // in7 3698 vld vr1, t4, 256*4 // in25 3699 vld vr2, t4, 256*3 // in23 3700 vld vr3, t2, 256*4 // in9 3701 dct64_step1_lsx 3702 3703 addi.d t0, t0, 48 3704 addi.d t6, t6, 128 3705 vld vr0, t2, 256*2 // in5 3706 vld vr1, t4, 256*5 // in27 3707 vld vr2, t4, 256*2 // in21 3708 vld vr3, t2, 256*5 // in11 3709 dct64_step1_lsx 3710 3711 addi.d t0, t0, 48 3712 addi.d t6, t6, 128 3713 vld vr0, t2, 256*1 // in3 3714 vld vr1, t4, 256*6 // in29 3715 vld vr2, t4, 256*1 // in19 3716 vld vr3, t2, 256*6 // in13 3717 dct64_step1_lsx 3718 3719 la.local t0, idct_coeffs 3720 addi.d t4, t5, 16*7 3721 // t32a/t39/t40a/t47/t48/t55a/t56/t63a 3722 dct64_step2_lsx 3723 3724 addi.d t5, t5, 16 3725 addi.d t4, t4, -16 3726 // t33/t38a/t41/t46a/t49a/t54/t57a/t62 3727 dct64_step2_lsx 3728 3729 addi.d t5, t5, 16 3730 addi.d t4, t4, -16 3731 // t34a/t37/t42a/t45/t50/t53a/t58/t61a 3732 dct64_step2_lsx 3733 3734 addi.d t5, t5, 16 3735 addi.d t4, t4, -16 3736 // t35/t36a/t43/t44a/t51a/t52/t59a/t60 3737 dct64_step2_lsx 3738 3739 li.w t4, 64*32*2+64+512 3740 add.d t5, t4, sp 3741 addi.d t4, t5, 16*7 3742 addi.d a0, a0, \in1 3743 // 0 - 7, 56 -63 3744 dct64_step3_lsx 3745 li.w t8, 0 3746 mul.w t0, t8, a1 3747 add.d t0, a0, t0 3748 alsl.d t6, a1, t0, 1 3749 addi.d t1, t0, 0 3750 add.d t2, t0, a1 3751 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 3752 li.w t8, 56 3753 mul.w t0, t8, a1 3754 add.d t0, a0, t0 3755 alsl.d t6, a1, t0, 1 3756 addi.d t1, t0, 0 3757 add.d t2, t0, a1 3758 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3759 3760 // 8 - 15, 48 - 55 3761 addi.d t3, t3, 128 3762 addi.d t4, t4, -16*8 3763 addi.d t5, t5, -16*8 3764 dct64_step3_lsx 3765 li.w t8, 8 3766 mul.w t0, t8, a1 3767 add.d t0, t0, a0 3768 alsl.d t6, a1, t0, 1 3769 addi.d t1, t0, 0 3770 add.d t2, t0, a1 3771 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 3772 li.w t8, 48 3773 mul.w t0, t8, a1 3774 add.d t0, t0, a0 3775 alsl.d t6, a1, t0, 1 3776 addi.d t1, t0, 0 3777 add.d t2, t0, a1 3778 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3779 3780 // 16 - 23, 40 - 47 3781 addi.d t3, t3, 128 3782 addi.d t4, t4, -16*8 3783 addi.d t5, t5, -16*8 3784 dct64_step3_lsx 3785 li.w t8, 16 3786 mul.w t0, t8, a1 3787 add.d t0, t0, a0 3788 alsl.d t6, a1, t0, 1 3789 addi.d t1, t0, 0 3790 add.d t2, t0, a1 3791 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 3792 li.w t8, 40 3793 mul.w t0, t8, a1 3794 add.d t0, t0, a0 3795 alsl.d t6, a1, t0, 1 3796 addi.d t1, t0, 0 3797 add.d t2, t0, a1 3798 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3799 3800 // 24 - 31, 32 - 39 3801 addi.d t3, t3, 128 3802 addi.d t4, t4, -16*8 3803 addi.d t5, t5, -16*8 3804 dct64_step3_lsx 3805 li.w t8, 24 3806 mul.w t0, t8, a1 3807 add.d t0, t0, a0 3808 alsl.d t6, a1, t0, 1 3809 addi.d t1, t0, 0 3810 add.d t2, t0, a1 3811 dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1 3812 li.w t8, 32 3813 mul.w t0, t8, a1 3814 add.d t0, t0, a0 3815 alsl.d t6, a1, t0, 1 3816 addi.d t1, t0, 0 3817 add.d t2, t0, a1 3818 dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21 3819.endm 3820 dct64x64_core2_lsx 16*0, 0, no_rect2 3821 dct64x64_core2_lsx 16*1, 8, no_rect2 3822 dct64x64_core2_lsx 16*2, 8, no_rect2 3823 dct64x64_core2_lsx 16*3, 8, no_rect2 3824 dct64x64_core2_lsx 16*4, 8, no_rect2 3825 dct64x64_core2_lsx 16*5, 8, no_rect2 3826 dct64x64_core2_lsx 16*6, 8, no_rect2 3827 dct64x64_core2_lsx 16*7, 8, no_rect2 3828 3829 free_space 64*32*2+512+512 3830.DCT_DCT_64X64_END: 3831endfunc 3832 3833function inv_txfm_add_dct_dct_64x32_8bpc_lsx 3834 bnez a3, .NO_HAS_DCONLY_64x32 3835 3836 idct_dc_w64 64, 32, 1 3837 3838 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 3839 3840 li.w t3, 31 3841.loop31: 3842 add.d a0, a0, a1 3843 vld vr10, a0, 0 3844 vld vr11, a0, 16 3845 vld vr12, a0, 32 3846 vld vr13, a0, 48 3847 DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 3848 addi.d t3, t3, -1 3849 blt zero, t3, .loop31 3850 b .DCT_DCT_64X32_END 3851.NO_HAS_DCONLY_64x32: 3852 malloc_space 64*32*2+512+512 3853 3854 la.local t8, eob_32x32 3855 addi.d t2, a2, 0 3856 addi.d t7, sp, 64 3857 addi.d t7, t7, 0 3858 addi.d a4, a2, 64 3859.DCT_DCT_EOB_64x32: 3860 ld.h a5, t8, 0 3861 addi.d t8, t8, 2 3862 dct64x64_core1_lsx 1, rect2_lsx 3863 addi.d t2, t2, 16 3864 addi.d t7, t7, 128*8 3865 addi.d a4, a4, 16 3866 bge a3, a5, .DCT_DCT_EOB_64x32 3867 3868 la.local t8, eob_32x32 3869 vxor.v vr31, vr31, vr31 3870 3871 ld.h t7, t8, 4 3872 bge a3, t7, .DCT_DCT_EOB_64x32_END 3873 li.d t1, 1024*3+64 3874 add.d t0, sp, t1 3875.rept 4 3876 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3877 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3878 addi.d t0, t0, 256 3879.endr 3880 3881 ld.h t7, t8, 2 3882 bge a3, t7, .DCT_DCT_EOB_64x32_END 3883 li.d t1, 1024*2+64 3884 add.d t0, sp, t1 3885.rept 4 3886 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3887 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3888 addi.d t0, t0, 256 3889.endr 3890 3891 ld.h t7, t8, 0 3892 bge a3, t7, .DCT_DCT_EOB_64x32_END 3893 li.d t1, 1024*1+64 3894 add.d t0, sp, t1 3895.rept 4 3896 vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \ 3897 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 3898 addi.d t0, t0, 256 3899.endr 3900 3901.DCT_DCT_EOB_64x32_END: 3902 addi.d t2, sp, 64 3903 li.w t4, 64*32*2+64 3904 add.d t3, sp, t4 3905 addi.d t5, sp, 64 3906 addi.d t5, t5, 1024 3907 addi.d t5, t5, 1024 3908.rept 8 3909 vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3910 3911 addi.d t4, t2, 1024 3912 addi.d t4, t4, 1024 3913 3914 vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 3915 3916 inv_dct16_lsx no_rect2 3917 3918 vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 3919 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 3920 3921 addi.d t4, t2, 128 3922 vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3923 3924 addi.d t4, t4, 1024 3925 addi.d t4, t4, 1024 3926 3927 vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 3928 3929 dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4 3930 3931 addi.d t2, t2, 16 3932 addi.d t5, t5, 16 3933 addi.d t1, t1, 16 3934.endr 3935 addi.d t2, sp, 64 3936 li.w t3, 32 3937.loop32: 3938 vld vr10, a0, 0 3939 vld vr11, a0, 16 3940 vld vr12, a0, 32 3941 vld vr13, a0, 48 3942 vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 3943 DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 3944 add.d a0, a0, a1 3945 addi.d t2, t2, 128 3946 addi.d t3, t3, -1 3947 blt zero, t3, .loop32 3948 3949 free_space 64*32*2+512+512 3950.DCT_DCT_64X32_END: 3951endfunc 3952 3953.macro VLD_DST_ADD_W8_H32 in0 3954 vld vr4, t3, 0 3955 vld vr5, t3, 16 3956 vld vr6, t3, 32 3957 vld vr7, t3, 48 3958 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 3959 addi.d t3, t3, 64 3960 add.d a0, a1, a0 3961 alsl.d t2, a1, t2, 2 3962 vld vr4, t3, 0 3963 vld vr5, t3, 16 3964 vld vr6, t3, 32 3965 vld vr7, t3, 48 3966 VLD_DST_ADD_W8 vr4, vr5, vr6, vr7 3967 addi.d t3, sp, \in0 3968 add.d a0, a1, a0 3969 alsl.d t2, a1, t2, 2 3970.endm 3971 3972function inv_txfm_add_dct_dct_8x32_8bpc_lsx 3973 bnez a3, .NO_HAS_DCONLY_8x32 3974 3975 idct_dc 8, 32, 2 3976 3977 DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20 3978.rept 7 3979 add.d a0, a1, a0 3980 alsl.d t2, a1, a0, 1 3981 3982 VLD_DST_ADD_W8 vr20, vr20, vr20, vr20 3983.endr 3984 b .DCT_DCT_8X32_END 3985.NO_HAS_DCONLY_8x32: 3986 malloc_space 512 3987 3988 la.local t8, eob_8x32 3989 addi.d t3, sp, 64 3990 addi.d t2, a2, 0 3991.DCT_DCT_EOB_8x32: 3992 ld.h t7, t8, 0 3993 addi.d t8, t8, 2 3994 3995 vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 3996 3997 inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h 3998 3999.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 4000 vsrari.h \i, \i, 2 4001.endr 4002 4003 vxor.v vr31, vr31, vr31 4004 vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4005 4006 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4007 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4008 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4009 4010 vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 4011 4012 addi.d a2, a2, 16 4013 addi.d t3, t3, 128 4014 bge a3, t7, .DCT_DCT_EOB_8x32 4015 4016 la.local t8, eob_8x32 4017 vxor.v vr31, vr31, vr31 4018 ld.h t7, t8, 4 4019 bge a3, t7, .DCT_DCT_EOB_8x32_END 4020 vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4021 4022 ld.h t7, t8, 2 4023 bge a3, t7, .DCT_DCT_EOB_8x32_END 4024 vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4025 4026 ld.h t7, t8, 0 4027 bge a3, t7, .DCT_DCT_EOB_8x32_END 4028 vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4029.DCT_DCT_EOB_8x32_END: 4030 addi.d t2, sp, 64 4031 addi.d t3, sp, 64 4032 4033 vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4034 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4035 4036 inv_dct16_lsx .8h 4037 4038 vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4039 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4040 4041 vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4042 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 4043 4044 dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4 4045 4046 alsl.d t2, a1, a0, 1 4047 addi.d t3, sp, 64 4048 4049 VLD_DST_ADD_W8_H32 320 4050 VLD_DST_ADD_W8_H32 448 4051 VLD_DST_ADD_W8_H32 192 4052 VLD_DST_ADD_W8_H32 0 4053 4054 free_space 512 4055.DCT_DCT_8X32_END: 4056endfunc 4057 4058function inv_txfm_add_identity_identity_8x32_8bpc_lsx 4059 la.local t7, eob_8x32 4060 alsl.d t2, a1, a0, 1 4061 4062.IDENTITY_IDENTITY_EOB_8x32: 4063 ld.h t6, t7, 0 4064 addi.d t7, t7, 2 4065 vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 4066 4067 vxor.v vr23, vr23, vr23 4068 vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23 4069 4070.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 4071 vsrari.h \i, \i, 1 4072.endr 4073 4074 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4075 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \ 4076 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4077 4078.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4079 vsrari.h \i, \i, 2 4080.endr 4081 VLD_DST_ADD_W8 vr16, vr17, vr18, vr19 4082 add.d a0, a1, a0 4083 alsl.d t2, a1, a0, 1 4084 4085 VLD_DST_ADD_W8 vr20, vr21, vr22, vr23 4086 add.d a0, a1, a0 4087 alsl.d t2, a1, a0, 1 4088 4089 addi.d a2, a2, 16 4090 bge a3, t6, .IDENTITY_IDENTITY_EOB_8x32 4091endfunc 4092 4093.macro def_fn_16x4_base txfm 4094functionl inv_txfm_\txfm\()add_16x4_lsx 4095 vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14 4096 4097.ifc \txfm, identity_ 4098 li.w t0, 1697 4099 vreplgr2vr.w vr20, t0 4100.irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14 4101 inv_identity16_lsx \i, vr20, \i, \i, .8h 4102.endr 4103 4104 vilvh.d vr1, vr0, vr0 4105 vilvh.d vr3, vr2, vr2 4106 vilvh.d vr5, vr4, vr4 4107 vilvh.d vr7, vr6, vr6 4108 vilvh.d vr9, vr8, vr8 4109 vilvh.d vr11, vr10, vr10 4110 vilvh.d vr13, vr12, vr12 4111 vilvh.d vr15, vr14, vr14 4112.else 4113 vilvh.d vr1, vr0, vr0 4114 vilvh.d vr3, vr2, vr2 4115 vilvh.d vr5, vr4, vr4 4116 vilvh.d vr7, vr6, vr6 4117 vilvh.d vr9, vr8, vr8 4118 vilvh.d vr11, vr10, vr10 4119 vilvh.d vr13, vr12, vr12 4120 vilvh.d vr15, vr14, vr14 4121 4122 move t6, ra 4123 jirl ra, t7, 0 4124 move ra, t6 4125.endif 4126 4127 vxor.v vr23, vr23, vr23 4128 vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23 4129 4130 LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \ 4131 vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21 4132 4133 LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \ 4134 vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21 4135 4136 vsrari.h vr0, vr0, 1 4137 vsrari.h vr1, vr1, 1 4138 vsrari.h vr2, vr2, 1 4139 vsrari.h vr3, vr3, 1 4140 move t6, ra 4141 jirl ra, t8, 0 4142 move ra, t6 4143 4144 vsrari.h vr8, vr0, 4 4145 vsrari.h vr9, vr1, 4 4146 vsrari.h vr10, vr2, 4 4147 vsrari.h vr11, vr3, 4 4148 vsrari.h vr0, vr4, 1 4149 vsrari.h vr1, vr5, 1 4150 vsrari.h vr2, vr6, 1 4151 vsrari.h vr3, vr7, 1 4152 4153 move t6, ra 4154 jirl ra, t8, 0 4155 move ra, t6 4156 4157 vsrari.h vr16, vr0, 4 4158 vsrari.h vr17, vr1, 4 4159 vsrari.h vr18, vr2, 4 4160 vsrari.h vr19, vr3, 4 4161 4162 alsl.d t2, a1, a0, 1 4163 VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19 4164endfuncl 4165.endm 4166 4167def_fn_16x4_base identity_ 4168def_fn_16x4_base 4169 4170.macro fn_16x4 txfm1, txfm2 4171function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx 4172.ifc \txfm1\()_\txfm2, dct_dct 4173 bnez a3, .NO_HAS_DCONLY_16x4 4174 4175 idct_dc 16, 4, 1 4176 4177 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ 4178 vr20, vr20, vr20, vr20, vr20 4179 b .\txfm1\()_\txfm2\()_16x4_END 4180.NO_HAS_DCONLY_16x4: 4181.endif 4182 4183.ifnc \txfm1, identity 4184 la.local t7, inv_\txfm1\()_4h_x16_lsx 4185.endif 4186 la.local t8, inv_\txfm2\()_8h_x4_lsx 4187 4188.ifc \txfm1, identity 4189 b inv_txfm_identity_add_16x4_lsx 4190.else 4191 b inv_txfm_add_16x4_lsx 4192.endif 4193.\txfm1\()_\txfm2\()_16x4_END: 4194endfunc 4195.endm 4196 4197fn_16x4 dct, dct 4198fn_16x4 identity, identity 4199fn_16x4 adst, dct 4200 4201.macro VLD_DST_ADD_W16_H32 in0 4202 vld vr14, t3, 0 4203 vld vr15, t3, 16 4204 vld vr16, t3, 32 4205 vld vr17, t3, 48 4206 vld vr18, t5, 0 4207 vld vr19, t5, 16 4208 vld vr20, t5, 32 4209 vld vr21, t5, 48 4210 vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \ 4211 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4 4212 VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21 4213 alsl.d a0, a1, a0, 2 4214 alsl.d t2, a1, t2, 2 4215 addi.d t3, t3, 64 4216 addi.d t5, t5, 64 4217 vld vr14, t3, 0 4218 vld vr15, t3, 16 4219 vld vr16, t3, 32 4220 vld vr17, t3, 48 4221 vld vr18, t5, 0 4222 vld vr19, t5, 16 4223 vld vr20, t5, 32 4224 vld vr21, t5, 48 4225 vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \ 4226 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4 4227 VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21 4228 alsl.d a0, a1, a0, 2 4229 alsl.d t2, a1, t2, 2 4230 addi.d t3, sp, \in0 4231 addi.d t5, sp, \in0+512 4232.endm 4233 4234function inv_txfm_add_dct_dct_16x32_8bpc_lsx 4235 bnez a3, .NO_HAS_DCONLY_16x32 4236 4237 idct_dc 16, 32, 1 4238 4239 DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \ 4240 vr20, vr20, vr20, vr20, vr20 4241.rept 7 4242 alsl.d a0, a1, a0, 2 4243 alsl.d t2, a1, a0, 1 4244 4245 VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20 4246.endr 4247 b .DCT_DCT_16x32_END 4248.NO_HAS_DCONLY_16x32: 4249 malloc_space 512+512 4250 4251 addi.d t3, sp, 64 4252 la.local t8, eob_16x32 4253 4254.DCT_DCT_EOB_16x32: 4255 ld.h t7, t8, 0 4256 addi.d t8, t8, 2 4257 vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4258 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4259 4260 vxor.v vr31, vr31, vr31 4261.irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960 4262 vst vr31, a2, \i 4263.endr 4264 4265 li.w t0, 2896 4266 vreplgr2vr.w vr23, t0 4267.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4268 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4269 rect2_lsx \i, vr23, \i 4270.endr 4271 4272 inv_dct16_lsx .8h 4273 4274 LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4275 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4276 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4277 4278 LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 4279 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \ 4280 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23 4281 4282.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4283 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4284 vsrari.h \i, \i, 1 4285.endr 4286 4287 vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, 4288 vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4289 4290 addi.d a2, a2, 16 4291 addi.d t3, t3, 128 4292 bge a3, t7, .DCT_DCT_EOB_16x32 4293 4294 la.local t8, eob_16x32 4295 vxor.v vr31, vr31, vr31 4296 4297 ld.h t7, t8, 4 4298 bge a3, t7, .DCT_DCT_EOB_16x32_END 4299 vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4300 vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4301 4302 ld.h t7, t8, 2 4303 bge a3, t7, .DCT_DCT_EOB_16x32_END 4304 vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4305 vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4306 4307 ld.h t7, t8, 0 4308 bge a3, t7, .DCT_DCT_EOB_16x32_END 4309 vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4310 vst_x8 sp, 64+512+128, 16 vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31 4311 4312.DCT_DCT_EOB_16x32_END: 4313 addi.d t7, sp, 64 4314.rept 2 4315 vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4316 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4317 4318 inv_dct16_lsx .8h 4319 4320 vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4321 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15 4322 4323 vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \ 4324 vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30 4325 4326 dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, , 4327 4328 addi.d t7, t7, 512 4329.endr 4330 alsl.d t2, a1, a0, 1 4331 addi.d t3, sp, 64 4332 addi.d t5, sp, 512+64 4333 4334 VLD_DST_ADD_W16_H32 320 4335 VLD_DST_ADD_W16_H32 448 4336 VLD_DST_ADD_W16_H32 192 4337 VLD_DST_ADD_W16_H32 0 4338 4339 free_space 512+512 4340.DCT_DCT_16x32_END: 4341endfunc 4342 4343.macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1 4344 xvmulwev.w.h \out0, \in0, \in2 4345 xvmulwod.w.h \out1, \in0, \in2 4346 xvmaddwev.w.h \out0, \in1, \in3 4347 xvmaddwod.w.h \out1, \in1, \in3 4348.endm 4349 4350.macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ 4351 in11, in12, in13, in14, in15, out0, out1, out2, out3, \ 4352 out4, out5, out6, out7, out8, out9, out10, out11, out12, \ 4353 out13, out14, out15, shift 4354 xvsrari.h \out0, \in0, \shift 4355 xvsrari.h \out1, \in1, \shift 4356 xvsrari.h \out2, \in2, \shift 4357 xvsrari.h \out3, \in3, \shift 4358 xvsrari.h \out4, \in4, \shift 4359 xvsrari.h \out5, \in5, \shift 4360 xvsrari.h \out6, \in6, \shift 4361 xvsrari.h \out7, \in7, \shift 4362 xvsrari.h \out8, \in8, \shift 4363 xvsrari.h \out9, \in9, \shift 4364 xvsrari.h \out10, \in10, \shift 4365 xvsrari.h \out11, \in11, \shift 4366 xvsrari.h \out12, \in12, \shift 4367 xvsrari.h \out13, \in13, \shift 4368 xvsrari.h \out14, \in14, \shift 4369 xvsrari.h \out15, \in15, \shift 4370.endm 4371 4372.macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1 4373 xvor.v \tmp0, \in0, \in0 4374 xvor.v \tmp1, \in1, \in1 4375 xvpermi.q \out0, \in2, 0x02 4376 xvpermi.q \out1, \in3, 0x02 4377 xvpermi.q \out2, \tmp0, 0x31 4378 xvpermi.q \out3, \tmp1, 0x31 4379.endm 4380 4381.macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7 4382 vext2xv.hu.bu xr0, \in0 4383 vext2xv.hu.bu xr1, \in1 4384 vext2xv.hu.bu xr2, \in2 4385 vext2xv.hu.bu xr3, \in3 4386 xvadd.h xr0, xr0, \in4 4387 xvadd.h xr1, xr1, \in5 4388 xvadd.h xr2, xr2, \in6 4389 xvadd.h xr3, xr3, \in7 4390 xvssrani.bu.h xr1, xr0, 0 4391 xvssrani.bu.h xr3, xr2, 0 4392 xvpermi.d xr0, xr1, 0b11011000 4393 xvpermi.d xr2, xr3, 0b11011000 4394 xvpermi.d xr1, xr0, 0b00001110 4395 xvpermi.d xr3, xr2, 0b00001110 4396 vst vr0, a0, 0 4397 vstx vr1, a0, a1 4398 vst vr2, t2, 0 4399 vstx vr3, t2, a1 4400.endm 4401 4402.macro XVLD_DST_ADD_W16 in0, in1, in2, in3 4403 vld vr0, a0, 0 4404 vldx vr1, a0, a1 4405 vld vr2, t2, 0 4406 vldx vr3, t2, a1 4407 DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3 4408.endm 4409 4410.macro inv_adst16_lasx 4411 la.local t0, iadst16_coeffs_h 4412 4413 xvldrepl.h xr20, t0, 0 // 4091 4414 xvldrepl.h xr21, t0, 2 // 201 4415 xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18 4416 xvneg.h xr20, xr20 4417 xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19 4418 xvilvl.w xr15, xr18, xr16 4419 xvilvl.w xr0, xr19, xr17 4420 xvilvh.w xr18, xr18, xr16 4421 xvilvh.w xr19, xr19, xr17 4422 xvssrarni.h.w xr18, xr15, 12 // t0 4423 xvssrarni.h.w xr19, xr0, 12 // t1 4424 4425 xvldrepl.h xr20, t0, 4 // 3973 4426 xvldrepl.h xr21, t0, 6 // 995 4427 xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0 4428 xvneg.h xr20, xr20 4429 xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15 4430 xvilvl.w xr13, xr0, xr16 4431 xvilvl.w xr2, xr15, xr17 4432 xvilvh.w xr0, xr0, xr16 4433 xvilvh.w xr15, xr15, xr17 4434 xvssrarni.h.w xr0, xr13, 12 // t2 4435 xvssrarni.h.w xr15, xr2, 12 // t3 4436 4437 xvldrepl.h xr20, t0, 8 // 3703 4438 xvldrepl.h xr21, t0, 10 // 1751 4439 xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2 4440 xvneg.h xr20, xr20 4441 xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13 4442 xvilvl.w xr11, xr2, xr16 4443 xvilvl.w xr4, xr13, xr17 4444 xvilvh.w xr2, xr2, xr16 4445 xvilvh.w xr13, xr13, xr17 4446 xvssrarni.h.w xr2, xr11, 12 // t4 4447 xvssrarni.h.w xr13, xr4, 12 // t5 4448 4449 xvldrepl.h xr20, t0, 12 // 3290 -> 1645 4450 xvldrepl.h xr21, t0, 14 // 2440 -> 1220 4451 xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4 4452 xvneg.h xr20, xr20 4453 xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11 4454 xvilvl.w xr9, xr4, xr16 4455 xvilvl.w xr6, xr11, xr17 4456 xvilvh.w xr4, xr4, xr16 4457 xvilvh.w xr11, xr11, xr17 4458 xvssrarni.h.w xr4, xr9, 12 // t6 4459 xvssrarni.h.w xr11, xr6, 12 // t7 4460 4461 xvldrepl.h xr20, t0, 16 // 2751 4462 xvldrepl.h xr21, t0, 18 // 3035 4463 xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6 4464 xvneg.h xr20, xr20 4465 xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9 4466 xvilvl.w xr7, xr6, xr16 4467 xvilvl.w xr8, xr9, xr17 4468 xvilvh.w xr6, xr6, xr16 4469 xvilvh.w xr9, xr9, xr17 4470 xvssrarni.h.w xr6, xr7, 12 // t8 4471 xvssrarni.h.w xr9, xr8, 12 // t9 4472 4473 xvldrepl.h xr20, t0, 20 // 2106 4474 xvldrepl.h xr21, t0, 22 // 3513 4475 xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7 4476 xvneg.h xr20, xr20 4477 xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8 4478 xvilvl.w xr5, xr7, xr16 4479 xvilvl.w xr10, xr8, xr17 4480 xvilvh.w xr7, xr7, xr16 4481 xvilvh.w xr8, xr8, xr17 4482 xvssrarni.h.w xr7, xr5, 12 // t10 4483 xvssrarni.h.w xr8, xr10, 12 // t11 4484 4485 xvldrepl.h xr20, t0, 24 // 1380 4486 xvldrepl.h xr21, t0, 26 // 3857 4487 xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5 4488 xvneg.h xr20, xr20 4489 xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10 4490 xvilvl.w xr3, xr5, xr16 4491 xvilvl.w xr12, xr10, xr17 4492 xvilvh.w xr5, xr5, xr16 4493 xvilvh.w xr10, xr10, xr17 4494 xvssrarni.h.w xr5, xr3, 12 // t12 4495 xvssrarni.h.w xr10, xr12, 12 // t13 4496 4497 xvldrepl.h xr20, t0, 28 // 601 4498 xvldrepl.h xr21, t0, 30 // 4052 4499 xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3 4500 xvneg.h xr20, xr20 4501 xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12 4502 xvilvl.w xr1, xr3, xr16 4503 xvilvl.w xr14, xr12, xr17 4504 xvilvh.w xr3, xr3, xr16 4505 xvilvh.w xr12, xr12, xr17 4506 xvssrarni.h.w xr3, xr1, 12 // t14 4507 xvssrarni.h.w xr12, xr14, 12 // t15 4508 4509 xvsadd.h xr1, xr18, xr6 // t0a 4510 xvssub.h xr14, xr18, xr6 // t8a 4511 xvsadd.h xr16, xr19, xr9 // t1a 4512 xvssub.h xr17, xr19, xr9 // t9a 4513 xvsadd.h xr6, xr0, xr7 // t2a 4514 xvssub.h xr18, xr0, xr7 // t10a 4515 xvsadd.h xr9, xr15, xr8 // t3a 4516 xvssub.h xr19, xr15, xr8 // t11a 4517 xvsadd.h xr0, xr2, xr5 // t4a 4518 xvssub.h xr7, xr2, xr5 // t12a 4519 xvsadd.h xr8, xr13, xr10 // t5a 4520 xvssub.h xr15, xr13, xr10 // t13a 4521 xvsadd.h xr2, xr4, xr3 // t6a 4522 xvssub.h xr5, xr4, xr3 // t14a 4523 xvsadd.h xr10, xr11, xr12 // t7a 4524 xvssub.h xr13, xr11, xr12 // t15a 4525 4526 la.local t0, idct_coeffs_h 4527 4528 xvldrepl.h xr20, t0, 8 // 799 4529 xvldrepl.h xr21, t0, 10 // 4017 4530 xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr3, xr11 4531 xvneg.h xr21, xr21 4532 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12 4533 xvilvl.w xr14, xr11, xr3 4534 xvilvl.w xr17, xr12, xr4 4535 xvilvh.w xr11, xr11, xr3 4536 xvilvh.w xr12, xr12, xr4 4537 xvssrarni.h.w xr11, xr14, 12 // t8 4538 xvssrarni.h.w xr12, xr17, 12 // t9 4539 4540 xvneg.h xr21, xr21 4541 xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14 4542 xvneg.h xr20, xr20 4543 xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17 4544 xvilvl.w xr15, xr14, xr3 4545 xvilvl.w xr7, xr17, xr4 4546 xvilvh.w xr14, xr14, xr3 4547 xvilvh.w xr17, xr17, xr4 4548 xvssrarni.h.w xr14, xr15, 12 // t13 4549 xvssrarni.h.w xr17, xr7, 12 // t12 4550 4551 xvldrepl.h xr20, t0, 12 // 3406 4552 xvldrepl.h xr21, t0, 14 // 2276 4553 xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7 4554 xvneg.h xr21, xr21 4555 xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15 4556 xvilvl.w xr18, xr7, xr3 4557 xvilvl.w xr19, xr15, xr4 4558 xvilvh.w xr7, xr7, xr3 4559 xvilvh.w xr15, xr15, xr4 4560 xvssrarni.h.w xr7, xr18, 12 // t10 4561 xvssrarni.h.w xr15, xr19, 12 // t11 4562 4563 xvneg.h xr21, xr21 4564 xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18 4565 xvneg.h xr20, xr20 4566 xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19 4567 xvilvl.w xr13, xr18, xr3 4568 xvilvl.w xr5, xr19, xr4 4569 xvilvh.w xr18, xr18, xr3 4570 xvilvh.w xr19, xr19, xr4 4571 xvssrarni.h.w xr18, xr13, 12 // t15 4572 xvssrarni.h.w xr19, xr5, 12 // t14 4573 4574 xvsadd.h xr5, xr1, xr0 // t0 4575 xvssub.h xr13, xr1, xr0 // t4 4576 xvsadd.h xr3, xr16, xr8 // t1 4577 xvssub.h xr4, xr16, xr8 // t5 4578 xvsadd.h xr0, xr6, xr2 // t2 4579 xvssub.h xr1, xr6, xr2 // t6 4580 xvsadd.h xr8, xr9, xr10 // t3 4581 xvssub.h xr16, xr9, xr10 // t7 4582 xvsadd.h xr2, xr11, xr17 // t8a 4583 xvssub.h xr6, xr11, xr17 // t12a 4584 xvsadd.h xr9, xr12, xr14 // t9a 4585 xvssub.h xr10, xr12, xr14 // t13a 4586 xvsadd.h xr11, xr7, xr19 // t10a 4587 xvssub.h xr17, xr7, xr19 // t14a 4588 xvsadd.h xr12, xr15, xr18 // t11a 4589 xvssub.h xr14, xr15, xr18 // t15a 4590 4591 la.local t0, idct_coeffs_h 4592 4593 xvldrepl.h xr20, t0, 4 // 1567 4594 xvldrepl.h xr21, t0, 6 // 3784 4595 xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18 4596 xvneg.h xr21, xr21 4597 xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19 4598 xvilvl.w xr13, xr18, xr7 4599 xvilvl.w xr4, xr19, xr15 4600 xvilvh.w xr18, xr18, xr7 4601 xvilvh.w xr19, xr19, xr15 4602 xvssrarni.h.w xr18, xr13, 12 // t4a 4603 xvssrarni.h.w xr19, xr4, 12 // t5a 4604 4605 xvneg.h xr21, xr21 4606 xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4 4607 xvneg.h xr20, xr20 4608 xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13 4609 xvilvl.w xr16, xr4, xr7 4610 xvilvl.w xr1, xr13, xr15 4611 xvilvh.w xr4, xr4, xr7 4612 xvilvh.w xr13, xr13, xr15 4613 xvssrarni.h.w xr4, xr16, 12 // t7a 4614 xvssrarni.h.w xr13, xr1, 12 // t6a 4615 4616 xvneg.h xr20, xr20 4617 xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1 4618 xvneg.h xr21, xr21 4619 xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16 4620 xvilvl.w xr6, xr1, xr7 4621 xvilvl.w xr10, xr16, xr15 4622 xvilvh.w xr1, xr1, xr7 4623 xvilvh.w xr16, xr16, xr15 4624 xvssrarni.h.w xr1, xr6, 12 // t12 4625 xvssrarni.h.w xr16, xr10, 12 // t13 4626 4627 xvneg.h xr21, xr21 4628 xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6 4629 xvneg.h xr20, xr20 4630 xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10 4631 xvilvl.w xr14, xr6, xr7 4632 xvilvl.w xr17, xr10, xr15 4633 xvilvh.w xr6, xr6, xr7 4634 xvilvh.w xr10, xr10, xr15 4635 xvssrarni.h.w xr6, xr14, 12 // t15 4636 xvssrarni.h.w xr10, xr17, 12 // t14 4637 4638 xvsadd.h xr14, xr5, xr0 // out[0] 4639 xvssub.h xr17, xr5, xr0 // t2a 4640 xvssub.h xr7, xr3, xr8 // t3a 4641 xvsadd.h xr15, xr3, xr8 // out[15] 4642 xvsllwil.w.h xr22, xr15, 0 4643 xvexth.w.h xr15, xr15 4644 xvneg.w xr22, xr22 4645 xvneg.w xr15, xr15 4646 xvssrarni.h.w xr15, xr22, 0 // out[15] 4647 xvssub.h xr7, xr3, xr8 // t3a 4648 4649 xvsadd.h xr3, xr19, xr4 // out[12] 4650 xvssub.h xr8, xr19, xr4 // t7 4651 xvssub.h xr0, xr18, xr13 // t6 4652 xvsadd.h xr5, xr18, xr13 // out[3] 4653 xvsllwil.w.h xr22, xr5, 0 4654 xvexth.w.h xr5, xr5 4655 xvneg.w xr22, xr22 4656 xvneg.w xr5, xr5 4657 xvssrarni.h.w xr5, xr22, 0 // out[3] 4658 4659 xvsadd.h xr13, xr9, xr12 // out[14] 4660 xvssub.h xr19, xr9, xr12 // t11 4661 xvssub.h xr4, xr2, xr11 // t10 4662 xvsadd.h xr18, xr2, xr11 // out[1] 4663 xvsllwil.w.h xr22, xr18, 0 4664 xvexth.w.h xr18, xr18 4665 xvneg.w xr22, xr22 4666 xvneg.w xr18, xr18 4667 xvssrarni.h.w xr18, xr22, 0 // out[1] 4668 4669 xvsadd.h xr2, xr1, xr10 // out[2] 4670 xvssub.h xr11, xr1, xr10 // t14a 4671 xvssub.h xr12, xr16, xr6 // t15a 4672 xvsadd.h xr9, xr16, xr6 // out[13] 4673 xvsllwil.w.h xr22, xr9, 0 4674 xvexth.w.h xr9, xr9 4675 xvneg.w xr22, xr22 4676 xvneg.w xr9, xr9 4677 xvssrarni.h.w xr9, xr22, 0 // out[13] 4678 4679 xvldrepl.h xr20, t0, 0 // 2896 4680 xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10 4681 xvneg.h xr21, xr20 4682 xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1 4683 xvilvl.w xr17, xr10, xr6 4684 xvilvl.w xr7, xr1, xr16 4685 xvilvh.w xr10, xr10, xr6 4686 xvilvh.w xr1, xr1, xr16 4687 xvssrarni.h.w xr1, xr7, 12 // out[8] 4688 xvsrari.w xr17, xr17, 12 4689 xvsrari.w xr10, xr10, 12 4690 xvneg.w xr17, xr17 4691 xvneg.w xr10, xr10 4692 xvssrarni.h.w xr10, xr17, 0 // out[7] 4693 4694 xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17 4695 xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7 4696 xvilvl.w xr0, xr17, xr16 4697 xvilvl.w xr8, xr7, xr6 4698 xvilvh.w xr17, xr17, xr16 4699 xvilvh.w xr7, xr7, xr6 4700 xvssrarni.h.w xr7, xr8, 12 // out[4] 4701 xvsrari.w xr0, xr0, 12 4702 xvsrari.w xr17, xr17, 12 4703 xvneg.w xr0, xr0 4704 xvneg.w xr17, xr17 4705 xvssrarni.h.w xr17, xr0, 0 // out[11] 4706 4707 xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0 4708 xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8 4709 xvilvl.w xr4, xr0, xr16 4710 xvilvl.w xr19, xr8, xr6 4711 xvilvh.w xr0, xr0, xr16 4712 xvilvh.w xr8, xr8, xr6 4713 xvssrarni.h.w xr8, xr19, 12 // out[6] 4714 xvsrari.w xr4, xr4, 12 4715 xvsrari.w xr0, xr0, 12 4716 xvneg.w xr4, xr4 4717 xvneg.w xr0, xr0 4718 xvssrarni.h.w xr0, xr4, 0 // out[9] 4719 xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4 4720 xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19 4721 xvilvl.w xr11, xr4, xr6 4722 xvilvl.w xr12, xr19, xr16 4723 xvilvh.w xr4, xr4, xr6 4724 xvilvh.w xr19, xr19, xr16 4725 xvssrarni.h.w xr19, xr12, 12 // out[10] 4726 xvsrari.w xr11, xr11, 12 4727 xvsrari.w xr4, xr4, 12 4728 xvneg.w xr11, xr11 4729 xvneg.w xr4, xr4 4730 xvssrarni.h.w xr4, xr11, 0 // out[5] 4731.endm 4732 4733function inv_txfm_add_adst_adst_16x16_8bpc_lasx 4734 PUSH_REG 4735 xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ 4736 xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15 4737 4738 inv_adst16_lasx 4739 4740 LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ 4741 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ 4742 xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27 4743 4744 LASX_TRANSPOSE8x8_H xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ 4745 xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ 4746 xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27 4747 4748 xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \ 4749 xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \ 4750 xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \ 4751 xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2 4752 4753 xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21 4754 xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21 4755 xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21 4756 xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21 4757 4758 inv_adst16_lasx 4759 4760 xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \ 4761 xr1, xr0, xr19, xr17, xr3, xr9, xr13, xr15, \ 4762 xr14, xr18, xr11, xr5, xr7, xr4, xr8, xr10, \ 4763 xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4 4764 4765 xvxor.v xr23, xr23, xr23 4766.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480 4767 xvst xr23, a2, \i 4768.endr 4769 alsl.d t2, a1, a0, 1 4770 XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5 4771 alsl.d a0, a1, a0, 2 4772 alsl.d t2, a1, a0, 1 4773 XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10 4774 alsl.d a0, a1, a0, 2 4775 alsl.d t2, a1, a0, 1 4776 XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17 4777 alsl.d a0, a1, a0, 2 4778 alsl.d t2, a1, a0, 1 4779 XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15 4780 POP_REG 4781endfunc 4782