/*********************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei([email protected])
 *                Shiyou Yin([email protected])
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *********************************************************************/

/*
 * This file is a LoongArch assembly helper file and available under ISC
 * license.  It provides a large number of macros and aliases to simplify
 * writing assembly code, especially for LSX and LASX optimizations.
 *
 * Anyone can modify it or add new features for his/her own purposes.
 * Contributing a patch will be appreciated as it might be useful for
 * others as well.  Send patches to loongson contributor mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */

#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0

/* Default alignment for code/data emitted by the macros below:
 * .align 5 => 2^5 = 32-byte boundary. */
#define DEFAULT_ALIGN 5

/* Set prefix as needed.  Every exported symbol is prepended with this
 * namespace prefix so the assembly functions cannot collide with other
 * libraries' symbols. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

/* Classic two-level paste so macro arguments are expanded before ##. */
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

/* ASM_PREF is the final symbol prefix.  When PREFIX is defined the target
 * platform mangles C symbols with a leading underscore, so one is added. */
#ifdef PREFIX
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF PRIVATE_PREFIX
#endif

/*
 * Open a global function definition.  Usage:
 *     function name [, align]
 *         ...body...
 *     endfunc
 * The matching `endfunc` is defined (nested) here each time: it emits the
 * return instruction (`jirl $r0, $r1, 0` is the LoongArch `ret`), sets the
 * ELF symbol size, then purges itself so the next `function` invocation can
 * define it again without a "macro redefined" error.
 */
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl    $r0, $r1, 0x0
    .size   ASM_PREF\name, . - ASM_PREF\name
    .purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type  ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm

/*
 * Open a read-only data object in .rodata.  Usage:
 *     const name [, align]
 *         ...data...
 *     endconst
 * Same purge-and-redefine trick as `function`/`endfunc` above.
 * NOTE: the label is not prefixed, so `const` objects are file-local names.
 */
.macro const name, align=DEFAULT_ALIGN
    .macro endconst
        .size \name, . - \name
        .purgem endconst
    .endm
.section .rodata
.align \align
\name:
.endm

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

/* Integer argument/return registers ($r4-$r11 in the LP64 ABI). */
#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

/* Caller-saved temporaries. */
#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

/* Callee-saved registers: must be preserved across calls. */
#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp   $sp
#define ra   $ra

/* FP argument registers and caller-saved FP temporaries. */
#define fa0 $fa0
#define fa1 $fa1
#define fa2 $fa2
#define fa3 $fa3
#define fa4 $fa4
#define fa5 $fa5
#define fa6 $fa6
#define fa7 $fa7
#define ft0 $ft0
#define ft1 $ft1
#define ft2 $ft2
#define ft3 $ft3
#define ft4 $ft4
#define ft5 $ft5
#define ft6 $ft6
#define ft7 $ft7
#define ft8 $ft8
#define ft9 $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
/* Callee-saved FP registers. */
#define fs0 $fs0
#define fs1 $fs1
#define fs2 $fs2
#define fs3 $fs3
143#define fs4 $fs4 144#define fs5 $fs5 145#define fs6 $fs6 146#define fs7 $fs7 147 148#define f0 $f0 149#define f1 $f1 150#define f2 $f2 151#define f3 $f3 152#define f4 $f4 153#define f5 $f5 154#define f6 $f6 155#define f7 $f7 156#define f8 $f8 157#define f9 $f9 158#define f10 $f10 159#define f11 $f11 160#define f12 $f12 161#define f13 $f13 162#define f14 $f14 163#define f15 $f15 164#define f16 $f16 165#define f17 $f17 166#define f18 $f18 167#define f19 $f19 168#define f20 $f20 169#define f21 $f21 170#define f22 $f22 171#define f23 $f23 172#define f24 $f24 173#define f25 $f25 174#define f26 $f26 175#define f27 $f27 176#define f28 $f28 177#define f29 $f29 178#define f30 $f30 179#define f31 $f31 180 181#define vr0 $vr0 182#define vr1 $vr1 183#define vr2 $vr2 184#define vr3 $vr3 185#define vr4 $vr4 186#define vr5 $vr5 187#define vr6 $vr6 188#define vr7 $vr7 189#define vr8 $vr8 190#define vr9 $vr9 191#define vr10 $vr10 192#define vr11 $vr11 193#define vr12 $vr12 194#define vr13 $vr13 195#define vr14 $vr14 196#define vr15 $vr15 197#define vr16 $vr16 198#define vr17 $vr17 199#define vr18 $vr18 200#define vr19 $vr19 201#define vr20 $vr20 202#define vr21 $vr21 203#define vr22 $vr22 204#define vr23 $vr23 205#define vr24 $vr24 206#define vr25 $vr25 207#define vr26 $vr26 208#define vr27 $vr27 209#define vr28 $vr28 210#define vr29 $vr29 211#define vr30 $vr30 212#define vr31 $vr31 213 214#define xr0 $xr0 215#define xr1 $xr1 216#define xr2 $xr2 217#define xr3 $xr3 218#define xr4 $xr4 219#define xr5 $xr5 220#define xr6 $xr6 221#define xr7 $xr7 222#define xr8 $xr8 223#define xr9 $xr9 224#define xr10 $xr10 225#define xr11 $xr11 226#define xr12 $xr12 227#define xr13 $xr13 228#define xr14 $xr14 229#define xr15 $xr15 230#define xr16 $xr16 231#define xr17 $xr17 232#define xr18 $xr18 233#define xr19 $xr19 234#define xr20 $xr20 235#define xr21 $xr21 236#define xr22 $xr22 237#define xr23 $xr23 238#define xr24 $xr24 239#define xr25 $xr25 240#define xr26 $xr26 241#define xr27 $xr27 
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize instructions
 *============================================================================
 */

/*
 * Description : Dot product of vector elements
 *               vd[i] = vj[2i] * vk[2i] + vj[2i+1] * vk[2i+1]
 *               (widening multiply of the even lanes, then widening
 *               multiply-accumulate of the odd lanes)
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 * Return Type - twice the element width of the inputs
 *               .h.bu   : unsigned byte x unsigned byte -> halfword
 *               .h.bu.b : unsigned byte x signed byte   -> halfword
 *               .w.h    : signed halfword x halfword    -> word
 * Note        : vd must not alias vj or vk (it is overwritten first).
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu      \vd, \vj, \vk
    vmaddwod.h.bu     \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b    \vd, \vj, \vk
    vmaddwod.h.bu.b   \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h       \vd, \vj, \vk
    vmaddwod.w.h      \vd, \vj, \vk
.endm

/* 256-bit (LASX) variants of the dot product above. */
.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu     \xd, \xj, \xk
    xvmaddwod.h.bu    \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b   \xd, \xj, \xk
    xvmaddwod.h.bu.b  \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h      \xd, \xj, \xk
    xvmaddwod.w.h     \xd, \xj, \xk
.endm

/*
 * Description : Dot product & addition of vector elements
 *               vd[i] += vj[2i] * vk[2i] + vj[2i+1] * vk[2i+1]
 *               (same as vdp2 but accumulates into the existing vd)
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd (accumulator, read-modify-write)
 * Return Type - twice the element width of the inputs
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu     \vd, \vj, \vk
    vmaddwod.h.bu     \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b   \vd, \vj, \vk
    vmaddwod.h.bu.b   \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h      \vd, \vj, \vk
    vmaddwod.w.h      \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b  \xd, \xj, \xk
    xvmaddwod.h.bu.b  \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h     \xd, \xj, \xk
    xvmaddwod.w.h     \xd, \xj, \xk
.endm

/*
 * Description : Clamp each element of vj into the range [vk[i], va[i]]
 *               vd[i] = min(max(vj[i], vk[i]), va[i])
 */
.macro vclip.h vd, vj, vk, va
    vmax.h   \vd, \vj, \vk
    vmin.h   \vd, \vd, \va
.endm

.macro vclip.w vd, vj, vk, va
    vmax.w   \vd, \vj, \vk
    vmin.w   \vd, \vd, \va
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h  \xd, \xj, \xk
    xvmin.h  \xd, \xd, \xa
.endm

.macro xvclip.w xd, xj, xk, xa
    xvmax.w  \xd, \xj, \xk
    xvmin.w  \xd, \xd, \xa
.endm

/*
 * Description : Clamp each element of vj into the range [0, 255]
 *               vd[i] = min(max(vj[i], 0), 255)
 *               (clamp low end with maxi, then unsigned-saturate to 8 bits)
 */
.macro vclip255.h vd, vj
    vmaxi.h  \vd, \vj, 0
    vsat.hu  \vd, \vd, 7
.endm

.macro vclip255.w vd, vj
    vmaxi.w  \vd, \vj, 0
    vsat.wu  \vd, \vd, 7
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h \xd, \xj, 0
    xvsat.hu \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w \xd, \xj, 0
    xvsat.wu \xd, \xd, 7
.endm

/*
 * Description : Store one element of a vector with pointer post-update
 *               vd : Data vector to be stored
 *               rk : Address of data storage (advanced by ra first!)
 *               ra : Offset added to rk before the store
 *               si : Index of the element in vd
 * Note        : rk is modified (rk += ra), so it walks through a strided
 *               destination across repeated invocations.
 */
.macro vstelmx.b vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.b  \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.h  \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.w  \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.d  \vd, \rk, 0, \si
.endm

/* Register-to-register vector move (OR with itself). */
.macro vmov xd, xj
    vor.v    \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v   \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d     \rk, \rk, \ra
    xvstelm.d \xd, \rk, 0, \si
.endm

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128 or V256 elements from src with a stride:
 * out0 = [src], out1 = [src+stride], out2 = [src+stride2],
 * out3 = [src+stride3].  stride2/stride3 are absolute offsets (registers),
 * typically 2*stride and 3*stride.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s  \out0, \src, 0
    fldx.s \out1, \src, \stride
    fldx.s \out2, \src, \stride2
    fldx.s \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d  \out0, \src, 0
    fldx.d \out1, \src, \stride
    fldx.d \out2, \src, \stride2
    fldx.d \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld  \out0, \src, 0
    vldx \out1, \src, \stride
    vldx \out2, \src, \stride2
    vldx \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld  \out0, \src, 0
    xvldx \out1, \src, \stride
    xvldx \out2, \src, \stride2
    xvldx \out3, \src, \stride3
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 *               (only the low 4 halfwords of each input are used)
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h \tmp0, \in1, \in0
    vilvl.h \tmp1, \in3, \in2
    vilvl.w \out0, \tmp1, \tmp0
    vilvh.w \out2, \tmp1, \tmp0
    vilvh.d \out1, \out0, \out0
    vilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1

    vilvl.w \tmp0, \in1, \in0
    vilvh.w \out1, \in1, \in0
    vilvl.w \tmp1, \in3, \in2
    vilvh.w \out3, \in3, \in2

    vilvl.d \out0, \tmp1, \tmp0
    vilvl.d \out2, \out3, \out1
    vilvh.d \out3, \out3, \out1
    vilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 * Note        : tmp0-tmp7 are scratch; inputs are not preserved in them.
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    /* Interleave low halves of the row pairs. */
    vilvl.h      \tmp0, \in6, \in4
    vilvl.h      \tmp1, \in7, \in5
    vilvl.h      \tmp2, \in2, \in0
    vilvl.h      \tmp3, \in3, \in1

    vilvl.h      \tmp4, \tmp1, \tmp0
    vilvh.h      \tmp5, \tmp1, \tmp0
    vilvl.h      \tmp6, \tmp3, \tmp2
    vilvh.h      \tmp7, \tmp3, \tmp2

    /* Interleave high halves of the row pairs. */
    vilvh.h      \tmp0, \in6, \in4
    vilvh.h      \tmp1, \in7, \in5
    vilvh.h      \tmp2, \in2, \in0
    vilvh.h      \tmp3, \in3, \in1

    vpickev.d    \out0, \tmp4, \tmp6
    vpickod.d    \out1, \tmp4, \tmp6
    vpickev.d    \out2, \tmp5, \tmp7
    vpickod.d    \out3, \tmp5, \tmp7

    vilvl.h      \tmp4, \tmp1, \tmp0
    vilvh.h      \tmp5, \tmp1, \tmp0
    vilvl.h      \tmp6, \tmp3, \tmp2
    vilvh.h      \tmp7, \tmp3, \tmp2

    vpickev.d    \out4, \tmp4, \tmp6
    vpickod.d    \out5, \tmp4, \tmp6
    vpickev.d    \out6, \tmp5, \tmp7
    vpickod.d    \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Stage 1: interleave bytes of row pairs two rows apart. */
    xvilvl.b    \tmp0, \in2, \in0
    xvilvl.b    \tmp1, \in3, \in1
    xvilvl.b    \tmp2, \in6, \in4
    xvilvl.b    \tmp3, \in7, \in5
    xvilvl.b    \tmp4, \in10, \in8
    xvilvl.b    \tmp5, \in11, \in9
    xvilvl.b    \tmp6, \in14, \in12
    xvilvl.b    \tmp7, \in15, \in13
    /* Stage 2: interleave the stage-1 pairs. */
    xvilvl.b    \out0, \tmp1, \tmp0
    xvilvh.b    \out1, \tmp1, \tmp0
    xvilvl.b    \out2, \tmp3, \tmp2
    xvilvh.b    \out3, \tmp3, \tmp2
    xvilvl.b    \out4, \tmp5, \tmp4
    xvilvh.b    \out5, \tmp5, \tmp4
    xvilvl.b    \out6, \tmp7, \tmp6
    xvilvh.b    \out7, \tmp7, \tmp6
    /* Stage 3: interleave at word granularity. */
    xvilvl.w    \tmp0, \out2, \out0
    xvilvh.w    \tmp2, \out2, \out0
    xvilvl.w    \tmp4, \out3, \out1
    xvilvh.w    \tmp6, \out3, \out1
    xvilvl.w    \tmp1, \out6, \out4
    xvilvh.w    \tmp3, \out6, \out4
    xvilvl.w    \tmp5, \out7, \out5
    xvilvh.w    \tmp7, \out7, \out5
    /* Stage 4: interleave at double-word granularity -> final rows. */
    xvilvl.d    \out0, \tmp1, \tmp0
    xvilvh.d    \out1, \tmp1, \tmp0
    xvilvl.d    \out2, \tmp3, \tmp2
    xvilvh.d    \out3, \tmp3, \tmp2
    xvilvl.d    \out4, \tmp5, \tmp4
    xvilvh.d    \out5, \tmp5, \tmp4
    xvilvl.d    \out6, \tmp7, \tmp6
    xvilvh.d    \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 *               (same shuffle as LSX_TRANSPOSE4x4_H, performed per 128-bit
 *               lane of the LASX registers)
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in1, \in0
    xvilvl.h \tmp1, \in3, \in2
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out2, \tmp1, \tmp0
    xvilvh.d \out1, \out0, \out0
    xvilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in2, \in0
    xvilvl.h \tmp1, \in3, \in1
    xvilvl.h \out2, \tmp1, \tmp0
    xvilvh.h \out3, \tmp1, \tmp0

    xvilvl.d \out0, \out2, \out2
    xvilvh.d \out1, \out2, \out2
    xvilvl.d \out2, \out3, \out3
    xvilvh.d \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Interleave low halves of the row pairs. */
    xvilvl.h     \tmp0, \in6, \in4
    xvilvl.h     \tmp1, \in7, \in5
    xvilvl.h     \tmp2, \in2, \in0
    xvilvl.h     \tmp3, \in3, \in1

    xvilvl.h     \tmp4, \tmp1, \tmp0
    xvilvh.h     \tmp5, \tmp1, \tmp0
    xvilvl.h     \tmp6, \tmp3, \tmp2
    xvilvh.h     \tmp7, \tmp3, \tmp2

    /* Interleave high halves of the row pairs. */
    xvilvh.h     \tmp0, \in6, \in4
    xvilvh.h     \tmp1, \in7, \in5
    xvilvh.h     \tmp2, \in2, \in0
    xvilvh.h     \tmp3, \in3, \in1

    xvpickev.d   \out0, \tmp4, \tmp6
    xvpickod.d   \out1, \tmp4, \tmp6
    xvpickev.d   \out2, \tmp5, \tmp7
    xvpickod.d   \out3, \tmp5, \tmp7

    xvilvl.h     \tmp4, \tmp1, \tmp0
    xvilvh.h     \tmp5, \tmp1, \tmp0
    xvilvl.h     \tmp6, \tmp3, \tmp2
    xvilvh.h     \tmp7, \tmp3, \tmp2

    xvpickev.d   \out4, \tmp4, \tmp6
    xvpickod.d   \out5, \tmp4, \tmp6
    xvpickev.d   \out6, \tmp5, \tmp7
    xvpickod.d   \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 *               (two independent 4x4 transposes, one per 128-bit lane)
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3

    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1

    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1

    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 *               (per 128-bit lane, so two 4x4 transposes at once)
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4            1,5, 9,13,  1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8    to      2,6,10,14,  2,6,10,14
 *               9,10,11,12,  9,10,11,12  =====>    3,7,11,15,  3,7,11,15
 *              13,14,15,16, 13,14,15,16            4,8,12,16,  4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1

    xvilvl.w \tmp0, \in1, \in0
    xvilvh.w \out1, \in1, \in0
    xvilvl.w \tmp1, \in3, \in2
    xvilvh.w \out3, \in3, \in2

    xvilvl.d \out0, \tmp1, \tmp0
    xvilvl.d \out2, \out3, \out1
    xvilvh.d \out3, \out3, \out1
    xvilvh.d \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6,
 *                         out7
 * Example     : LASX_TRANSPOSE8x8_W
 *        in0 : 1,2,3,4,5,6,7,8
 *        in1 : 2,2,3,4,5,6,7,8
 *        in2 : 3,2,3,4,5,6,7,8
 *        in3 : 4,2,3,4,5,6,7,8
 *        in4 : 5,2,3,4,5,6,7,8
 *        in5 : 6,2,3,4,5,6,7,8
 *        in6 : 7,2,3,4,5,6,7,8
 *        in7 : 8,2,3,4,5,6,7,8
 *
 *       out0 : 1,2,3,4,5,6,7,8
 *       out1 : 2,2,2,2,2,2,2,2
 *       out2 : 3,3,3,3,3,3,3,3
 *       out3 : 4,4,4,4,4,4,4,4
 *       out4 : 5,5,5,5,5,5,5,5
 *       out5 : 6,6,6,6,6,6,6,6
 *       out6 : 7,7,7,7,7,7,7,7
 *       out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    /* Transpose the two 4x4 word sub-blocks within each lane. */
    xvilvl.w    \tmp0, \in2, \in0
    xvilvl.w    \tmp1, \in3, \in1
    xvilvh.w    \tmp2, \in2, \in0
    xvilvh.w    \tmp3, \in3, \in1
    xvilvl.w    \out0, \tmp1, \tmp0
    xvilvh.w    \out1, \tmp1, \tmp0
    xvilvl.w    \out2, \tmp3, \tmp2
    xvilvh.w    \out3, \tmp3, \tmp2

    xvilvl.w    \tmp0, \in6, \in4
    xvilvl.w    \tmp1, \in7, \in5
    xvilvh.w    \tmp2, \in6, \in4
    xvilvh.w    \tmp3, \in7, \in5
    xvilvl.w    \out4, \tmp1, \tmp0
    xvilvh.w    \out5, \tmp1, \tmp0
    xvilvl.w    \out6, \tmp3, \tmp2
    xvilvh.w    \out7, \tmp3, \tmp2

    /* Recombine the 128-bit lanes across register halves:
     * 0x02 = low lane of dst kept as src's low lane / take other's low,
     * 0x31 = take high lanes; tmp copies preserve the overwritten rows. */
    xmov        \tmp0, \out0
    xmov        \tmp1, \out1
    xmov        \tmp2, \out2
    xmov        \tmp3, \out3
    xvpermi.q   \out0, \out4, 0x02
    xvpermi.q   \out1, \out5, 0x02
    xvpermi.q   \out2, \out6, 0x02
    xvpermi.q   \out3, \out7, 0x02
    xvpermi.q   \out4, \tmp0, 0x31
    xvpermi.q   \out5, \tmp1, 0x31
    xvpermi.q   \out6, \tmp2, 0x31
    xvpermi.q   \out7, \tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     : LASX_TRANSPOSE4x4_D
 *        in0 : 1,2,3,4
 *        in1 : 1,2,3,4
 *        in2 : 1,2,3,4
 *        in3 : 1,2,3,4
 *
 *       out0 : 1,1,1,1
 *       out1 : 2,2,2,2
 *       out2 : 3,3,3,3
 *       out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.d    \tmp0, \in1, \in0
    xvilvh.d    \out1, \in1, \in0
    xvilvh.d    \tmp1, \in3, \in2
    xvilvl.d    \out2, \in3, \in2

    /* Keep copies: the xvpermi.q lane shuffles below overwrite out0/out3. */
    xvor.v      \out0, \tmp0, \tmp0
    xvor.v      \out3, \tmp1, \tmp1

    xvpermi.q   \out0, \out2, 0x02
    xvpermi.q   \out2, \tmp0, 0x31
    xvpermi.q   \out3, \out1, 0x31
    xvpermi.q   \out1, \tmp1, 0x02
.endm