xref: /aosp_15_r20/external/libdav1d/src/loongarch/loongson_asm.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*********************************************************************
2 * Copyright (c) 2022 Loongson Technology Corporation Limited
3 * Contributed by Gu Xiwei([email protected])
4 *                Shiyou Yin([email protected])
5 *
6 * Permission to use, copy, modify, and/or distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 *********************************************************************/
18
19/*
20 * This file is a LoongArch assembly helper file and available under ISC
21 * license. It provides a large number of macros and alias to simplify
22 * writing assembly code, especially for LSX and LASX optimizations.
23 *
24 * Any one can modify it or add new features for his/her own purposes.
25 * Contributing a patch will be appreciated as it might be useful for
26 * others as well. Send patches to loongson contributor mentioned above.
27 *
28 * MAJOR version: Usage changes, incompatible with previous version.
29 * MINOR version: Add new macros/functions, or bug fixes.
30 * MICRO version: Comment changes or implementation changes.
31 */
32
/* Version triple of this macro library (policy is described above). */
#define LML_VERSION_MAJOR   0
#define LML_VERSION_MINOR   4
#define LML_VERSION_MICRO   0

/* Default value of the align argument of the function and const macros. */
#define DEFAULT_ALIGN       5
38
/* Symbol prefix; define PRIVATE_PREFIX beforehand to override the default. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif

/* Two-step paste so that both arguments are macro-expanded before joining. */
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

/* ASM_PREF is PRIVATE_PREFIX, with a leading underscore when PREFIX is set. */
#ifndef PREFIX
#define ASM_PREF PRIVATE_PREFIX
#else
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#endif
52
/*
 * function name [, align]
 *   Switches to .text, applies .align, and opens the global symbol
 *   ASM_PREF<name> typed as a function.
 *   Also defines a one-shot endfunc macro which emits the return jump,
 *   records the symbol size and then purges itself.
 */
.macro function name, align=DEFAULT_ALIGN
    .text
    .align  \align
    .globl  ASM_PREF\name
    .type   ASM_PREF\name, @function
ASM_PREF\name:
    .macro endfunc
        jirl    $zero, $ra, 0       /* same registers as $r0, $r1 */
        .size   ASM_PREF\name, . - ASM_PREF\name
        .purgem endfunc
    .endm
.endm
65
/*
 * const name [, align]
 *   Switches to .rodata, applies .align, and opens the label <name>
 *   (no prefix is added and the symbol is not made global).
 *   Also defines a one-shot endconst macro which records the symbol size
 *   and then purges itself.
 */
.macro const name, align=DEFAULT_ALIGN
    .section .rodata
    .align  \align
\name:
    .macro endconst
        .size   \name, . - \name
        .purgem endconst
    .endm
.endm
75
76/*
77 *============================================================================
78 * LoongArch register alias
79 *============================================================================
80 */
81
/* General-purpose registers a0-a7, usable without the $ sigil. */
#define a0   $a0
#define a1   $a1
#define a2   $a2
#define a3   $a3
#define a4   $a4
#define a5   $a5
#define a6   $a6
#define a7   $a7

/* General-purpose registers t0-t8. */
#define t0   $t0
#define t1   $t1
#define t2   $t2
#define t3   $t3
#define t4   $t4
#define t5   $t5
#define t6   $t6
#define t7   $t7
#define t8   $t8

/* General-purpose registers s0-s8. */
#define s0   $s0
#define s1   $s1
#define s2   $s2
#define s3   $s3
#define s4   $s4
#define s5   $s5
#define s6   $s6
#define s7   $s7
#define s8   $s8

/* zero, sp and ra. */
#define zero $zero
#define sp   $sp
#define ra   $ra
114
/* Floating-point registers fa0-fa7, usable without the $ sigil. */
#define fa0   $fa0
#define fa1   $fa1
#define fa2   $fa2
#define fa3   $fa3
#define fa4   $fa4
#define fa5   $fa5
#define fa6   $fa6
#define fa7   $fa7
/* Floating-point registers ft0-ft15. */
#define ft0   $ft0
#define ft1   $ft1
#define ft2   $ft2
#define ft3   $ft3
#define ft4   $ft4
#define ft5   $ft5
#define ft6   $ft6
#define ft7   $ft7
#define ft8   $ft8
#define ft9   $ft9
#define ft10  $ft10
#define ft11  $ft11
#define ft12  $ft12
#define ft13  $ft13
#define ft14  $ft14
#define ft15  $ft15
/* Floating-point registers fs0-fs7. */
#define fs0   $fs0
#define fs1   $fs1
#define fs2   $fs2
#define fs3   $fs3
#define fs4   $fs4
#define fs5   $fs5
#define fs6   $fs6
#define fs7   $fs7
147
/* Floating-point registers by number, f0-f31, usable without the $ sigil. */
#define f0    $f0
#define f1    $f1
#define f2    $f2
#define f3    $f3
#define f4    $f4
#define f5    $f5
#define f6    $f6
#define f7    $f7
#define f8    $f8
#define f9    $f9
#define f10   $f10
#define f11   $f11
#define f12   $f12
#define f13   $f13
#define f14   $f14
#define f15   $f15
#define f16   $f16
#define f17   $f17
#define f18   $f18
#define f19   $f19
#define f20   $f20
#define f21   $f21
#define f22   $f22
#define f23   $f23
#define f24   $f24
#define f25   $f25
#define f26   $f26
#define f27   $f27
#define f28   $f28
#define f29   $f29
#define f30   $f30
#define f31   $f31
180
/* LSX vector registers vr0-vr31, usable without the $ sigil. */
#define vr0   $vr0
#define vr1   $vr1
#define vr2   $vr2
#define vr3   $vr3
#define vr4   $vr4
#define vr5   $vr5
#define vr6   $vr6
#define vr7   $vr7
#define vr8   $vr8
#define vr9   $vr9
#define vr10  $vr10
#define vr11  $vr11
#define vr12  $vr12
#define vr13  $vr13
#define vr14  $vr14
#define vr15  $vr15
#define vr16  $vr16
#define vr17  $vr17
#define vr18  $vr18
#define vr19  $vr19
#define vr20  $vr20
#define vr21  $vr21
#define vr22  $vr22
#define vr23  $vr23
#define vr24  $vr24
#define vr25  $vr25
#define vr26  $vr26
#define vr27  $vr27
#define vr28  $vr28
#define vr29  $vr29
#define vr30  $vr30
#define vr31  $vr31
213
/* LASX vector registers xr0-xr31, usable without the $ sigil. */
#define xr0   $xr0
#define xr1   $xr1
#define xr2   $xr2
#define xr3   $xr3
#define xr4   $xr4
#define xr5   $xr5
#define xr6   $xr6
#define xr7   $xr7
#define xr8   $xr8
#define xr9   $xr9
#define xr10  $xr10
#define xr11  $xr11
#define xr12  $xr12
#define xr13  $xr13
#define xr14  $xr14
#define xr15  $xr15
#define xr16  $xr16
#define xr17  $xr17
#define xr18  $xr18
#define xr19  $xr19
#define xr20  $xr20
#define xr21  $xr21
#define xr22  $xr22
#define xr23  $xr23
#define xr24  $xr24
#define xr25  $xr25
#define xr26  $xr26
#define xr27  $xr27
#define xr28  $xr28
#define xr29  $xr29
#define xr30  $xr30
#define xr31  $xr31
246
247/*
248 *============================================================================
249 * LSX/LASX synthesize instructions
250 *============================================================================
251 */
252
253/*
254 * Description : Dot product of byte vector elements
255 * Arguments   : Inputs  - vj, vk
256 *               Outputs - vd
257 *               Return Type - halfword
258 */
/*
 * vd = even-element products + odd-element products of vj and vk
 * (unsigned bytes in, half-words out). vd is written by the first
 * instruction while vj/vk are read again by the second, so vd should be a
 * register distinct from vj and vk.
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu        \vd, \vj, \vk
    vmaddwod.h.bu       \vd, \vj, \vk
.endm
263
/*
 * Same even/odd product sum as vdp2.h.bu, using the .h.bu.b instruction
 * forms. vd is overwritten before vj/vk are read the second time.
 */
.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b      \vd, \vj, \vk
    vmaddwod.h.bu.b     \vd, \vj, \vk
.endm
268
/*
 * vd = even-element products + odd-element products of vj and vk
 * (half-words in, words out). vd is overwritten before vj/vk are read the
 * second time.
 */
.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h         \vd, \vj, \vk
    vmaddwod.w.h        \vd, \vj, \vk
.endm
273
/*
 * LASX form of vdp2.h.bu: xd = even products + odd products of xj and xk.
 * xd is overwritten before xj/xk are read the second time.
 */
.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu       \xd, \xj, \xk
    xvmaddwod.h.bu      \xd, \xj, \xk
.endm
278
/*
 * LASX form of vdp2.h.bu.b: xd = even products + odd products of xj and xk.
 * xd is overwritten before xj/xk are read the second time.
 */
.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b     \xd, \xj, \xk
    xvmaddwod.h.bu.b    \xd, \xj, \xk
.endm
283
/*
 * LASX form of vdp2.w.h: xd = even products + odd products of xj and xk.
 * xd is overwritten before xj/xk are read the second time.
 */
.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h        \xd, \xj, \xk
    xvmaddwod.w.h       \xd, \xj, \xk
.endm
288
289/*
 * Description : Dot product of vector elements, accumulated into vd
 * Arguments   : Inputs  - vj, vk (vd is read as the accumulator)
 *               Outputs - vd
 *               Return Type - twice size of input
294 */
/*
 * vd += even-element products + odd-element products of vj and vk
 * (unsigned bytes in, half-words accumulated). vd is updated between the
 * two reads of vj/vk, so it should be distinct from both.
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu       \vd, \vj, \vk
    vmaddwod.h.bu       \vd, \vj, \vk
.endm
299
/*
 * Accumulating even/odd product sum using the .h.bu.b instruction forms.
 * vd is updated between the two reads of vj/vk.
 */
.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b     \vd, \vj, \vk
    vmaddwod.h.bu.b     \vd, \vj, \vk
.endm
304
/*
 * vd += even-element products + odd-element products of vj and vk
 * (half-words in, words accumulated). vd is updated between the two reads
 * of vj/vk.
 */
.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h        \vd, \vj, \vk
    vmaddwod.w.h        \vd, \vj, \vk
.endm
309
/*
 * LASX form of vdp2add.h.bu.b: xd += even products + odd products.
 * xd is updated between the two reads of xj/xk.
 */
.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b    \xd, \xj, \xk
    xvmaddwod.h.bu.b    \xd, \xj, \xk
.endm
314
/*
 * LASX form of vdp2add.w.h: xd += even products + odd products.
 * xd is updated between the two reads of xj/xk.
 */
.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h       \xd, \xj, \xk
    xvmaddwod.w.h       \xd, \xj, \xk
.endm
319
320/*
 * Description : Clamp each element vj[i] to the range vk[i] ~ va[i]
 * clip: vd = min(max(vj, vk), va)
323 */
/*
 * Half-word clamp: vd = min(max(vj, vk), va).
 * va is read after vd has been written, so vd should not be va.
 */
.macro vclip.h vd, vj, vk, va
    vmax.h      \vd, \vj, \vk       /* lower bound */
    vmin.h      \vd, \vd, \va       /* upper bound */
.endm
328
/*
 * Word clamp: vd = min(max(vj, vk), va).
 * va is read after vd has been written, so vd should not be va.
 */
.macro vclip.w vd, vj, vk, va
    vmax.w      \vd, \vj, \vk       /* lower bound */
    vmin.w      \vd, \vd, \va       /* upper bound */
.endm
333
/*
 * LASX half-word clamp: xd = min(max(xj, xk), xa).
 * xa is read after xd has been written, so xd should not be xa.
 */
.macro xvclip.h xd, xj, xk, xa
    xvmax.h     \xd, \xj, \xk       /* lower bound */
    xvmin.h     \xd, \xd, \xa       /* upper bound */
.endm
338
/*
 * LASX word clamp: xd = min(max(xj, xk), xa).
 * xa is read after xd has been written, so xd should not be xa.
 */
.macro xvclip.w xd, xj, xk, xa
    xvmax.w     \xd, \xj, \xk       /* lower bound */
    xvmin.w     \xd, \xd, \xa       /* upper bound */
.endm
343
344/*
 * Description : Clamp each element vj[i] to the range 0 ~ 255
 * clip255: vd = min(max(vj, 0), 255)
347 */
/* Half-word clamp to 0..255: negatives become 0, then saturate to 8 bits. */
.macro vclip255.h vd, vj
    vmaxi.h     \vd, \vj, 0
    vsat.hu     \vd, \vd, 7
.endm
352
/* Word clamp to 0..255: negatives become 0, then saturate to 8 bits. */
.macro vclip255.w vd, vj
    vmaxi.w     \vd, \vj, 0
    vsat.wu     \vd, \vd, 7
.endm
357
/* LASX half-word clamp to 0..255: max with 0, then saturate to 8 bits. */
.macro xvclip255.h xd, xj
    xvmaxi.h    \xd, \xj, 0
    xvsat.hu    \xd, \xd, 7
.endm
362
/* LASX word clamp to 0..255: max with 0, then saturate to 8 bits. */
.macro xvclip255.w xd, xj
    xvmaxi.w    \xd, \xj, 0
    xvsat.wu    \xd, \xd, 7
.endm
367
368/*
 * Description : Store one element of a vector
 * vd : Data vector to be stored
 * rk : Address of data storage (ra is added to it before the store)
 * ra : Offset added to rk
 * si : Index of data in vd
374 */
/*
 * Advance rk by ra, then store byte element si of vd at the new rk.
 * rk is left modified.
 * NOTE(review): the parameter name ra matches the ra register alias
 * defined earlier in this file; confirm the preprocessed macro still
 * expands as intended.
 */
.macro vstelmx.b vd, rk, ra, si
    add.d       \rk, \rk, \ra
    vstelm.b    \vd, \rk, 0, \si
.endm
379
/*
 * Advance rk by ra, then store half-word element si of vd at the new rk.
 * rk is left modified.
 */
.macro vstelmx.h vd, rk, ra, si
    add.d       \rk, \rk, \ra
    vstelm.h    \vd, \rk, 0, \si
.endm
384
/*
 * Advance rk by ra, then store word element si of vd at the new rk.
 * rk is left modified.
 */
.macro vstelmx.w vd, rk, ra, si
    add.d       \rk, \rk, \ra
    vstelm.w    \vd, \rk, 0, \si
.endm
389
/*
 * Advance rk by ra, then store double-word element si of vd at the new rk.
 * rk is left modified.
 */
.macro vstelmx.d vd, rk, ra, si
    add.d       \rk, \rk, \ra
    vstelm.d    \vd, \rk, 0, \si
.endm
394
/* LSX register copy: xd = xj, expressed as xj OR xj. */
.macro vmov xd, xj
    vor.v       \xd, \xj, \xj
.endm
398
/* LASX register copy: xd = xj, expressed as xj OR xj. */
.macro xmov xd, xj
    xvor.v      \xd, \xj, \xj
.endm
402
/*
 * LASX: advance rk by ra, then store double-word element si of xd at the
 * new rk. rk is left modified.
 */
.macro xvstelmx.d xd, rk, ra, si
    add.d       \rk, \rk, \ra
    xvstelm.d   \xd, \rk, 0, \si
.endm
407
408/*
409 *============================================================================
410 * LSX/LASX custom macros
411 *============================================================================
412 */
413
414/*
 * Load 4 float, double, V128 or V256 elements with stride.
416 */
/*
 * Load four single-precision values: out0 from src + 0, out1..out3 from
 * src + stride, src + stride2, src + stride3 (indexed loads).
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s       \out0, \src, 0
    fldx.s      \out1, \src, \stride
    fldx.s      \out2, \src, \stride2
    fldx.s      \out3, \src, \stride3
.endm
423
/*
 * Load four double-precision values: out0 from src + 0, out1..out3 from
 * src + stride, src + stride2, src + stride3 (indexed loads).
 */
.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d       \out0, \src, 0
    fldx.d      \out1, \src, \stride
    fldx.d      \out2, \src, \stride2
    fldx.d      \out3, \src, \stride3
.endm
430
/*
 * Load four LSX vectors: out0 from src + 0, out1..out3 from
 * src + stride, src + stride2, src + stride3 (indexed loads).
 */
.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld         \out0, \src, 0
    vldx        \out1, \src, \stride
    vldx        \out2, \src, \stride2
    vldx        \out3, \src, \stride3
.endm
437
/*
 * Load four LASX vectors: out0 from src + 0, out1..out3 from
 * src + stride, src + stride2, src + stride3 (indexed loads).
 */
.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld        \out0, \src, 0
    xvldx       \out1, \src, \stride
    xvldx       \out2, \src, \stride2
    xvldx       \out3, \src, \stride3
.endm
444
/*
 * Description : Transpose a 4x4 block of half-word elements.
 *               Only the low four half-words of each input are read; each
 *               transposed row is delivered in the low 64 bits of its output.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    /* Pair rows 0/1 and rows 2/3 element by element. */
    vilvl.h     \tmp0, \in1, \in0
    vilvl.h     \tmp1, \in3, \in2
    /* out0 = columns 0|1, out2 = columns 2|3. */
    vilvl.w     \out0, \tmp1, \tmp0
    vilvh.w     \out2, \tmp1, \tmp0
    /* Bring columns 1 and 3 down into the low half. */
    vilvh.d     \out1, \out0, \out0
    vilvh.d     \out3, \out0, \out2
.endm
459
/*
 * Description : Transpose a 4x4 block of word elements.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 * Note        : out1 and out3 are written while inputs are still being
 *               read, so the instruction order below matters when input
 *               and output registers overlap.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    /* Stage 1: interleave words of rows 0/1 and rows 2/3. */
    vilvl.w     \tmp0, \in1, \in0
    vilvh.w     \out1, \in1, \in0
    vilvl.w     \tmp1, \in3, \in2
    vilvh.w     \out3, \in3, \in2
    /* Stage 2: combine the 64-bit halves into full columns. */
    vilvl.d     \out0, \tmp1, \tmp0
    vilvl.d     \out2, \out3, \out1
    vilvh.d     \out3, \out3, \out1
    vilvh.d     \out1, \tmp1, \tmp0
.endm
484
/*
 * Description : Transpose an 8x8 block of half-word elements.
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (overwritten)
 * Note        : every input is read for the last time before the first
 *               output is written.
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    /* Low halves of the inputs: interleave rows two apart. */
    vilvl.h     \tmp0, \in6, \in4
    vilvl.h     \tmp1, \in7, \in5
    vilvl.h     \tmp2, \in2, \in0
    vilvl.h     \tmp3, \in3, \in1

    /* Second interleave gives groups of four consecutive rows. */
    vilvl.h     \tmp4, \tmp1, \tmp0
    vilvh.h     \tmp5, \tmp1, \tmp0
    vilvl.h     \tmp6, \tmp3, \tmp2
    vilvh.h     \tmp7, \tmp3, \tmp2

    /* High halves of the inputs, staged into tmp0..tmp3 for later. */
    vilvh.h     \tmp0, \in6, \in4
    vilvh.h     \tmp1, \in7, \in5
    vilvh.h     \tmp2, \in2, \in0
    vilvh.h     \tmp3, \in3, \in1

    /* Columns 0..3. */
    vpickev.d   \out0, \tmp4, \tmp6
    vpickod.d   \out1, \tmp4, \tmp6
    vpickev.d   \out2, \tmp5, \tmp7
    vpickod.d   \out3, \tmp5, \tmp7

    /* Repeat the second interleave on the staged high halves. */
    vilvl.h     \tmp4, \tmp1, \tmp0
    vilvh.h     \tmp5, \tmp1, \tmp0
    vilvl.h     \tmp6, \tmp3, \tmp2
    vilvh.h     \tmp7, \tmp3, \tmp2

    /* Columns 4..7. */
    vpickev.d   \out4, \tmp4, \tmp6
    vpickod.d   \out5, \tmp4, \tmp6
    vpickev.d   \out6, \tmp5, \tmp7
    vpickod.d   \out7, \tmp5, \tmp7
.endm
523
/*
 * Description : Transpose a 16x8 block of byte elements (16 input rows of
 *               8 bytes become 8 output rows of 16 bytes).
 *               Only xvilvl.b is applied to the inputs, i.e. their low
 *               halves are what gets read.
 * Arguments   : Inputs  - in0, in1, in2, ..., in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (overwritten)
 * Note        : out0..out7 are used as intermediates after all inputs
 *               have been read.
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Stage 1: interleave bytes of rows two apart. */
    xvilvl.b    \tmp0, \in2,  \in0
    xvilvl.b    \tmp1, \in3,  \in1
    xvilvl.b    \tmp2, \in6,  \in4
    xvilvl.b    \tmp3, \in7,  \in5
    xvilvl.b    \tmp4, \in10, \in8
    xvilvl.b    \tmp5, \in11, \in9
    xvilvl.b    \tmp6, \in14, \in12
    xvilvl.b    \tmp7, \in15, \in13
    /* Stage 2: groups of four consecutive rows per column. */
    xvilvl.b    \out0, \tmp1, \tmp0
    xvilvh.b    \out1, \tmp1, \tmp0
    xvilvl.b    \out2, \tmp3, \tmp2
    xvilvh.b    \out3, \tmp3, \tmp2
    xvilvl.b    \out4, \tmp5, \tmp4
    xvilvh.b    \out5, \tmp5, \tmp4
    xvilvl.b    \out6, \tmp7, \tmp6
    xvilvh.b    \out7, \tmp7, \tmp6
    /* Stage 3: groups of eight consecutive rows per column. */
    xvilvl.w    \tmp0, \out2, \out0
    xvilvh.w    \tmp2, \out2, \out0
    xvilvl.w    \tmp4, \out3, \out1
    xvilvh.w    \tmp6, \out3, \out1
    xvilvl.w    \tmp1, \out6, \out4
    xvilvh.w    \tmp3, \out6, \out4
    xvilvl.w    \tmp5, \out7, \out5
    xvilvh.w    \tmp7, \out7, \out5
    /* Stage 4: join rows 0-7 with rows 8-15 to finish each column. */
    xvilvl.d    \out0, \tmp1, \tmp0
    xvilvh.d    \out1, \tmp1, \tmp0
    xvilvl.d    \out2, \tmp3, \tmp2
    xvilvh.d    \out3, \tmp3, \tmp2
    xvilvl.d    \out4, \tmp5, \tmp4
    xvilvh.d    \out5, \tmp5, \tmp4
    xvilvl.d    \out6, \tmp7, \tmp6
    xvilvh.d    \out7, \tmp7, \tmp6
.endm
566
/*
 * Description : Transpose a 4x4 block of half-word elements (LASX version
 *               of LSX_TRANSPOSE4x4_H, same instruction sequence with the
 *               xv forms).
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Pair rows 0/1 and rows 2/3 element by element. */
    xvilvl.h    \tmp0, \in1, \in0
    xvilvl.h    \tmp1, \in3, \in2
    /* out0 = columns 0|1, out2 = columns 2|3. */
    xvilvl.w    \out0, \tmp1, \tmp0
    xvilvh.w    \out2, \tmp1, \tmp0
    /* Bring columns 1 and 3 down into the low half. */
    xvilvh.d    \out1, \out0, \out0
    xvilvh.d    \out3, \out0, \out2
.endm
581
/*
 * Description : Transpose 4x8 block with half-word elements in vectors.
 *               Only the low halves of the inputs are read (xvilvl.h);
 *               the final step copies each column into both 64-bit
 *               halves of its output.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Two interleave rounds: out2 = columns 0|1, out3 = columns 2|3. */
    xvilvl.h    \tmp0, \in2,  \in0
    xvilvl.h    \tmp1, \in3,  \in1
    xvilvl.h    \out2, \tmp1, \tmp0
    xvilvh.h    \out3, \tmp1, \tmp0

    /* Split into one column per output; out2/out3 are consumed last. */
    xvilvl.d    \out0, \out2, \out2
    xvilvh.d    \out1, \out2, \out2
    xvilvl.d    \out2, \out3, \out3
    xvilvh.d    \out3, \out3, \out3
.endm
599
/*
 * Description : Transpose an 8x8 block of half-word elements (LASX version
 *               of LSX_TRANSPOSE8x8_H, same instruction sequence with the
 *               xv forms).
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0 .. tmp7 (overwritten)
 * Note        : every input is read for the last time before the first
 *               output is written.
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    /* Low halves of the inputs: interleave rows two apart. */
    xvilvl.h    \tmp0, \in6, \in4
    xvilvl.h    \tmp1, \in7, \in5
    xvilvl.h    \tmp2, \in2, \in0
    xvilvl.h    \tmp3, \in3, \in1

    /* Second interleave gives groups of four consecutive rows. */
    xvilvl.h    \tmp4, \tmp1, \tmp0
    xvilvh.h    \tmp5, \tmp1, \tmp0
    xvilvl.h    \tmp6, \tmp3, \tmp2
    xvilvh.h    \tmp7, \tmp3, \tmp2

    /* High halves of the inputs, staged into tmp0..tmp3 for later. */
    xvilvh.h    \tmp0, \in6, \in4
    xvilvh.h    \tmp1, \in7, \in5
    xvilvh.h    \tmp2, \in2, \in0
    xvilvh.h    \tmp3, \in3, \in1

    /* Columns 0..3. */
    xvpickev.d  \out0, \tmp4, \tmp6
    xvpickod.d  \out1, \tmp4, \tmp6
    xvpickev.d  \out2, \tmp5, \tmp7
    xvpickod.d  \out3, \tmp5, \tmp7

    /* Repeat the second interleave on the staged high halves. */
    xvilvl.h    \tmp4, \tmp1, \tmp0
    xvilvh.h    \tmp5, \tmp1, \tmp0
    xvilvl.h    \tmp6, \tmp3, \tmp2
    xvilvh.h    \tmp7, \tmp3, \tmp2

    /* Columns 4..7. */
    xvpickev.d  \out4, \tmp4, \tmp6
    xvpickod.d  \out5, \tmp4, \tmp6
    xvpickev.d  \out6, \tmp5, \tmp7
    xvpickod.d  \out7, \tmp5, \tmp7
.endm
638
/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors.
 *               Three interleave rounds (half-word, word, double-word).
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1, tmp2 (overwritten)
 * Note        : out1 and out3 are written while inputs are still being
 *               read, and outputs serve as intermediates, so the
 *               instruction order below must be kept.
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    /* Round 1: half-word interleave of rows 0/1 and rows 2/3. */
    xvilvh.h    \tmp1, \in0, \in1
    xvilvl.h    \out1, \in0, \in1
    xvilvh.h    \tmp0, \in2, \in3
    xvilvl.h    \out3, \in2, \in3

    /* Round 2a: word interleave of the low-half results. */
    xvilvh.w    \tmp2, \out3, \out1
    xvilvl.w    \out3, \out3, \out1

    /* Round 2b: word interleave of the high-half results. */
    xvilvl.w    \out2, \tmp0, \tmp1
    xvilvh.w    \tmp1, \tmp0, \tmp1

    /* Round 3: double-word interleave produces the final outputs. */
    xvilvh.d    \out0, \out2, \out3
    xvilvl.d    \out2, \out2, \out3
    xvilvh.d    \out1, \tmp1, \tmp2
    xvilvl.d    \out3, \tmp1, \tmp2
.endm
662
/*
 * Description : Transpose a 4x4 block of word elements (LASX version of
 *               LSX_TRANSPOSE4x4_W, same instruction sequence with the
 *               xv forms).
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 * Note        : out1 and out3 are written while inputs are still being
 *               read, so the instruction order below matters when input
 *               and output registers overlap.
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
 *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
 *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Stage 1: interleave words of rows 0/1 and rows 2/3. */
    xvilvl.w    \tmp0, \in1, \in0
    xvilvh.w    \out1, \in1, \in0
    xvilvl.w    \tmp1, \in3, \in2
    xvilvh.w    \out3, \in3, \in2
    /* Stage 2: combine the 64-bit halves into full columns. */
    xvilvl.d    \out0, \tmp1, \tmp0
    xvilvl.d    \out2, \out3, \out1
    xvilvh.d    \out3, \out3, \out1
    xvilvh.d    \out1, \tmp1, \tmp0
.endm
687
/*
 * Description : Transpose an 8x8 block of word elements.
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Scratch - tmp0, tmp1, tmp2, tmp3 (overwritten)
 * Note        : out0..out3 are written before in4..in7 are read, so the
 *               instruction order below matters when input and output
 *               registers overlap.
 * Example     : LASX_TRANSPOSE8x8_W
 *         in0 : 1,2,3,4,5,6,7,8
 *         in1 : 2,2,3,4,5,6,7,8
 *         in2 : 3,2,3,4,5,6,7,8
 *         in3 : 4,2,3,4,5,6,7,8
 *         in4 : 5,2,3,4,5,6,7,8
 *         in5 : 6,2,3,4,5,6,7,8
 *         in6 : 7,2,3,4,5,6,7,8
 *         in7 : 8,2,3,4,5,6,7,8
 *
 *        out0 : 1,2,3,4,5,6,7,8
 *        out1 : 2,2,2,2,2,2,2,2
 *        out2 : 3,3,3,3,3,3,3,3
 *        out3 : 4,4,4,4,4,4,4,4
 *        out4 : 5,5,5,5,5,5,5,5
 *        out5 : 6,6,6,6,6,6,6,6
 *        out6 : 7,7,7,7,7,7,7,7
 *        out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    /* Rows 0..3: two word-interleave rounds. */
    xvilvl.w    \tmp0, \in2,  \in0
    xvilvl.w    \tmp1, \in3,  \in1
    xvilvh.w    \tmp2, \in2,  \in0
    xvilvh.w    \tmp3, \in3,  \in1
    xvilvl.w    \out0, \tmp1, \tmp0
    xvilvh.w    \out1, \tmp1, \tmp0
    xvilvl.w    \out2, \tmp3, \tmp2
    xvilvh.w    \out3, \tmp3, \tmp2

    /* Rows 4..7: the same two rounds. */
    xvilvl.w    \tmp0, \in6,  \in4
    xvilvl.w    \tmp1, \in7,  \in5
    xvilvh.w    \tmp2, \in6,  \in4
    xvilvh.w    \tmp3, \in7,  \in5
    xvilvl.w    \out4, \tmp1, \tmp0
    xvilvh.w    \out5, \tmp1, \tmp0
    xvilvl.w    \out6, \tmp3, \tmp2
    xvilvh.w    \out7, \tmp3, \tmp2

    /* Keep copies of out0..out3; the first permutes overwrite them. */
    xvor.v      \tmp0, \out0, \out0
    xvor.v      \tmp1, \out1, \out1
    xvor.v      \tmp2, \out2, \out2
    xvor.v      \tmp3, \out3, \out3
    /* Exchange 128-bit halves between the two row groups. */
    xvpermi.q   \out0, \out4, 0x02
    xvpermi.q   \out1, \out5, 0x02
    xvpermi.q   \out2, \out6, 0x02
    xvpermi.q   \out3, \out7, 0x02
    xvpermi.q   \out4, \tmp0, 0x31
    xvpermi.q   \out5, \tmp1, 0x31
    xvpermi.q   \out6, \tmp2, 0x31
    xvpermi.q   \out7, \tmp3, 0x31
.endm
746
/*
 * Description : Transpose a 4x4 block of double-word elements.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Scratch - tmp0, tmp1 (overwritten)
 * Note        : out1 and out2 are written while inputs are still being
 *               read, so the instruction order below matters when input
 *               and output registers overlap.
 * Example     : LASX_TRANSPOSE4x4_D
 *         in0 : 1,2,3,4
 *         in1 : 1,2,3,4
 *         in2 : 1,2,3,4
 *         in3 : 1,2,3,4
 *
 *        out0 : 1,1,1,1
 *        out1 : 2,2,2,2
 *        out2 : 3,3,3,3
 *        out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    /* Double-word interleave of rows 0/1 and rows 2/3. */
    xvilvl.d    \tmp0, \in1, \in0
    xvilvh.d    \out1, \in1, \in0
    xvilvh.d    \tmp1, \in3, \in2
    xvilvl.d    \out2, \in3, \in2

    /* Seed out0/out3 with copies that the permutes below then patch. */
    xvor.v      \out0, \tmp0, \tmp0
    xvor.v      \out3, \tmp1, \tmp1

    /* Exchange 128-bit halves to complete each column. */
    xvpermi.q   \out0, \out2, 0x02
    xvpermi.q   \out2, \tmp0, 0x31
    xvpermi.q   \out3, \out1, 0x31
    xvpermi.q   \out1, \tmp1, 0x02
.endm
777