/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"
#include "src/loongarch/loongson_util.S"

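// PUSH_REG/POP_REG spill and reload f24-f31, the callee-saved FP registers
// ($fs0-$fs7) in the LoongArch64 psABI, so the transform helpers below can
// freely use the whole vr0-vr31 register file.  malloc_space/free_space
// additionally reserve \number bytes of scratch stack on top of that save
// area.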
.macro PUSH_REG
    addi.d           sp,     sp,    -64
    fst.d            f24,    sp,     0
    fst.d            f25,    sp,     8
    fst.d            f26,    sp,     16
    fst.d            f27,    sp,     24
    fst.d            f28,    sp,     32
    fst.d            f29,    sp,     40
    fst.d            f30,    sp,     48
    fst.d            f31,    sp,     56
.endm

.macro POP_REG
    fld.d            f24,    sp,     0
    fld.d            f25,    sp,     8
    fld.d            f26,    sp,     16
    fld.d            f27,    sp,     24
    fld.d            f28,    sp,     32
    fld.d            f29,    sp,     40
    fld.d            f30,    sp,     48
    fld.d            f31,    sp,     56
    addi.d           sp,     sp,     64
.endm

.macro malloc_space number
    li.w          t0,       \number
    sub.d         sp,       sp,       t0
    addi.d        sp,       sp,       -64
    PUSH_REG
.endm

.macro free_space number
    POP_REG
    li.w          t0,       \number
    add.d         sp,       sp,       t0
    addi.d        sp,       sp,       64
.endm

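// iwht4 performs one 1-D pass of the lifting-based inverse Walsh-Hadamard
// transform (the lossless 4x4 path): only adds, subtracts and a >>1, no
// multiplies.  Rows live in vr0-vr3 and are transformed in place; vr4/vr5
// are used as temporaries.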
.macro iwht4
    vadd.h        vr0,       vr0,     vr1
    vsub.h        vr4,       vr2,     vr3
    vsub.h        vr5,       vr0,     vr4
    vsrai.h       vr5,       vr5,     1
    vsub.h        vr2,       vr5,     vr1
    vsub.h        vr1,       vr5,     vr3
    vadd.h        vr3,       vr4,     vr2
    vsub.h        vr0,       vr0,     vr1
.endm

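// DST_ADD_W4 adds residual to four 4-pixel destination rows: the loaded rows
// in in0-in3 are interleaved pairwise, widened to 16 bit, summed with the
// residual in in4/in5, packed back to u8 with saturation and stored one
// 4-byte element per row.  vstelm.w is the LSX store-element instruction;
// vstelmx.w is assumed to be a helper from the included loongson headers
// that stores an element and then advances a0 by the stride in a1.
// VLD_DST_ADD_W4 is the load-then-add wrapper used by the 4-wide transforms.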
.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w       \in0,     \in1,     \in0  // 0 1  2  3  4  5  6  7 x ...
    vilvl.w       \in2,     \in3,     \in2  // 8 9 10 11 12 13 14 15 x ...
    vsllwil.hu.bu \in0,     \in0,     0
    vsllwil.hu.bu \in2,     \in2,     0
    vadd.h        \in0,     \in4,     \in0
    vadd.h        \in2,     \in5,     \in2
    vssrani.bu.h  \in2,     \in0,     0
    vstelm.w      \in2,     a0,       0,    0
    vstelmx.w     \in2,     a0,       a1,   1
    vstelmx.w     \in2,     a0,       a1,   2
    vstelmx.w     \in2,     a0,       a1,   3
.endm

.macro VLD_DST_ADD_W4 in0, in1
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1

    DST_ADD_W4    vr0, vr1, vr2, vr3, \in0, \in1
.endm

function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld           vr0,       a2,      0
    vld           vr2,       a2,      16

    vxor.v        vr20,      vr20,    vr20
    vsrai.h       vr0,       vr0,     2
    vsrai.h       vr2,       vr2,     2
    vst           vr20,      a2,      0
    vpickod.d     vr1,       vr0,     vr0
    vpickod.d     vr3,       vr2,     vr2
    vst           vr20,      a2,      16

    iwht4

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    iwht4

    vilvl.d       vr4,       vr1,     vr0
    vilvl.d       vr5,       vr3,     vr2
    alsl.d        t2,        a1,      a0,    1
    VLD_DST_ADD_W4 vr4, vr5
endfunc

const idct_coeffs, align=4
    .word          2896, 2896*8, 1567, 3784
    .word          799, 4017, 3406, 2276
    .word          401, 4076, 3166, 2598
    .word          1931, 3612, 3920, 1189
    .word          201, 4091, 3035, 2751
    .word          1751, 3703, 3857, 1380
    .word          995, 3973, 3513, 2106
    .word          2440, 3290, 4052, 601
endconst

.macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
    vsrari.h      \out0,    \in0,     \shift
    vsrari.h      \out1,    \in1,     \shift
    vsrari.h      \out2,    \in2,     \shift
    vsrari.h      \out3,    \in3,     \shift
.endm

.macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                   out1, out2, out3, out4, out5, out6, out7, shift
    vsrari.h      \out0,    \in0,     \shift
    vsrari.h      \out1,    \in1,     \shift
    vsrari.h      \out2,    \in2,     \shift
    vsrari.h      \out3,    \in3,     \shift
    vsrari.h      \out4,    \in4,     \shift
    vsrari.h      \out5,    \in5,     \shift
    vsrari.h      \out6,    \in6,     \shift
    vsrari.h      \out7,    \in7,     \shift
.endm

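// vmulev_vmaddod_lsx is the basic rotation step shared by the DCT/ADST
// stages below: per 16-bit lane it computes in0*in2 + in1*in3 as a 32-bit
// result, with even-lane products in \out0 and odd-lane products in \out1,
// then re-interleaves them into natural order (.4h keeps all four results in
// \out0; .8h leaves results 0-3 in \out0 and 4-7 in \out1, clobbering vr22).
// Callers narrow the Q12 results back to 16 bit with vssrarni.h.w ..., 12.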
.macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz
    vmulwev.w.h   \out0,    \in0,     \in2
    vmulwod.w.h   \out1,    \in0,     \in2
    vmaddwev.w.h  \out0,    \in1,     \in3
    vmaddwod.w.h  \out1,    \in1,     \in3
.ifc \sz, .4h
    vilvl.w       \out0,    \out1,    \out0
.else
    vilvl.w       vr22,     \out1,    \out0
    vilvh.w       \out1,    \out1,    \out0
    vor.v         \out0,    vr22,     vr22
.endif
.endm

const idct_coeffs_h, align=4
    .short          2896, 2896*8, 1567, 3784
    .short          799, 4017, 3406, 2276
    .short          401, 4076, 3166, 2598
    .short          1931, 3612, 3920, 1189
    .short          201, 4091, 3035, 2751
    .short          1751, 3703, 3857, 1380
    .short          995, 3973, 3513, 2106
    .short          2440, 3290, 4052, 601
endconst

const iadst4_coeffs, align=4
    .word          1321, 3803, 2482, 3344
endconst

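// inv_dct4_lsx is the 4-point inverse DCT kernel: t0/t1 come from
// (in0 +/- in2) * 2896 >> 12 and t2/t3 from the (1567, 3784) rotation of
// in1/in3, followed by the output butterflies out0..3 = t0+t3, t1+t2,
// t1-t2, t0-t3 (Q12 with rounding, saturating adds).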
.macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz
    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       0    // 2896
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz
    vssrarni.h.w  vr18,     vr16,     12   // t0
    vssrarni.h.w  vr19,     vr17,     12   // t1

    vldrepl.h     vr20,     t0,       4    // 1567
    vldrepl.h     vr21,     t0,       6    // 3784
    vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz
    vssrarni.h.w  vr16,     \in0,     12   // t3
    vssrarni.h.w  vr17,     \in2,     12   // t2

    vsadd.h       \out0,    vr18,     vr16
    vsadd.h       \out1,    vr19,     vr17
    vssub.h       \out2,    vr19,     vr17
    vssub.h       \out3,    vr18,     vr16
.endm

functionl inv_dct_4h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h
endfuncl

functionl inv_dct_8h_x4_lsx
    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h
endfuncl

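// inv_adst4_core_lsx is the 4-point inverse ADST on 32-bit lanes, using the
// 1321/3803/2482/3344 constants.  With a = 1321*in0 + 3803*in2 + 2482*in3
// and b = 2482*in0 - 1321*in2 - 3803*in3 it produces
//   out0 = a + 3344*in1,   out1 = b + 3344*in1,
//   out2 = 3344*(in0 - in2 + in3),   out3 = a + b - 3344*in1,
// and the callers narrow the Q12 results with a rounding shift of 12.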
.macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3
    vsub.w        vr16,     \in0,    \in2  // in0-in2
    vmul.w        vr17,     \in0,    vr20  // in0*1321
    vmul.w        vr19,     \in0,    vr22  // in0*2482
    vmul.w        vr18,     \in1,    vr23  // in1*3344
    vmadd.w       vr17,     \in2,    vr21  // in0*1321+in2*3803
    vmsub.w       vr19,     \in2,    vr20  // in2*1321
    vadd.w        vr16,     vr16,    \in3  // in0-in2+in3
    vmadd.w       vr17,     \in3,    vr22  // in0*1321+in2*3803+in3*2482
    vmsub.w       vr19,     \in3,    vr21  // in0*2482-in2*1321-in3*3803
    vadd.w        vr15,     vr17,    vr19
    vmul.w        \out2,    vr16,    vr23  // out[2] 8  9  10 11
    vadd.w        \out0,    vr17,    vr18  // out[0] 0  1  2  3
    vadd.w        \out1,    vr19,    vr18  // out[1] 4  5  6  7
    vsub.w        \out3,    vr15,    vr18  // out[3] 12 13 14 15
.endm

.macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local      t0,       iadst4_coeffs

    vldrepl.w     vr20,     t0,      0     // 1321
    vldrepl.w     vr21,     t0,      4     // 3803
    vldrepl.w     vr22,     t0,      8     // 2482
    vldrepl.w     vr23,     t0,      12    // 3344

    vsllwil.w.h   vr0,      \in0,    0
    vsllwil.w.h   vr1,      \in1,    0
    vsllwil.w.h   vr2,      \in2,    0
    vsllwil.w.h   vr3,      \in3,    0
    inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3
    vssrarni.h.w  \out0,    \out0,   12
    vssrarni.h.w  \out1,    \out1,   12
    vssrarni.h.w  \out2,    \out2,   12
    vssrarni.h.w  \out3,    \out3,   12
.endm

functionl inv_adst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

functionl inv_flipadst_4h_x4_lsx
    inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

.macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    la.local      t0,       iadst4_coeffs
    vldrepl.w     vr20,     t0,      0     // 1321
    vldrepl.w     vr21,     t0,      4     // 3803
    vldrepl.w     vr22,     t0,      8     // 2482
    vldrepl.w     vr23,     t0,      12    // 3344

    vsllwil.w.h   vr10,     \in0,     0     // in0
    vsllwil.w.h   vr11,     \in1,     0     // in1
    vsllwil.w.h   vr12,     \in2,     0     // in2
    vsllwil.w.h   vr13,     \in3,     0     // in3
    inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vexth.w.h     \in0,      \in0           // in0
    vexth.w.h     \in1,      \in1           // in1
    vexth.w.h     \in2,      \in2           // in2
    vexth.w.h     \in3,      \in3           // in3
    inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3

    vssrarni.h.w  \out0,     vr10,    12
    vssrarni.h.w  \out1,     vr11,    12
    vssrarni.h.w  \out2,     vr12,    12
    vssrarni.h.w  \out3,     vr13,    12
.endm

functionl inv_adst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
endfuncl

functionl inv_flipadst_8h_x4_lsx
    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
endfuncl

functionl inv_identity_4h_x4_lsx
    li.w          t0,       1697
    vreplgr2vr.h  vr20,     t0

    vilvl.d       vr0,      vr1,      vr0
    vilvl.d       vr2,      vr3,      vr2
    vmulwev.w.h   vr16,     vr0,      vr20
    vmulwod.w.h   vr17,     vr0,      vr20
    vmulwev.w.h   vr18,     vr2,      vr20
    vmulwod.w.h   vr19,     vr2,      vr20
    vilvl.w       vr1,      vr17,     vr16
    vilvh.w       vr3,      vr17,     vr16
    vilvl.w       vr22,     vr19,     vr18
    vilvh.w       vr23,     vr19,     vr18
    vssrarni.h.w  vr3,      vr1,      12
    vssrarni.h.w  vr23,     vr22,     12
    vsadd.h       vr0,      vr3,      vr0  // t0
    vsadd.h       vr2,      vr23,     vr2  // t2
    vilvh.d       vr1,      vr0,      vr0  // t1
    vilvh.d       vr3,      vr2,      vr2  // t3
endfuncl

.macro inv_identity4_lsx1 in0, in1, in2, out0, out1
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in1
    vmul.w        vr18,     vr16,     \in2
    vmul.w        vr19,     vr17,     \in2
    vsrari.w      vr18,     vr18,     12
    vsrari.w      vr19,     vr19,     12
    vadd.w        \out0,    vr18,     vr16
    vadd.w        \out1,    vr19,     vr17
    vssrarni.h.w  \out1,    \out0,    1
.endm

functionl inv_identity_8h_x4_lsx
    li.w          t0,        1697
    vreplgr2vr.h  vr20,      t0
    vmulwev.w.h   vr16,      vr0,     vr20
    vmulwod.w.h   vr17,      vr0,     vr20
    vmulwev.w.h   vr18,      vr1,     vr20
    vmulwod.w.h   vr19,      vr1,     vr20
    vilvl.w       vr21,      vr17,    vr16
    vilvh.w       vr22,      vr17,    vr16
    vilvl.w       vr23,      vr19,    vr18
    vilvh.w       vr16,      vr19,    vr18
    vssrarni.h.w  vr22,      vr21,    12
    vssrarni.h.w  vr16,      vr23,    12
    vsadd.h       vr0,       vr22,    vr0  // t0
    vsadd.h       vr1,       vr16,    vr1  // t1
    vmulwev.w.h   vr16,      vr2,     vr20
    vmulwod.w.h   vr17,      vr2,     vr20
    vmulwev.w.h   vr18,      vr3,     vr20
    vmulwod.w.h   vr19,      vr3,     vr20
    vilvl.w       vr21,      vr17,    vr16
    vilvh.w       vr22,      vr17,    vr16
    vilvl.w       vr23,      vr19,    vr18
    vilvh.w       vr16,      vr19,    vr18
    vssrarni.h.w  vr22,      vr21,    12
    vssrarni.h.w  vr16,      vr23,    12
    vsadd.h       vr2,       vr22,    vr2  // t2
    vsadd.h       vr3,       vr16,    vr3  // t3
endfuncl

functionl inv_identity_8h_x4_lsx1
    li.w          t0,        1697
    vreplgr2vr.w  vr20,      t0
.irp i, vr0, vr1, vr2, vr3
    inv_identity4_lsx1 \i, \i, vr20, vr21, \i
.endr
endfuncl

functionl inv_txfm_add_4x4_lsx
    vxor.v        vr23,     vr23,     vr23
    vld           vr0,      a2,       0
    vld           vr2,      a2,       16
    vilvh.d       vr1,      vr0,      vr0
    vilvh.d       vr3,      vr2,      vr2
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vilvl.d       vr4,      vr1,      vr0
    vilvl.d       vr5,      vr3,      vr2
    vsrari.h      vr4,      vr4,      4
    vsrari.h      vr5,      vr5,      4
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 vr4, vr5
endfuncl

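// idct_dc implements the dc-only fast path: dc = (dc*181 + 128) >> 8, applied
// a second time for rectangular blocks (2*w == h or 2*h == w), then an
// optional extra rounding shift, and finally the replicated pixel offset
// (dc*181 + 128 + 2048) >> 12 is left in vr20 while the first four
// destination rows are preloaded into vr10-vr13.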
.macro idct_dc w, h, shift
    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7

.if (2*\w == \h) || (2*\h == \w)
    vmul.w        vr2,      vr0,      vr2
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
.endif
.if \shift>0
    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
.endif
    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr20,     vr2,      vr0
    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    vssrarni.h.w  vr20,     vr20,     12
    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31
.endm

.macro fun4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       1f

    idct_dc 4, 4, 0

    DST_ADD_W4    vr10, vr11, vr12, vr13, vr20, vr20
    b             .\txfm1\()_\txfm2\()_4X4_END
1:
.endif

    la.local     t7,    inv_\txfm1\()_4h_x4_lsx
    la.local     t8,    inv_\txfm2\()_4h_x4_lsx

    b            inv_txfm_add_4x4_lsx
.\txfm1\()_\txfm2\()_4X4_END:
endfunc
.endm

fun4x4 dct, dct
fun4x4 identity, identity
fun4x4 adst, dct
fun4x4 dct, adst
fun4x4 adst, adst
fun4x4 dct, flipadst
fun4x4 flipadst, adst
fun4x4 adst, flipadst
fun4x4 flipadst, dct
fun4x4 flipadst, flipadst
fun4x4 dct, identity
fun4x4 identity, dct
fun4x4 flipadst, identity
fun4x4 identity, flipadst
fun4x4 identity, adst
fun4x4 adst, identity

const iadst8_coeffs_h, align=4
    .short          4076, 401, 3612, 1931
    .short          2598, 3166, 1189, 3920
    .short          2896, 0, 1567, 3784, 0, 0, 0, 0
endconst

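// inv_adst8_lsx is the 8-point inverse ADST: four (cos, sin) rotations from
// iadst8_coeffs_h, a butterfly stage, the (1567, 3784) rotations, and a
// final 2896-based stage.  The odd outputs (out1, out3, out5, out7) are
// negated as the ADST definition requires, which is why those paths widen,
// negate and repack instead of using a plain rounding shift.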
.macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz
    la.local      t0,       iadst8_coeffs_h

    vldrepl.h     vr20,     t0,       0     // 4076
    vldrepl.h     vr21,     t0,       2     // 401
    vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12    // t0a
    vssrarni.h.w  vr19,     vr18,     12    // t1a

    vldrepl.h     vr20,     t0,       4     // 3612
    vldrepl.h     vr21,     t0,       6     // 1931
    vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz
    vssrarni.h.w  vr16,     vr0,      12    // t2a
    vssrarni.h.w  vr18,     vr7,      12    // t3a

    vldrepl.h     vr20,     t0,       8     // 2598
    vldrepl.h     vr21,     t0,       10    // 3166
    vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz
    vssrarni.h.w  vr0,      vr2,      12    // t4a
    vssrarni.h.w  vr7,      vr5,      12    // t5a

    vldrepl.h     vr20,     t0,       12    // 1189
    vldrepl.h     vr21,     t0,       14    // 3920
    vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz
    vssrarni.h.w  vr2,      vr3,      12    // t6a
    vssrarni.h.w  vr5,      vr4,      12    // t7a

    vsadd.h       vr3,      vr17,     vr0   // t0
    vssub.h       vr4,      vr17,     vr0   // t4
    vsadd.h       vr1,      vr19,     vr7   // t1
    vssub.h       vr6,      vr19,     vr7   // t5
    vsadd.h       vr17,     vr16,     vr2   // t2
    vssub.h       vr19,     vr16,     vr2   // t6
    vsadd.h       vr0,      vr18,     vr5   // t3
    vssub.h       vr7,      vr18,     vr5   // t7

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       4     // 1567
    vldrepl.h     vr21,     t0,       6     // 3784
    vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz
    vssrarni.h.w  vr5,      vr16,     12    // t4a
    vssrarni.h.w  vr2,      vr18,     12    // t5a

    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz
    vssrarni.h.w  vr16,     vr4,      12    // t7a
    vssrarni.h.w  vr18,     vr6,      12    // t6a

    vsadd.h       vr4,      vr5,      vr18  // out1
    vssub.h       vr19,     vr5,      vr18  // t6
    vsadd.h       vr20,     vr1,      vr0   // out7
    vssub.h       vr18,     vr1,      vr0   // t3
    vsadd.h       \out0,    vr3,      vr17  // out0
    vssub.h       vr5,      vr3,      vr17  // t2
    vsadd.h       \out6,    vr2,      vr16  // out6
    vssub.h       vr23,     vr2,      vr16  // t7

    vsllwil.w.h   vr3,      vr20,     0     // out7
    vexth.w.h     \out7,    vr20            // out7
    vsllwil.w.h   vr21,     vr4,      0     // out1
    vexth.w.h     \out1,    vr4             // out1
    vneg.w        vr3,      vr3
    vneg.w        \out7,    \out7
    vneg.w        vr21,     vr21
    vneg.w        \out1,    \out1
    vssrarni.h.w  \out7,    vr3,      0
    vssrarni.h.w  \out1,    vr21,     0

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       0     // 2896
    vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz
    vsrari.w      vr16,     vr16,     12
    vsrari.w      \out3,    \out3,    12
    vneg.w        vr16,     vr16
    vneg.w        \out3,    \out3
    vssrarni.h.w  \out3,    vr16,     0     // out3
    vssrarni.h.w  \out4,    vr17,     12    // out4

    vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz
    vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz
    vssrarni.h.w  \out2,    vr16,     12    // out2
    vsrari.w      vr17,     vr17,     12
    vsrari.w      \out5,    \out5,    12
    vneg.w        vr17,     vr17
    vneg.w        \out5,    \out5
    vssrarni.h.w  \out5,    vr17,     0     // out5
.endm

functionl inv_adst_8h_x8_lsx
    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

functionl inv_flipadst_8h_x8_lsx
    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl

functionl inv_adst_4h_x8_lsx
    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

functionl inv_flipadst_4h_x8_lsx
    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
endfuncl

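// inv_dct8_lsx builds the 8-point inverse DCT from inv_dct4_lsx on the even
// inputs plus rotations of the odd inputs by (799, 4017) and (3406, 2276),
// a 2896 half-butterfly for t5/t6, and the final output butterflies c[0..7].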
.macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz
    inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,       8        // 799
    vldrepl.h     vr21,     t0,       10       // 4017
    vmulev_vmaddod_lsx  \in1, \in7, vr21, vr20, vr16, vr17, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx  \in1, \in7, vr20, vr21, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12       // t7a
    vssrarni.h.w  vr19,     vr18,     12       // t4a

    vldrepl.h     vr20,     t0,       12       // 3406
    vldrepl.h     vr21,     t0,       14       // 2276
    vmulev_vmaddod_lsx  \in5, \in3, vr21, vr20, \in1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx  \in5, \in3, vr20, vr21, \in7, vr18, \sz
    vssrarni.h.w  vr16,     \in1,       12      // t6a
    vssrarni.h.w  vr18,     \in7,       12      // t5a

    vssub.h       \in7,     vr19,      vr18     // t5a
    vsadd.h       vr18,     vr19,      vr18     // t4
    vssub.h       \in5,     vr17,      vr16     // t6a
    vsadd.h       vr16,     vr17,      vr16     // t7

    vldrepl.h     vr20,     t0,        0        // 2896
    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr20, \in1, vr17, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr21, vr23, vr19, \sz
    vssrarni.h.w  vr17,     \in1,      12       // t6
    vssrarni.h.w  vr19,     vr23,      12       // t5

    vssub.h       \in7,      \in0,     vr16     //c[7]
    vsadd.h       \in0,      \in0,     vr16     //c[0]
    vssub.h       \in5,      \in4,     vr19     //c[5]
    vsadd.h       vr23,      \in4,     vr19     //c[2]
    vssub.h       \in4,      \in6,     vr18     //c[4]
    vsadd.h       \in3,      \in6,     vr18     //c[3]
    vssub.h       \in6,      \in2,     vr17     //c[6]
    vsadd.h       \in1,      \in2,     vr17     //c[1]
    vor.v         \in2,      vr23,     vr23
.endm

functionl inv_dct_8h_x8_lsx
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
endfuncl

functionl inv_dct_4h_x8_lsx
    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h
endfuncl

.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
    vsllwil.hu.bu vr0,      \in0,     0
    vsllwil.hu.bu vr1,      \in1,     0
    vsllwil.hu.bu vr2,      \in2,     0
    vsllwil.hu.bu vr3,      \in3,     0
    vadd.h        vr0,      \in4,     vr0
    vadd.h        vr1,      \in5,     vr1
    vadd.h        vr2,      \in6,     vr2
    vadd.h        vr3,      \in7,     vr3
    vssrani.bu.h  vr1,      vr0,      0
    vssrani.bu.h  vr3,      vr2,      0
    vstelm.d      vr1,      a0,       0,    0
    vstelmx.d     vr1,      a0,       a1,   1
    vstelmx.d     vr3,      a0,       a1,   0
    vstelmx.d     vr3,      a0,       a1,   1
.endm

.macro VLD_DST_ADD_W8 in0, in1, in2, in3
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1

    DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
.endm

functionl inv_identity_8h_x8_lsx
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsadd.h       \i,       \i,       \i
.endr
endfuncl

functionl inv_identity_4h_x8_lsx
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsadd.h       \i,       \i,       \i
.endr
endfuncl

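// def_fn_8x8_base emits the shared 8x8 driver: load the 8x8 coefficient
// block, zero it in memory, run the first 1-D transform through the function
// pointer in t7, shift by 1, transpose, run the second transform through t8,
// shift by 4 and accumulate into the destination.  The identity_ variant can
// jump straight to the epilog because the identity transform's doubling and
// the intermediate >>1 cancel out.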
.macro def_fn_8x8_base variant
functionl inv_txfm_\variant\()add_8x8_lsx
    vxor.v  vr23, vr23, vr23
    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.ifc \variant, identity_
    // The identity shl #1 and downshift srshr #1 cancel out
    b             .itx_8x8_epilog
.else

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vsrari.h      \i,       \i,       1
.endr

.itx_8x8_epilog:
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4

    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    add.d         a0,       a0,       a1
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
.endif
endfuncl
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

.macro fn8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_8x8

    idct_dc 8, 8, 1

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20

    b             .\txfm1\()_\txfm2\()_8X8_END
.NO_HAS_DCONLY_8x8:
.endif
    la.local      t8,       inv_\txfm2\()_8h_x8_lsx
.ifc \txfm1, identity
    b             inv_txfm_identity_add_8x8_lsx
.else
    la.local      t7,       inv_\txfm1\()_8h_x8_lsx
    b             inv_txfm_add_8x8_lsx
.endif
.\txfm1\()_\txfm2\()_8X8_END:
endfunc
.endm

fn8x8 dct, dct
fn8x8 identity, identity
fn8x8 dct, adst
fn8x8 dct, flipadst
fn8x8 dct, identity
fn8x8 adst, dct
fn8x8 adst, adst
fn8x8 adst, flipadst
fn8x8 flipadst, dct
fn8x8 flipadst, adst
fn8x8 flipadst, flipadst
fn8x8 identity, dct
fn8x8 adst, identity
fn8x8 flipadst, identity
fn8x8 identity, adst
fn8x8 identity, flipadst

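// rect2_lsx pre-scales coefficients of rectangular transform sizes by
// 2896/4096 (about 1/sqrt(2)) with Q12 rounding, the usual rect2 scaling
// for non-square blocks.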
.macro rect2_lsx in0, in1, out0
    vsllwil.w.h   vr22,     \in0,     0     // in1
    vexth.w.h     \in0,     \in0            // in1
    vmul.w        vr22,     vr22,     \in1
    vmul.w        \out0,    \in0,     \in1
    vssrarni.h.w  \out0,    vr22,     12
.endm

.macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
    vilvl.h       \tmp0,    \in1,     \in0
    vilvl.h       \tmp1,    \in3,     \in2
    vilvl.w       \tmp2,    \tmp1,    \tmp0
    vilvh.w       \tmp3,    \tmp1,    \tmp0
    vilvl.h       \tmp0,    \in5,     \in4
    vilvl.h       \tmp1,    \in7,     \in6
    vilvl.w       \tmp4,    \tmp1,    \tmp0
    vilvh.w       \tmp5,    \tmp1,    \tmp0
    vilvl.d       \out0,    \tmp4,    \tmp2
    vilvh.d       \out1,    \tmp4,    \tmp2
    vilvl.d       \out2,    \tmp5,    \tmp3
    vilvh.d       \out3,    \tmp5,    \tmp3
.endm

functionl inv_txfm_add_8x4_lsx
    vxor.v        vr23,     vr23,     vr23
    vld           vr0,      a2,       0
    vld           vr2,      a2,       16
    vld           vr4,      a2,       32
    vld           vr6,      a2,       48
.irp i, 0, 16, 32, 48
    vst           vr23,     a2,       \i
.endr

    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
    rect2_lsx     vr0,      vr23,     vr0
    rect2_lsx     vr2,      vr23,     vr2
    rect2_lsx     vr4,      vr23,     vr4
    rect2_lsx     vr6,      vr23,     vr6

    vilvh.d       vr1,      vr0,      vr0
    vilvh.d       vr3,      vr2,      vr2
    vilvh.d       vr5,      vr4,      vr4
    vilvh.d       vr7,      vr6,      vr6

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4

    alsl.d        t2,       a1,       a0,     1
    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
endfuncl

.macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \
                          out5, out6, out7, tmp0, tmp1, tmp2, tmp3
    vilvl.h       \tmp0,    \in1,     \in0
    vilvl.h       \tmp1,    \in3,     \in2
    vilvh.h       \tmp2,    \in1,     \in0
    vilvh.h       \tmp3,    \in3,     \in2
    vilvl.w       \out0,    \tmp1,    \tmp0
    vilvh.w       \out2,    \tmp1,    \tmp0
    vilvl.w       \out4,    \tmp3,    \tmp2
    vilvh.w       \out6,    \tmp3,    \tmp2

    vbsrl.v       \out1,    \out0,    8
    vbsrl.v       \out3,    \out2,    8
    vbsrl.v       \out5,    \out4,    8
    vbsrl.v       \out7,    \out6,    8
    vinsgr2vr.d   \out0,    zero,     1
    vinsgr2vr.d   \out2,    zero,     1
    vinsgr2vr.d   \out4,    zero,     1
    vinsgr2vr.d   \out6,    zero,     1
.endm

functionl inv_txfm_add_4x8_lsx
    vxor.v        vr23,     vr23,     vr23
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16
    vld           vr2,      a2,       32
    vld           vr3,      a2,       48
.irp i, 0, 16, 32, 48
    vst           vr23,     a2,       \i
.endr

    li.w          t0,       2896
    vreplgr2vr.w  vr23,     t0
    rect2_lsx     vr0,      vr23,     vr0
    rect2_lsx     vr1,      vr23,     vr1
    rect2_lsx     vr2,      vr23,     vr2
    rect2_lsx     vr3,      vr23,     vr3

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vilvl.d       vr0,      vr1,      vr0
    vilvl.d       vr1,      vr3,      vr2
    vilvl.d       vr2,      vr5,      vr4
    vilvl.d       vr3,      vr7,      vr6

    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4

    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 vr16, vr17
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 vr18, vr19
endfuncl

.macro fn8x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_8x4

    idct_dc 8, 4, 0

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20

    b             .\txfm1\()_\txfm2\()_8X4_END
.NO_HAS_DCONLY_8x4:
.endif
    la.local      t7,       inv_\txfm1\()_4h_x8_lsx
    la.local      t8,       inv_\txfm2\()_8h_x4_lsx
    b             inv_txfm_add_8x4_lsx
.\txfm1\()_\txfm2\()_8X4_END:
endfunc
.endm

fn8x4 dct, dct
fn8x4 identity, identity
fn8x4 dct, adst
fn8x4 dct, flipadst
fn8x4 dct, identity
fn8x4 adst, dct
fn8x4 adst, adst
fn8x4 adst, flipadst
fn8x4 flipadst, dct
fn8x4 flipadst, adst
fn8x4 flipadst, flipadst
fn8x4 identity, dct
fn8x4 adst, identity
fn8x4 flipadst, identity
fn8x4 identity, adst
fn8x4 identity, flipadst

.macro fn4x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_4x8

    idct_dc 4, 8, 0

    DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20

    add.d         a0,       a0,       a1
    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr20, vr20
    b             .\txfm1\()_\txfm2\()_4X8_END
.NO_HAS_DCONLY_4x8:
.endif
    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
    la.local      t8,       inv_\txfm2\()_4h_x8_lsx
    b             inv_txfm_add_4x8_lsx
.\txfm1\()_\txfm2\()_4X8_END:
endfunc
.endm

fn4x8 dct, dct
fn4x8 identity, identity
fn4x8 dct, adst
fn4x8 dct, flipadst
fn4x8 dct, identity
fn4x8 adst, dct
fn4x8 adst, adst
fn4x8 adst, flipadst
fn4x8 flipadst, dct
fn4x8 flipadst, adst
fn4x8 flipadst, flipadst
fn4x8 identity, dct
fn4x8 adst, identity
fn4x8 flipadst, identity
fn4x8 identity, adst
fn4x8 identity, flipadst

.macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
    vsllwil.w.h   vr4,      \in0,    0
    vexth.w.h     vr5,      \in0
    vsllwil.w.h   vr6,      \in1,    0
    vexth.w.h     vr7,      \in1
    vmul.w        vr4,      vr4,     \in2
    vmul.w        vr5,      vr5,     \in2
    vmul.w        vr6,      vr6,     \in2
    vmul.w        vr7,      vr7,     \in2
    vssrarni.h.w  vr5,      vr4,     12
    vssrarni.h.w  vr7,      vr6,     12
    vsadd.h       \out0,    vr5,     \in3
    vsadd.h       \out1,    vr7,     \in4
.endm

.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmadd.w       \out0,    vr22,     \in3
    vmadd.w       \out1,    vr23,     \in3
.endm

.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmsub.w       \out0,    vr22,     \in3
    vmsub.w       \out1,    vr23,     \in3
.endm

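// inv_dct16_lsx builds the 16-point inverse DCT in vr0-vr15: the even half
// goes through inv_dct8_lsx, the odd half (vr1, vr3, ..., vr15) through the
// four rotations from idct_coeffs_h, the (1567, 3784) stage and the 2896
// half-butterflies, followed by the final c[0..15] output butterflies.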
.macro inv_dct16_lsx sz
    inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz

    la.local      t0,       idct_coeffs_h
    vldrepl.h     vr20,     t0,       16        // 401
    vldrepl.h     vr21,     t0,       18        // 4076
    vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz
    vssrarni.h.w  vr17,     vr16,     12        // t15a
    vssrarni.h.w  vr19,     vr18,     12        // t8a
    vldrepl.h     vr20,     t0,       20        // 3166 -> 1583
    vldrepl.h     vr21,     t0,       22        // 2598 -> 1299
    vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz
    vssrarni.h.w  vr16,     vr1,      12        // t14a
    vssrarni.h.w  vr18,     vr15,     12        // t9a
    vldrepl.h     vr20,     t0,       24        // 1931
    vldrepl.h     vr21,     t0,       26        // 3612
    vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz
    vssrarni.h.w  vr1,      vr7,      12        // t13a
    vssrarni.h.w  vr15,     vr9,      12        // t10a
    vldrepl.h     vr20,     t0,       28        // 3920
    vldrepl.h     vr21,     t0,       30        // 1189
    vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz
    vssrarni.h.w  vr7,      vr5,      12        // t12a
    vssrarni.h.w  vr9,      vr11,     12        // t11a

    vsadd.h       vr5,      vr19,     vr18     // t8
    vssub.h       vr11,     vr19,     vr18     // t9
    vssub.h       vr3,      vr9,      vr15     // t10
    vsadd.h       vr13,     vr9,      vr15     // t11
    vsadd.h       vr18,     vr7,      vr1      // t12
    vssub.h       vr19,     vr7,      vr1      // t13
    vssub.h       vr9,      vr17,     vr16     // t14
    vsadd.h       vr15,     vr17,     vr16     // t15

    vldrepl.h     vr20,     t0,       4        // 1567
    vldrepl.h     vr21,     t0,       6        // 3784
    vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz
    vssrarni.h.w  vr16,     vr1,      12       // t14a
    vssrarni.h.w  vr17,     vr7,      12       // t9a

    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz
    vneg.w        vr1,      vr1
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr7,      vr11,     12       // t13a
    vssrarni.h.w  vr1,      vr9,      12       // t10a
    vsadd.h       vr9,      vr5,      vr13     // t8a
    vssub.h       vr11,     vr5,      vr13     // t11a
    vssub.h       vr3,      vr15,     vr18     // t12a
    vsadd.h       vr19,     vr15,     vr18     // t15a
    vsadd.h       vr5,      vr17,     vr1      // t9
    vssub.h       vr13,     vr17,     vr1      // t10
    vssub.h       vr15,     vr16,     vr7      // t13
    vsadd.h       vr18,     vr16,     vr7      // t14

    vldrepl.h     vr20,     t0,       0        // 2896
    vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz
    vssrarni.h.w  vr7,      vr1,      12       // t13a
    vssrarni.h.w  vr16,     vr17,     12       // t10a

    vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz
    vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz
    vssrarni.h.w  vr23,     vr13,     12       // t12
    vssrarni.h.w  vr17,     vr15,     12       // t11

    vssub.h       vr15,     vr0,     vr19      // c[15]
    vsadd.h       vr0,      vr0,     vr19      // c[0]
    vsadd.h       vr1,      vr2,     vr18      // c[1]
    vssub.h       vr20,     vr2,     vr18      // c[14]
    vsadd.h       vr2,      vr4,     vr7       // c[2]
    vssub.h       vr13,     vr4,     vr7       // c[13]
    vsadd.h       vr3,      vr6,     vr23      // c[3]
    vssub.h       vr21,     vr6,     vr23      // c[12]
    vsadd.h       vr4,      vr8,     vr17      // c[4]
    vssub.h       vr11,     vr8,     vr17      // c[11]
    vsadd.h       vr7,      vr14,    vr9       // c[7]
    vssub.h       vr8,      vr14,    vr9       // c[8]
    vsadd.h       vr6,      vr12,    vr5       // c[6]
    vssub.h       vr9,      vr12,    vr5       // c[9]
    vsadd.h       vr5,      vr10,    vr16      // c[5]
    vssub.h       vr10,     vr10,    vr16      // c[10]
    vor.v         vr14,     vr20,    vr20
    vor.v         vr12,     vr21,    vr21
.endm

functionl inv_dct_8h_x16_lsx
    inv_dct16_lsx .8h
endfuncl

functionl inv_dct_4h_x16_lsx
    inv_dct16_lsx .4h
endfuncl

.macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6, in7
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W4 \in0, \in1

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in2, \in3

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in4, \in5

    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W4 \in6, \in7
.endm

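// def_fn_4x16_base emits the shared 4x16 driver.  a3 carries the eob and t5
// the per-function eob_half threshold: when eob >= eob_half the coefficient
// rows at offsets 16/48/80/112 get their own first pass, otherwise the
// corresponding intermediate registers are simply zeroed before the 16-point
// second pass runs.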
.macro def_fn_4x16_base txfm
functionl inv_txfm_\txfm\()add_4x16_lsx
    PUSH_REG
    blt           a3,       t5,       416f
    vld           vr0,      a2,       16
    vld           vr1,      a2,       48
    vld           vr2,      a2,       80
    vld           vr3,      a2,       112
    vxor.v        vr23,     vr23,     vr23
.irp i, 16, 48, 80, 112
    vst           vr23,     a2,       \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

.ifnc \txfm, identity_
    vsrari.h      vr0,      vr0,      1
    vsrari.h      vr1,      vr1,      1
    vsrari.h      vr2,      vr2,      1
    vsrari.h      vr3,      vr3,      1
.endif

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \
                       vr27, vr14, vr28, vr10, vr11, vr12, vr13

416:
    ble           t5,       a3,       416416f
.irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28
    vxor.v        \i,       \i,       \i
.endr

416416:
    vld           vr0,      a2,       0
    vld           vr1,      a2,       32
    vld           vr2,      a2,       64
    vld           vr3,      a2,       96
    vxor.v        vr23,     vr23,     vr23
.irp i, 0, 32, 64, 96
    vst           vr23,     a2,       \i
.endr

    move          t6,       ra
    jirl          ra,       t7,       0
    move          ra,       t6

.ifnc \txfm, identity_
    vsrari.h      vr0,      vr0,      1
    vsrari.h      vr1,      vr1,      1
    vsrari.h      vr2,      vr2,      1
    vsrari.h      vr3,      vr3,      1
.endif

    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
                       vr6, vr7, vr16, vr17, vr18, vr19

    vor.v         vr10,     vr24,     vr24
    vor.v         vr11,     vr25,     vr25
    vor.v         vr12,     vr26,     vr26
    vor.v         vr13,     vr27,     vr27
    vor.v         vr15,     vr28,     vr28

    move          t6,       ra
    jirl          ra,       t8,       0
    move          ra,       t6

    vilvl.d       vr16,     vr1,      vr0
    vilvl.d       vr17,     vr3,      vr2
    vilvl.d       vr18,     vr5,      vr4
    vilvl.d       vr19,     vr7,      vr6
    vilvl.d       vr20,     vr9,      vr8
    vilvl.d       vr21,     vr11,     vr10
    vilvl.d       vr22,     vr13,     vr12
    vilvl.d       vr23,     vr15,     vr14

.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vsrari.h     \i,       \i,       4
.endr

    VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    POP_REG
endfuncl
.endm

def_fn_4x16_base identity_
def_fn_4x16_base

.macro fn4x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx
.ifc \txfm1\()_\txfm2, dct_dct
    bnez          a3,       .NO_HAS_DCONLY_4x16

    idct_dc 4, 16, 1

    DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20

.rept 3
    add.d         a0,       a1,       a0
    alsl.d        t2,       a1,       a0,   1

    VLD_DST_ADD_W4 vr20, vr20
.endr
    b             .\txfm1\()_\txfm2\()_4X16_END

.NO_HAS_DCONLY_4x16:
.endif
    li.w          t5,       \eob_half
    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
.ifc \txfm1, identity
    la.local      t7,       inv_\txfm1\()_8h_x4_lsx1
.endif
    la.local      t8,       inv_\txfm2\()_4h_x16_lsx

.ifc \txfm1, identity
    b             inv_txfm_identity_add_4x16_lsx
.else
    b             inv_txfm_add_4x16_lsx
.endif
.\txfm1\()_\txfm2\()_4X16_END:
endfunc
.endm

fn4x16 dct, dct, 29
fn4x16 identity, identity, 29
fn4x16 dct, adst, 29
fn4x16 dct, flipadst, 29
fn4x16 dct, identity, 8
fn4x16 adst, dct, 29
fn4x16 adst, adst, 29
fn4x16 adst, flipadst, 29
fn4x16 flipadst, dct, 29
fn4x16 flipadst, adst, 29
fn4x16 flipadst, flipadst, 29
fn4x16 identity, dct, 32
fn4x16 adst, identity, 8
fn4x16 flipadst, identity, 8
fn4x16 identity, adst, 32
fn4x16 identity, flipadst, 32

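// inv_identity16_lsx applies the IDTX16 scale of 2*sqrt(2) per coefficient:
// out = satadd(2*in, (in*1697 + 1024) >> 11), using 32-bit intermediates for
// the .8h case and only the low four lanes for .4h.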
.macro inv_identity16_lsx in0, in1, in2, out0, sz
.ifc \sz, .8h
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in0
    vmul.w        vr16,     vr16,     \in1
    vmul.w        vr17,     vr17,     \in1
    vsadd.h       \in2,     \in2,     \in2
    vssrarni.h.w  vr17,     vr16,     11
    vsadd.h       \out0,    vr17,     \in2
.else
    vsllwil.w.h   vr16,     \in0,     0
    vmul.w        vr16,     vr16,     \in1
    vsadd.h       \in2,     \in2,     \in2
    vssrarni.h.w  vr16,     vr16,     11
    vsadd.h       \out0,    vr16,     \in2
.endif
.endm

.macro inv_identity16_lsx1 in0, in1, in2, out0
    vsllwil.w.h   vr16,     \in0,     0
    vexth.w.h     vr17,     \in1
    vmul.w        vr18,     vr16,     \in2
    vmul.w        vr19,     vr17,     \in2
    vsrari.w      vr18,     vr18,     11
    vsrari.w      vr19,     vr19,     11
    vslli.w       vr16,     vr16,     1
    vslli.w       vr17,     vr17,     1
    vadd.w        vr16,     vr18,     vr16
    vadd.w        \out0,    vr19,     vr17
    vssrarni.h.w  \out0,    vr16,     1
.endm

functionl inv_identity_8h_x16_lsx
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx \i, vr20, \i, \i, .8h
.endr
endfuncl

functionl inv_identity_4h_x16_lsx
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx \i, vr20, \i, \i, .4h
.endr
endfuncl

functionl inv_identity_8h_x16_lsx1
    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
    vr9, vr10, vr11, vr12, vr13, vr14, vr15
    inv_identity16_lsx1 \i, \i, vr20, \i
.endr
endfuncl

const iadst16_coeffs_h, align=4
    .short         4091, 201, 3973, 995
    .short         3703, 1751, 3290, 2440
    .short         2751, 3035, 2106, 3513
    .short         1380, 3857, 601, 4052
endconst

.macro inv_adst16_lsx txfm, sz
    la.local      t0,       iadst16_coeffs_h
    vldrepl.h     vr20,     t0,        0        // 4091
    vldrepl.h     vr21,     t0,        2        // 201
    vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz
    vssrarni.h.w  vr18,     vr16,      12       // t0
    vssrarni.h.w  vr19,     vr17,      12       // t1
    vldrepl.h     vr20,     t0,        4        // 3973
    vldrepl.h     vr21,     t0,        6        // 995
    vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz
    vssrarni.h.w  vr0,      vr16,      12       // t2
    vssrarni.h.w  vr15,     vr17,      12       // t3
    vldrepl.h     vr20,     t0,        8        // 3703
    vldrepl.h     vr21,     t0,        10       // 1751
    vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz
    vssrarni.h.w  vr2,      vr16,      12       // t4
    vssrarni.h.w  vr13,     vr17,      12       // t5
    vldrepl.h     vr20,     t0,        12       // 3290 -> 1645
    vldrepl.h     vr21,     t0,        14       // 2440 -> 1220
    vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz
    vssrarni.h.w  vr4,      vr16,      12       // t6
    vssrarni.h.w  vr11,     vr17,      12       // t7
    vldrepl.h     vr20,     t0,        16       // 2751
    vldrepl.h     vr21,     t0,        18       // 3035
    vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz
    vssrarni.h.w  vr6,      vr16,      12       // t8
    vssrarni.h.w  vr9,      vr17,      12       // t9
    vldrepl.h     vr20,     t0,        20       // 2106
    vldrepl.h     vr21,     t0,        22       // 3513
    vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz
    vssrarni.h.w  vr7,      vr16,      12       // t10
    vssrarni.h.w  vr8,      vr17,      12       // t11
    vldrepl.h     vr20,     t0,        24       // 1380
    vldrepl.h     vr21,     t0,        26       // 3857
    vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz
    vssrarni.h.w  vr5,      vr16,      12       // t12
    vssrarni.h.w  vr10,     vr17,      12       // t13
    vldrepl.h     vr20,     t0,        28       // 601
    vldrepl.h     vr21,     t0,        30       // 4052
    vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz
    vssrarni.h.w  vr3,      vr16,      12       // t14
    vssrarni.h.w  vr12,     vr17,      12       // t15

    vsadd.h       vr1,      vr18,      vr6      // t0a
    vssub.h       vr14,     vr18,      vr6      // t8a
    vsadd.h       vr16,     vr19,      vr9      // t1a
    vssub.h       vr17,     vr19,      vr9      // t9a
    vsadd.h       vr6,      vr0,       vr7      // t2a
    vssub.h       vr18,     vr0,       vr7      // t10a
    vsadd.h       vr9,      vr15,      vr8      // t3a
    vssub.h       vr19,     vr15,      vr8      // t11a
    vsadd.h       vr0,      vr2,       vr5      // t4a
    vssub.h       vr7,      vr2,       vr5      // t12a
    vsadd.h       vr8,      vr13,      vr10     // t5a
    vssub.h       vr15,     vr13,      vr10     // t13a
    vsadd.h       vr2,      vr4,       vr3      // t6a
    vssub.h       vr5,      vr4,       vr3      // t14a
    vsadd.h       vr10,     vr11,      vr12     // t7a
    vssub.h       vr13,     vr11,      vr12     // t15a

    la.local      t0,       idct_coeffs_h

    vldrepl.h     vr20,     t0,        8        // 799
    vldrepl.h     vr21,     t0,        10       // 4017
    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz
    vssrarni.h.w  vr11,     vr3,       12       // t8
    vssrarni.h.w  vr12,     vr4,       12       // t9
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz
    vssrarni.h.w  vr14,     vr3,       12       // t13
    vssrarni.h.w  vr17,     vr4,       12       // t12
    vldrepl.h     vr20,     t0,        12       // 3406
    vldrepl.h     vr21,     t0,        14       // 2276
    vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz
    vssrarni.h.w  vr7,      vr3,       12       // t10
    vssrarni.h.w  vr15,     vr4,       12       // t11
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz
    vssrarni.h.w  vr18,     vr3,       12       // t15
    vssrarni.h.w  vr19,     vr4,       12       // t14

    vsadd.h       vr5,      vr1,       vr0      // t0
    vssub.h       vr13,     vr1,       vr0      // t4
    vsadd.h       vr3,      vr16,      vr8      // t1
    vssub.h       vr4,      vr16,      vr8      // t5
    vsadd.h       vr0,      vr6,       vr2      // t2
    vssub.h       vr1,      vr6,       vr2      // t6
    vsadd.h       vr8,      vr9,       vr10     // t3
    vssub.h       vr16,     vr9,       vr10     // t7
    vsadd.h       vr2,      vr11,      vr17     // t8a
    vssub.h       vr6,      vr11,      vr17     // t12a
    vsadd.h       vr9,      vr12,      vr14     // t9a
    vssub.h       vr10,     vr12,      vr14     // t13a
    vsadd.h       vr11,     vr7,       vr19     // t10a
    vssub.h       vr17,     vr7,       vr19     // t14a
    vsadd.h       vr12,     vr15,      vr18     // t11a
    vssub.h       vr14,     vr15,      vr18     // t15a

    vldrepl.h     vr20,     t0,        4        // 1567
    vldrepl.h     vr21,     t0,        6        // 3784
    vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz
    vssrarni.h.w  vr18,     vr7,       12       // t4a
    vssrarni.h.w  vr19,     vr15,      12       // t5a
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz
    vssrarni.h.w  vr4,      vr7,       12       // t7a
    vssrarni.h.w  vr13,     vr15,      12       // t6a
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz
    vssrarni.h.w  vr1,      vr7,       12       // t12
    vssrarni.h.w  vr16,     vr15,      12       // t13
    vneg.h        vr21,     vr21
    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz
    vneg.h        vr20,     vr20
    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz
    vssrarni.h.w  vr6,      vr7,       12       // t15
    vssrarni.h.w  vr10,     vr15,      12       // t14

    vssub.h       vr17,     vr5,       vr0      // t2a
    vsadd.h       vr14,     vr5,       vr0      // out[0]
    vssub.h       vr7,      vr3,       vr8      // t3a
    vsadd.h       vr15,     vr3,       vr8      // out[15]
    vsllwil.w.h   vr22,     vr15,      0
    vexth.w.h     vr15,     vr15
    vneg.w        vr22,     vr22
    vneg.w        vr15,     vr15
    vssrarni.h.w  vr15,     vr22,      0        // out[15]

    vsadd.h       vr3,      vr19,      vr4      // out[12]
    vssub.h       vr8,      vr19,      vr4      // t7
    vssub.h       vr0,      vr18,      vr13     // t6
    vsadd.h       vr5,      vr18,      vr13     // out[3]
    vsllwil.w.h   vr22,     vr5,       0
    vexth.w.h     vr5,      vr5
    vneg.w        vr22,     vr22
    vneg.w        vr5,      vr5
    vssrarni.h.w  vr5,      vr22,      0        // out[3]

    vsadd.h       vr13,     vr9,       vr12     // out[14]
    vssub.h       vr19,     vr9,       vr12     // t11
    vssub.h       vr4,      vr2,       vr11     // t10
    vsadd.h       vr18,     vr2,       vr11     // out[1]
    vsllwil.w.h   vr22,     vr18,      0
    vexth.w.h     vr18,     vr18
    vneg.w        vr22,     vr22
    vneg.w        vr18,     vr18
    vssrarni.h.w  vr18,     vr22,      0        // out[1]

    vsadd.h       vr2,      vr1,       vr10     // out[2]
    vssub.h       vr11,     vr1,       vr10     // t14a
    vssub.h       vr12,     vr16,      vr6      // t15a
    vsadd.h       vr9,      vr16,      vr6      // out[13]
    vsllwil.w.h   vr22,     vr9,       0
    vexth.w.h     vr9,      vr9
    vneg.w        vr22,     vr22
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr9,      vr22,      0        // out[13]

    vldrepl.h     vr20,     t0,        0        // 2896
    vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz
    vneg.h        vr21,     vr20
    vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz
    vssrarni.h.w  vr1,      vr16,      12       // out[8]
    vsrari.w      vr6,      vr6,       12
    vsrari.w      vr10,     vr10,      12
    vneg.w        vr6,      vr6
    vneg.w        vr10,     vr10
    vssrarni.h.w  vr10,     vr6,       0        // out[7]
    vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz
    vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz
    vssrarni.h.w  vr7,      vr6,       12       // out[4]
    vsrari.w      vr16,     vr16,      12
    vsrari.w      vr17,     vr17,      12
1532    vneg.w        vr16,     vr16
1533    vneg.w        vr17,     vr17
1534    vssrarni.h.w  vr17,     vr16,      0        // out[11]
1535
1536    vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz
1537    vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz
1538    vssrarni.h.w  vr8,      vr6,       12       // out[6]
1539    vsrari.w      vr16,     vr16,      12
1540    vsrari.w      vr0,      vr0,       12
1541    vneg.w        vr16,     vr16
1542    vneg.w        vr0,      vr0
1543    vssrarni.h.w  vr0,      vr16,      0        // out[9]
1544
1545    vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz
1546    vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz
1547    vssrarni.h.w  vr19,     vr16,      12       // out[10]
1548    vsrari.w      vr6,      vr6,       12
1549    vsrari.w      vr4,      vr4,       12
1550    vneg.w        vr6,      vr6
1551    vneg.w        vr4,      vr4
1552    vssrarni.h.w  vr4,      vr6,       0        // out[5]
1553
1554.ifc \txfm, adst
1555    vor.v         vr12,     vr3,       vr3
1556    vor.v         vr3,      vr5,       vr5
1557    vor.v         vr5,      vr4,       vr4
1558    vor.v         vr4,      vr7,       vr7
1559    vor.v         vr7,      vr10,      vr10
1560    vor.v         vr10,     vr19,      vr19
1561    vor.v         vr6,      vr8,       vr8
1562    vor.v         vr8,      vr1,       vr1
1563    vor.v         vr11,     vr17,      vr17
1564    vor.v         vr20,     vr13,      vr13
1565    vor.v         vr13,     vr9,       vr9
1566    vor.v         vr9,      vr0,       vr0
1567    vor.v         vr0,      vr14,      vr14
1568    vor.v         vr14,     vr20,      vr20
1569    vor.v         vr1,      vr18,      vr18
1570.else
1571    vor.v         vr6,      vr0,       vr0
1572    vor.v         vr0,      vr15,      vr15
1573    vor.v         vr15,     vr14,      vr14
1574    vor.v         vr14,     vr18,      vr18
1575    vor.v         vr11,     vr7,       vr7
1576    vor.v         vr7,      vr1,       vr1
1577    vor.v         vr1,      vr13,      vr13
1578    vor.v         vr13,     vr2,       vr2
1579    vor.v         vr2,      vr9,       vr9
1580    vor.v         vr9,      vr8,       vr8
1581    vor.v         vr8,      vr10,      vr10
1582    vor.v         vr10,     vr4,       vr4
1583    vor.v         vr4,      vr17,      vr17
1584    vor.v         vr12,     vr5,       vr5
1585    vor.v         vr5,      vr19,      vr19
1586.endif
1587.endm // inv_adst16_lsx
1588
1589functionl inv_adst_8h_x16_lsx
1590    inv_adst16_lsx adst, 8h
1591endfuncl
1592
1593functionl inv_flipadst_8h_x16_lsx
1594    inv_adst16_lsx flipadst, 8h
1595endfuncl
1596
1597functionl inv_adst_4h_x16_lsx
1598    inv_adst16_lsx adst, 4h
1599endfuncl
1600
1601functionl inv_flipadst_4h_x16_lsx
1602    inv_adst16_lsx flipadst, 4h
1603endfuncl
1604
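// Add an 8x16 block to the destination: four VLD_DST_ADD_W8 calls of four
// rows each, advancing a0/t2 between the groups.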
1605.macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \
1606                         in9, in10, in11, in12, in13, in14, in15
1607
1608    alsl.d        t2,       a1,       a0,    1
1609    VLD_DST_ADD_W8 \in0, \in1, \in2, \in3
1610
1611    add.d         a0,       a1,       a0
1612    alsl.d        t2,       a1,       a0,    1
1613    VLD_DST_ADD_W8 \in4, \in5, \in6, \in7
1614
1615    add.d         a0,       a1,       a0
1616    alsl.d        t2,       a1,       a0,    1
1617    VLD_DST_ADD_W8 \in8, \in9, \in10, \in11
1618
1619    add.d         a0,       a1,       a0
1620    alsl.d        t2,       a1,       a0,    1
1621    VLD_DST_ADD_W8 \in12, \in13, \in14, \in15
1622.endm
1623
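// Shared 8x16 second-stage body. Expects a2 = coeffs, a3 = eob,
// t5 = eob_half threshold, t7 = first-pass transform (non-identity variant),
// t8 = second-pass 8h x16 transform. When a3 < t5 the second half of the
// coefficients is skipped and treated as zero.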
1624.macro def_base_8x16 txfm1
1625functionl inv_txfm_\txfm1\()add_8x16_lsx
1626    blt     a3,    t5,   816f
1627    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1628    vxor.v        vr23,     vr23,     vr23
1629.irp i, 16, 48, 80, 112, 144, 176, 208, 240
1630    vst           vr23,     a2,       \i
1631.endr
1632
1633    li.w          t0,       2896
1634    vreplgr2vr.w  vr23,     t0
1635.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1636    rect2_lsx     \i,       vr23,     \i
1637.endr
1638
1639.ifc \txfm1, identity_
1640    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1641                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1642                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1643.else
1644    move          t6,       ra
1645    jirl          ra,       t7,       0
1646    move          ra,       t6
1647
1648    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1649                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1
1650
1651    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1652                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1653                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1654.endif
1655
1656816:
1657    ble       t5,    a3,  816816f
1658.irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1659    vxor.v    \i,  \i,  \i
1660.endr
1661
1662816816:
1663    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1664    vxor.v        vr23,     vr23,     vr23
1665.irp i, 0, 32, 64, 96, 128, 160, 192, 224
1666    vst           vr23,     a2,       \i
1667.endr
1668
1669    li.w          t0,       2896
1670    vreplgr2vr.w  vr23,     t0
1671.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1672    rect2_lsx     \i,       vr23,     \i
1673.endr
1674
1675.ifc \txfm1, identity_
1676
1677.else
1678    move          t6,       ra
1679    jirl          ra,       t7,       0
1680    move          ra,       t6
1681
1682.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1683    vsrari.h      \i,       \i,       1
1684.endr
1685.endif
1686
1687    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1688                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1689                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1690
1691    move          t6,       ra
1692    jirl          ra,       t8,       0
1693    move          ra,       t6
1694
1695    vor.v   vr0, vr0, vr0
1696    vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1697                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
1698    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1699                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
1700
1701    VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1702                      vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1703endfuncl
1704.endm
1705
1706def_base_8x16 identity_
1707def_base_8x16
1708
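// Add eight 8h coefficient vectors (low/high halves of four 16-wide rows)
// to the destination rows at a0, a0+a1, t2 and t2+a1 with unsigned
// saturation.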
1709.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
1710    vsllwil.hu.bu vr4,      \in0,     0
1711    vexth.hu.bu   vr0,      \in0
1712    vsllwil.hu.bu vr5,      \in1,     0
1713    vexth.hu.bu   vr1,      \in1
1714    vsllwil.hu.bu vr6,      \in2,     0
1715    vexth.hu.bu   vr2,      \in2
1716    vsllwil.hu.bu vr7,      \in3,     0
1717    vexth.hu.bu   vr3,      \in3
1718    vadd.h        vr4,      vr4,      \in4
1719    vadd.h        vr0,      vr0,      \in5
1720    vadd.h        vr5,      vr5,      \in6
1721    vadd.h        vr1,      vr1,      \in7
1722    vadd.h        vr6,      vr6,      \in8
1723    vadd.h        vr2,      vr2,      \in9
1724    vadd.h        vr7,      vr7,      \in10
1725    vadd.h        vr3,      vr3,      \in11
1726    vssrani.bu.h  vr0,      vr4,      0
1727    vssrani.bu.h  vr1,      vr5,      0
1728    vssrani.bu.h  vr2,      vr6,      0
1729    vssrani.bu.h  vr3,      vr7,      0
1730    vst           vr0,      a0,       0
1731    vstx          vr1,      a0,       a1
1732    vst           vr2,      t2,       0
1733    vstx          vr3,      t2,       a1
1734.endm
1735
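// Load four 16-pixel destination rows and add the given coefficients.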
1736.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7
1737    vld           vr0,      a0,       0
1738    vldx          vr1,      a0,       a1
1739    vld           vr2,      t2,       0
1740    vldx          vr3,      t2,       a1
1741    DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
1742                \in4, \in5, \in6, \in7
1743.endm
1744
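// Shared 16x8 body: a2 = coeffs, t7 = first-pass 8h x16 transform,
// t8 = second-pass 8h x8 transform. Inputs get the rect2 scale (2896)
// before the first pass since 16x8 is a rectangular transform.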
1745.macro def_fn_16x8 txfm1
1746functionl inv_txfm_\txfm1\()add_16x8_lsx
1747    PUSH_REG
1748
1749    vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1750            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1751    vxor.v        vr23,     vr23,     vr23
1752.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \
1753    176, 192, 208, 224, 240
1754    vst           vr23,     a2,       \i
1755.endr
1756
1757    li.w          t0,       2896
1758    vreplgr2vr.w  vr23,     t0
1759.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1760    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1761    rect2_lsx     \i,       vr23,     \i
1762.endr
1763
1764    move          t6,       ra
1765    jirl          ra,       t7,       0
1766    move          ra,       t6
1767
1768.ifnc \txfm1, identity_
1769.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1770    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1771    vsrari.h       \i,       \i,       1
1772.endr
1773.endif
1774
1775    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1776                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1777                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1778
1779    move          t6,       ra
1780    jirl          ra,       t8,       0
1781    move          ra,       t6
1782
1783    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1784                vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4
1785
1786    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1787                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1788                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1789    move          t6,       ra
1790    jirl          ra,       t8,       0
1791    move          ra,       t6
1792
1793    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1794                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
1795
1796    alsl.d        t2,       a1,       a0,    1
1797    VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11
1798
1799    alsl.d        a0,       a1,       a0,    2
1800    alsl.d        t2,       a1,       a0,    1
1801    VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15
1802
1803    POP_REG
1804endfuncl
1805.endm
1806
1807def_fn_16x8 identity_
1808def_fn_16x8
1809
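// Entry points for the 16x8 transform pairs: dct_dct gets a dc-only fast
// path, everything else loads t7/t8 and tails into the shared 16x8 body.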
1810.macro fun16x8 txfm1, txfm2
1811function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx
1812.ifc \txfm1\()_\txfm2, dct_dct
1813    bnez          a3,       .NO_HAS_DCONLY_16x8
1814
1815    idct_dc 16, 8, 1
1816
1817    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
1818                vr20, vr20, vr20, vr20, vr20
1819
1820    alsl.d        a0,       a1,       a0,     2
1821    alsl.d        t2,       a1,       a0,     1
1822    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
1823    b             .\txfm1\()_\txfm2\()_16x8_END
1824.NO_HAS_DCONLY_16x8:
1825.endif
1826
1827    la.local     t7,    inv_\txfm1\()_8h_x16_lsx
1828.ifc \txfm1, identity
1829    la.local     t7,    inv_identity_8h_x16_lsx1
1830.endif
1831
1832    la.local     t8,    inv_\txfm2\()_8h_x8_lsx
1833
1834.ifc \txfm1, identity
1835    b            inv_txfm_identity_add_16x8_lsx
1836.else
1837    b            inv_txfm_add_16x8_lsx
1838.endif
1839
1840.\txfm1\()_\txfm2\()_16x8_END:
1841endfunc
1842.endm
1843
1844fun16x8 dct, dct
1845fun16x8 identity, identity
1846fun16x8 dct, adst
1847fun16x8 dct, flipadst
1848fun16x8 dct, identity
1849fun16x8 adst, dct
1850fun16x8 adst, adst
1851fun16x8 adst, flipadst
1852fun16x8 flipadst, dct
1853fun16x8 flipadst, adst
1854fun16x8 flipadst, flipadst
1855fun16x8 identity, dct
1856fun16x8 adst, identity
1857fun16x8 flipadst, identity
1858fun16x8 identity, adst
1859fun16x8 identity, flipadst
1860
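// Entry points for the 8x16 transform pairs; \eob_half is the threshold
// loaded into t5 for the shared 8x16 body.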
1861.macro fun8x16 txfm1, txfm2, eob_half
1862function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx
1863.ifc \txfm1\()_\txfm2, dct_dct
1864    bnez          a3,       .NO_HAS_DCONLY_8x16
1865
1866    idct_dc 8, 16, 1
1867
1868    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
1869.rept 3
1870    add.d         a0,       a1,       a0
1871    alsl.d        t2,       a1,       a0,     1
1872    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
1873.endr
1874
1875    b             .\txfm1\()_\txfm2\()_8x16_END
1876.NO_HAS_DCONLY_8x16:
1877.endif
1878    li.w         t5,    \eob_half
1879.ifnc \txfm1, identity
1880    la.local     t7,    inv_\txfm1\()_8h_x8_lsx
1881.endif
1882
1883    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
1884.ifc \txfm1, identity
1885    b            inv_txfm_identity_add_8x16_lsx
1886.else
1887    b            inv_txfm_add_8x16_lsx
1888.endif
1889.\txfm1\()_\txfm2\()_8x16_END:
1890endfunc
1891.endm
1892
1893fun8x16 dct, dct, 43
1894fun8x16 identity, identity, 43
1895fun8x16 dct, adst, 43
1896fun8x16 dct, flipadst, 43
1897fun8x16 dct, identity, 8
1898fun8x16 adst, dct, 43
1899fun8x16 adst, adst, 43
1900fun8x16 adst, flipadst, 43
1901fun8x16 flipadst, dct, 43
1902fun8x16 flipadst, adst, 43
1903fun8x16 flipadst, flipadst, 43
1904fun8x16 identity, dct, 64
1905fun8x16 adst, identity, 8
1906fun8x16 flipadst, identity, 8
1907fun8x16 identity, adst, 64
1908fun8x16 identity, flipadst, 64
1909
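// Shared 16x16 body: the first pass (t7) writes transposed, >>2 results to
// a 512-byte scratch buffer on the stack; the second pass (t8) transforms
// the scratch in place, and the results are >>4 and added to the
// destination four 16-pixel rows at a time.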
1910functionl inv_txfm_add_16x16_lsx
1911    malloc_space 512
1912
1913    addi.d        t1,       sp,       64
1914    addi.d        t2,       a2,       0
1915.rept 2
1916    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1917            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1918
1919    vxor.v        vr23,     vr23,     vr23
1920.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \
1921    384, 416, 448, 480
1922    vst           vr23,     a2,       \i
1923.endr
1924
1925    move          t6,       ra
1926    jirl          ra,       t7,       0
1927    move          ra,       t6
1928
1929    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1930                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1931                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1932
1933    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1934                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
1935                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1936
1937.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1938    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1939    vsrari.h       \i,       \i,       2
1940.endr
1941    vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
1942    vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1943    addi.d         t1,       t1,       256
1944    addi.d         a2,       a2,       16
1945    blt            a3,       t5,       1616f
1946.endr
1947
19481616:
1949    ble           t5,       a3,       16161616f
1950    addi.d        t1,       sp,       320
1951    vxor.v        vr23,     vr23,     vr23
1952.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
1953    240
1954    vst           vr23,     t1,       \i
1955.endr
1956
195716161616:
1958    addi.d        t1,       sp,       64
1959.rept 2
1960    vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1961            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1962
1963    move          t6,       ra
1964    jirl          ra,       t8,       0
1965    move          ra,       t6
1966
1967    vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
1968            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
1969
1970    addi.d        t1,       t1,       16
1971.endr
1972    alsl.d        t2,       a1,       a0,    1
1973    addi.d        t1,       sp,       64
1974.rept 4
1975    vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1976    vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
1977                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
1978    VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
1979    alsl.d        a0,       a1,       a0,    2
1980    alsl.d        t2,       a1,       a0,    1
1981    addi.d        t1,       t1,       128
1982.endr
1983    free_space 512
1984endfuncl
1985
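// Entry points for the 16x16 transform pairs; both passes use the 8h x16
// transforms, and \eob_half (t5) decides whether the second half of the
// coefficients needs the first pass.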
1986.macro fun16x16 txfm1, txfm2, eob_half
1987function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx
1988.ifc \txfm1\()_\txfm2, dct_dct
1989    bnez          a3,       .NO_HAS_DCONLY_16x16
1990
1991    idct_dc 16, 16, 2
1992
1993    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
1994                vr20, vr20, vr20, vr20, vr20
1995.rept 3
1996    alsl.d        a0,       a1,       a0,     2
1997    alsl.d        t2,       a1,       a0,     1
1998
1999    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2000.endr
2001    b             .\txfm1\()_\txfm2\()_16x16_END
2002.NO_HAS_DCONLY_16x16:
2003.endif
2004    li.w         t5,    \eob_half
2005    la.local     t7,    inv_\txfm1\()_8h_x16_lsx
2006    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
2007
2008    b            inv_txfm_add_16x16_lsx
2009.\txfm1\()_\txfm2\()_16x16_END:
2010endfunc
2011.endm
2012
2013fun16x16 dct, dct, 36
2014fun16x16 adst, adst, 36
2015fun16x16 adst, dct, 36
2016fun16x16 dct, adst, 36
2017fun16x16 flipadst, dct, 36
2018fun16x16 dct, flipadst, 36
2019fun16x16 adst, flipadst, 36
2020fun16x16 flipadst, adst, 36
2021
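// Second half of the 32-point DCT: the odd-index inputs are already in
// registers, the even-half (16-point DCT) results are loaded from \in2 at
// \vld_st0/\vld_st1, and the 32 outputs are stored to \in1 at the four
// \vst_st* offsets, optionally transposed per 8x8 block and right-shifted
// by \shift.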
2022.macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \
2023                         vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \
2024                         transpose8x8, shift
2025    la.local      t0,       idct_coeffs
2026    vldrepl.w     vr20,     t0,       64           // 201
2027    vldrepl.w     vr21,     t0,       68           // 4091
2028    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
2029    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
2030    vssrarni.h.w  vr9,      vr8,      12           // t31a
2031    vssrarni.h.w  vr10,     vr11,     12           // t16a
2032    vldrepl.w     vr20,     t0,       72           // 3035
2033    vldrepl.w     vr21,     t0,       76           // 2751
2034    vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
2035    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
2036    vssrarni.h.w  vr0,      vr8,      12           // t30a
2037    vssrarni.h.w  vr30,     vr11,     12           // t17a
2038    vldrepl.w     vr20,     t0,       80           // 1751
2039    vldrepl.w     vr21,     t0,       84           // 3703
2040    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
2041    vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
2042    vssrarni.h.w  vr7,      vr8,      12           // t29a
2043    vssrarni.h.w  vr19,     vr11,     12           // t18a
2044    vldrepl.w     vr20,     t0,       88           // 3857
2045    vldrepl.w     vr21,     t0,       92           // 1380
2046    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
2047    vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
2048    vssrarni.h.w  vr4,      vr8,      12           // t28a
2049    vssrarni.h.w  vr26,     vr11,     12           // t19a
2050    vldrepl.w     vr20,     t0,       96           // 995
2051    vldrepl.w     vr21,     t0,       100          // 3973
2052    vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
2053    vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
2054    vssrarni.h.w  vr3,      vr8,      12           // t27a
2055    vssrarni.h.w  vr27,     vr11,     12           // t20a
2056    vldrepl.w     vr20,     t0,       104          // 3513
2057    vldrepl.w     vr21,     t0,       108          // 2106
2058    vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
2059    vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
2060    vssrarni.h.w  vr2,      vr8,      12           // t26a
2061    vssrarni.h.w  vr28,     vr11,     12           // t21a
2062    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
2063    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
2064    vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
2065    vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
2066    vssrarni.h.w  vr5,      vr8,      12           // t25a
2067    vssrarni.h.w  vr25,     vr11,     12           // t22a
2068    vldrepl.w     vr20,     t0,       120          // 4052
2069    vldrepl.w     vr21,     t0,       124          // 601
2070    vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
2071    vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
2072    vssrarni.h.w  vr6,      vr8,      12           // t24a
2073    vssrarni.h.w  vr24,     vr11,     12           // t23a
2074
2075    vsadd.h       vr1,      vr10,     vr30         // t16
2076    vssub.h       vr29,     vr10,     vr30         // t17
2077    vssub.h       vr8,      vr26,     vr19         // t18
2078    vsadd.h       vr31,     vr26,     vr19         // t19
2079    vsadd.h       vr10,     vr27,     vr28         // t20
2080    vssub.h       vr30,     vr27,     vr28         // t21
2081    vssub.h       vr19,     vr24,     vr25         // t22
2082    vsadd.h       vr26,     vr24,     vr25         // t23
2083    vsadd.h       vr27,     vr6,      vr5          // t24
2084    vssub.h       vr28,     vr6,      vr5          // t25
2085    vssub.h       vr24,     vr3,      vr2          // t26
2086    vsadd.h       vr25,     vr3,      vr2          // t27
2087    vsadd.h       vr5,      vr4,      vr7          // t28
2088    vssub.h       vr6,      vr4,      vr7          // t29
2089    vssub.h       vr2,      vr9,      vr0          // t30
2090    vsadd.h       vr3,      vr9,      vr0          // t31
2091
2092    vldrepl.w     vr20,     t0,       16           // 799
2093    vldrepl.w     vr21,     t0,       20           // 4017
2094    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
2095    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
2096    vssrarni.h.w  vr7,      vr4,      12           // t30a
2097    vssrarni.h.w  vr0,      vr11,     12           // t17a
2098    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
2099    vneg.w        vr4,      vr4
2100    vneg.w        vr9,      vr9
2101    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
2102    vssrarni.h.w  vr9,      vr4,      12           // t18a
2103    vssrarni.h.w  vr2,      vr11,     12           // t29a
2104    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
2105    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
2106    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
2107    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
2108    vssrarni.h.w  vr29,     vr4,      12           // t26a
2109    vssrarni.h.w  vr6,      vr11,     12           // t21a
2110    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
2111    vneg.w        vr4,      vr4
2112    vneg.w        vr8,      vr8
2113    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
2114    vssrarni.h.w  vr8,      vr4,      12           // t22a
2115    vssrarni.h.w  vr24,     vr11,     12           // t25a
2116
2117    vsadd.h       vr4,      vr1,      vr31         // t16a
2118    vssub.h       vr30,     vr1,      vr31         // t19a
2119    vsadd.h       vr19,     vr0,      vr9          // t17
2120    vssub.h       vr28,     vr0,      vr9          // t18
2121    vssub.h       vr1,      vr26,     vr10         // t20a
2122    vsadd.h       vr31,     vr26,     vr10         // t23a
2123    vssub.h       vr0,      vr8,      vr6          // t21
2124    vsadd.h       vr9,      vr8,      vr6          // t22
2125    vsadd.h       vr10,     vr27,     vr25         // t24a
2126    vssub.h       vr26,     vr27,     vr25         // t27a
2127    vsadd.h       vr6,      vr24,     vr29         // t25
2128    vssub.h       vr8,      vr24,     vr29         // t26
2129    vssub.h       vr25,     vr3,      vr5          // t28a
2130    vsadd.h       vr27,     vr3,      vr5          // t31a
2131    vssub.h       vr24,     vr7,      vr2          // t29
2132    vsadd.h       vr29,     vr7,      vr2          // t30
2133
2134    vldrepl.w     vr20,     t0,       8            // 1567
2135    vldrepl.w     vr21,     t0,       12           // 3784
2136    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
2137    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
2138    vssrarni.h.w  vr5,      vr3,      12           // t29a
2139    vssrarni.h.w  vr2,      vr11,     12           // t18a
2140    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
2141    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
2142    vssrarni.h.w  vr7,      vr3,      12           // t28
2143    vssrarni.h.w  vr24,     vr11,     12           // t19
2144    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
2145    vneg.w        vr3,      vr3
2146    vneg.w        vr28,     vr28
2147    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
2148    vssrarni.h.w  vr28,     vr3,      12           // t20
2149    vssrarni.h.w  vr25,     vr11,     12           // t27
2150    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
2151    vneg.w        vr3,      vr3
2152    vneg.w        vr30,     vr30
2153    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
2154    vssrarni.h.w  vr30,     vr3,      12           // t21a
2155    vssrarni.h.w  vr1,      vr11,     12           // t26a
2156
2157    vsadd.h       vr3,      vr4,      vr31         // t16
2158    vssub.h       vr26,     vr4,      vr31         // t23
2159    vsadd.h       vr0,      vr19,     vr9          // t17a
2160    vssub.h       vr8,      vr19,     vr9          // t22a
2161    vsadd.h       vr4,      vr2,      vr30         // t18
2162    vssub.h       vr31,     vr2,      vr30         // t21
2163    vsadd.h       vr9,      vr24,     vr28         // t19a
2164    vssub.h       vr19,     vr24,     vr28         // t20a
2165    vssub.h       vr2,      vr27,     vr10         // t24
2166    vsadd.h       vr30,     vr27,     vr10         // t31
2167    vssub.h       vr24,     vr29,     vr6          // t25a
2168    vsadd.h       vr28,     vr29,     vr6          // t30a
2169    vssub.h       vr10,     vr5,      vr1          // t26
2170    vsadd.h       vr27,     vr5,      vr1          // t29
2171    vssub.h       vr6,      vr7,      vr25         // t27a
2172    vsadd.h       vr29,     vr7,      vr25         // t28a
2173
2174    vldrepl.w     vr20,     t0,       0            // 2896
2175    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
2176    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
2177    vssrarni.h.w  vr5,      vr1,      12           // t20
2178    vssrarni.h.w  vr7,      vr11,     12           // t27
2179    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
2180    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
2181    vssrarni.h.w  vr25,     vr1,      12           // t21a
2182    vssrarni.h.w  vr6,      vr11,     12           // t26a
2183    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
2184    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
2185    vssrarni.h.w  vr19,     vr1,      12           // t22
2186    vssrarni.h.w  vr10,     vr11,     12           // t25
2187    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
2188    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
2189    vssrarni.h.w  vr31,     vr1,      12           // t23a
2190    vssrarni.h.w  vr8,      vr11,     12           // t24a
2191
2192    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
2193    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
2194    vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
2195
2196    vsadd.h       vr1,      vr11,     vr30         // c[0]
2197    vssub.h       vr2,      vr11,     vr30         // c[31]
2198    vsadd.h       vr24,     vr12,     vr28         // c[1]
2199    vssub.h       vr26,     vr12,     vr28         // c[30]
2200    vsadd.h       vr11,     vr13,     vr27         // c[2]
2201    vssub.h       vr30,     vr13,     vr27         // c[29]
2202    vsadd.h       vr12,     vr14,     vr29         // c[3]
2203    vssub.h       vr28,     vr14,     vr29         // c[28]
2204    vsadd.h       vr13,     vr15,     vr7          // c[4]
2205    vssub.h       vr27,     vr15,     vr7          // c[27]
2206    vsadd.h       vr14,     vr16,     vr6          // c[5]
2207    vssub.h       vr29,     vr16,     vr6          // c[26]
2208    vsadd.h       vr7,      vr17,     vr10         // c[6]
2209    vssub.h       vr15,     vr17,     vr10         // c[25]
2210    vsadd.h       vr6,      vr18,     vr8          // c[7]
2211    vssub.h       vr16,     vr18,     vr8          // c[24]
2212
2213.ifnb \transpose8x8
2214    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
2215                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
2216                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
2217.endif
2218
2219.ifnb \shift
2220.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
2221    vsrari.h      \i,       \i,       \shift
2222.endr
2223.endif
2224
2225    vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
2226
2227.ifnb \transpose8x8
2228    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
2229                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
2230                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
2231.endif
2232
2233.ifnb \shift
2234.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
2235    vsrari.h      \i,       \i,       \shift
2236.endr
2237.endif
2238
2239    vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
2240
2241    vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
2242
2243    vsadd.h       vr1,      vr11,     vr31         // c[8]
2244    vssub.h       vr2,      vr11,     vr31         // c[23]
2245    vsadd.h       vr24,     vr12,     vr19         // c[9]
2246    vssub.h       vr26,     vr12,     vr19         // c[22]
2247    vsadd.h       vr11,     vr13,     vr25         // c[10]
2248    vssub.h       vr30,     vr13,     vr25         // c[21]
2249    vsadd.h       vr12,     vr14,     vr5          // c[11]
2250    vssub.h       vr28,     vr14,     vr5          // c[20]
2251    vsadd.h       vr13,     vr15,     vr9          // c[12]
2252    vssub.h       vr27,     vr15,     vr9          // c[19]
2253    vsadd.h       vr14,     vr16,     vr4          // c[13]
2254    vssub.h       vr29,     vr16,     vr4          // c[18]
2255    vsadd.h       vr7,      vr17,     vr0          // c[14]
2256    vssub.h       vr15,     vr17,     vr0          // c[17]
2257    vsadd.h       vr6,      vr18,     vr3          // c[15]
2258    vssub.h       vr16,     vr18,     vr3          // c[16]
2259
2260.ifnb \transpose8x8
2261    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
2262                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
2263                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
2264.endif
2265
2266.ifnb \shift
2267.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
2268    vsrari.h      \i,       \i,       \shift
2269.endr
2270.endif
2271
2272    vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
2273
2274.ifnb \transpose8x8
2275    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
2276                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
2277                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
2278.endif
2279
2280.ifnb \shift
2281.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
2282    vsrari.h      \i,       \i,       \shift
2283.endr
2284.endif
2285
2286    vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
2287.endm
2288
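// eob thresholds used by the 32-point paths below to decide how many
// 8-coefficient groups the first pass has to process.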
2289const eob_32x32
2290        .short 36, 136, 300, 1024
2291endconst
2292
2293const eob_8x32
2294        .short 43, 107, 171, 256
2295endconst
2296
2297const eob_16x32
2298        .short 36, 151, 279, 512
2299endconst
2300
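// Add eight 8h coefficient vectors to two 32-pixel destination rows
// (vr10/vr11 at a0, vr12/vr13 at t2) with unsigned saturation.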
2301.macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7
2302    vsllwil.hu.bu vr4,      vr10,     0
2303    vsllwil.hu.bu vr5,      vr11,     0
2304    vsllwil.hu.bu vr6,      vr12,     0
2305    vsllwil.hu.bu vr7,      vr13,     0
2306    vexth.hu.bu   vr10,     vr10
2307    vexth.hu.bu   vr11,     vr11
2308    vexth.hu.bu   vr12,     vr12
2309    vexth.hu.bu   vr13,     vr13
2310    vadd.h        vr4,      vr4,      \in0
2311    vadd.h        vr10,     vr10,     \in1
2312    vadd.h        vr5,      vr5,      \in2
2313    vadd.h        vr11,     vr11,     \in3
2314    vadd.h        vr6,      vr6,      \in4
2315    vadd.h        vr12,     vr12,     \in5
2316    vadd.h        vr7,      vr7,      \in6
2317    vadd.h        vr13,     vr13,     \in7
2318    vssrani.bu.h  vr10,     vr4,      0
2319    vssrani.bu.h  vr11,     vr5,      0
2320    vssrani.bu.h  vr12,     vr6,      0
2321    vssrani.bu.h  vr13,     vr7,      0
2322    vst           vr10,     a0,       0
2323    vst           vr11,     a0,       16
2324    vst           vr12,     t2,       0
2325    vst           vr13,     t2,       16
2326.endm
2327
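// dc-only path for the 32-wide sizes: scales the dc coefficient into the
// constant to add (replicated in vr20) and preloads the first two
// destination rows into vr10..vr13.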
2328.macro idct_dc_w32 w, h, shift
2329    ld.h          t2,       a2,       0      // dc
2330    vldi          vr0,      0x8b5            // 181
2331    vreplgr2vr.w  vr1,      t2
2332    vldi          vr20,     0x880            // 128
2333    vmul.w        vr2,      vr0,      vr1    // dc * 181
2334    st.h          zero,     a2,       0
2335    add.d         t2,       a0,       a1
2336    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
2337    vld           vr13,     t2,       16
2338
2339.if (2*\w == \h) || (2*\h == \w)
2340    vmul.w        vr2,      vr2,      vr0
2341    vsrari.w      vr2,      vr2,      8
2342.endif
2343
2344.if \shift>0
2345    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
2346.endif
2347    vld           vr11,     a0,       16
2348    vmadd.w       vr20,     vr2,      vr0
2349    vld           vr12,     t2,       0
2350    vssrarni.h.w  vr20,     vr20,     12
2351    vld           vr10,     a0,       0
2352.endm
2353
2354function inv_txfm_add_dct_dct_32x8_8bpc_lsx
2355    bnez          a3,       .NO_HAS_DCONLY_32x8
2356
2357    idct_dc_w32 32, 8, 2
2358
2359    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2360
2361.rept 3
2362    alsl.d        a0,       a1,       a0,     1
2363    add.d         t2,       a0,       a1
2364    vld           vr10,     a0,       0
2365    vld           vr11,     a0,       16
2366    vld           vr12,     t2,       0
2367    vld           vr13,     t2,       16
2368    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2369.endr
2370    b             .DCT_DCT_32X8_END
2371.NO_HAS_DCONLY_32x8:
2372    malloc_space 512+256
2373
2374    addi.d        t1,       sp,       64
2375    addi.d        t2,       a2,       0
2376    addi.d        t3,       sp,       64
2377    addi.d        t3,       t3,       512
2378
2379    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2380            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2381
2382    vxor.v        vr31,     vr31,     vr31
2383    vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2384            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2385
2386    inv_dct16_lsx .8h
2387
2388    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2389            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2390
2391    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2392            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2393
2394    vxor.v        vr31,     vr31,     vr31
2395
2396    vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2397            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2398
2399    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
2400
2401    addi.d        t2,       sp,       64
2402.rept 4
2403    vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
2404
2405    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
2406
2407.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
2408    vsrari.h      \i,       \i,       4
2409.endr
2410
2411    vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
2412
2413    addi.d        t2,       t2,       16
2414.endr
2415
2416    addi.d        t0,       sp,       64
2417.rept 4
2418    add.d         t2,       a0,       a1
2419    vld           vr10,     a0,       0
2420    vld           vr11,     a0,       16
2421    vld           vr12,     t2,       0
2422    vld           vr13,     t2,       16
2423    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2424    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2425    alsl.d        a0,       a1,       a0,     1
2426    addi.d        t0,       t0,       128
2427.endr
2428    free_space 512+256
2429.DCT_DCT_32X8_END:
2430endfunc
2431
2432function inv_txfm_add_dct_dct_32x16_8bpc_lsx
2433    bnez          a3,       .NO_HAS_DCONLY_32x16
2434
2435    idct_dc_w32 32, 16, 1
2436
2437    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2438
2439.rept 7
2440    alsl.d        a0,       a1,       a0,     1
2441    add.d         t2,       a0,       a1
2442    vld           vr10,     a0,       0
2443    vld           vr11,     a0,       16
2444    vld           vr12,     t2,       0
2445    vld           vr13,     t2,       16
2446    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2447.endr
2448    b             .DCT_DCT_32X16_END
2449.NO_HAS_DCONLY_32x16:
2450    malloc_space 1024+256                            // 32*16*2+256
2451    addi.d        t1,       sp,       64
2452    addi.d        t2,       a2,       0
2453    addi.d        t3,       sp,       64
2454    addi.d        t3,       t3,       1024
2455.rept 2
2456    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2457            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2458
2459    vxor.v        vr31,     vr31,     vr31
2460    vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2461            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2462
2463    li.w          t0,       2896
2464    vreplgr2vr.w  vr23,     t0
2465.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2466     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2467    rect2_lsx   \i, vr23, \i
2468.endr
2469
2470    inv_dct16_lsx .8h
2471
2472    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2473            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2474
2475    vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2476            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2477
2478    la.local      t0,       idct_coeffs
2479    vldrepl.w     vr23,     t0,       0        // 2896
2480.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2481    vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2482    rect2_lsx \i, vr23, \i
2483.endr
2484    vxor.v        vr31,     vr31,     vr31
2485    vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2486            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2487
2488    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1
2489
2490    addi.d        t2,       t2,       16
2491    addi.d        t1,       t1,       512
2492.endr
2493
2494    addi.d        t2,       sp,       64
2495.rept 4
2496    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2497            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2498
2499    inv_dct16_lsx .8h
2500
2501.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2502    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2503    vsrari.h      \i,       \i,       4
2504.endr
2505
2506    vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2507            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2508
2509    addi.d        t2,       t2,       16
2510.endr
2511
2512    addi.d        t0,       sp,       64
2513.rept 8
2514    add.d         t2,       a0,       a1
2515    vld           vr10,     a0,       0
2516    vld           vr11,     a0,       16
2517    vld           vr12,     t2,       0
2518    vld           vr13,     t2,       16
2519    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2520    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2521
2522    alsl.d        a0,       a1,       a0,     1
2523    addi.d        t0,       t0,       128
2524.endr
2525    free_space 1024+256
2526.DCT_DCT_32X16_END:
2527endfunc
2528
2529function inv_txfm_add_dct_dct_32x32_8bpc_lsx
2530    bnez          a3,       .NO_HAS_DCONLY_32x32
2531
2532    idct_dc_w32 32, 32, 2
2533
2534    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2535.rept 15
2536    alsl.d        a0,       a1,       a0,     1
2537    add.d         t2,       a0,       a1
2538    vld           vr10,     a0,       0
2539    vld           vr11,     a0,       16
2540    vld           vr12,     t2,       0
2541    vld           vr13,     t2,       16
2542    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
2543.endr
2544    b             .DCT_DCT_32X32_END
2545.NO_HAS_DCONLY_32x32:
2546    malloc_space 2560                              // 32*32*2+512
2547
2548    addi.d        t1,       sp,       64
2549    addi.d        t2,       a2,       0
2550    addi.d        t3,       sp,       1024
2551    addi.d        t3,       t3,       1024
2552    addi.d        t3,       t3,       64
2553
2554    la.local      t8,       eob_32x32
2555.DCT_DCT_EOB_32x32:
2556    ld.h          t7,       t8,       0
2557    addi.d        t8,       t8,       2
2558
2559    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2560            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2561
2562    vxor.v        vr31,     vr31,     vr31
2563    vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2564            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2565
2566    inv_dct16_lsx .8h
2567
2568    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2569            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2570
2571    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2572            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2573
2574    vxor.v        vr31,     vr31,     vr31
2575
2576    vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2577            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2578
2579    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
2580
2581    addi.d        t2,       t2,       16
2582    addi.d        t1,       t1,       512
2583    bge           a3,       t7,       .DCT_DCT_EOB_32x32
2584
2585    la.local      t8,       eob_32x32
2586    vxor.v        vr31,     vr31,     vr31
2587    ld.h          t7,       t8,       4
2588    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END   // a3>=t7
2589    vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2590        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2591    addi.d        t1,       sp,       256+64
2592    vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2593        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2594
2595    ld.h          t7,       t8,       2
2596    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
2597    vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2598        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2599    vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2600        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2601
2602    ld.h          t7,       t8,       0
2603    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
2604    vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2605        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2606
2607    vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
2608        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
2609
2610.DCT_DCT_EOB_32x32_END:
2611    addi.d        t2,       sp,       64
2612    addi.d        t1,       sp,       64
2613.rept 4
2614    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2615            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2616
2617    inv_dct16_lsx .8h
2618
2619    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2620            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
2621
2622    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
2623            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2624
2625    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4
2626
2627    addi.d        t2,       t2,       16
2628    addi.d        t1,       t1,       16
2629.endr
2630
2631    addi.d        t0,       sp,       64
2632.rept 16
2633    add.d         t2,       a0,       a1
2634    vld           vr10,     a0,       0
2635    vld           vr11,     a0,       16
2636    vld           vr12,     t2,       0
2637    vld           vr13,     t2,       16
2638    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2639    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
2640    alsl.d        a0,       a1,       a0,     1
2641    addi.d        t0,       t0,       128
2642.endr
2643
2644    free_space 2560                                // 32*32*2+512
2645.DCT_DCT_32X32_END:
2646endfunc
2647
2648/*
2649 * temp: vr8, vr9, vr10, vr12, vr20, vr21, vr22, vr23
2650 */
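// 8-point DCT specialised for the 64-point transform: every rotation is
// computed from a single input, since the partner coefficients are zero in
// the 64-point case.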
2651.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
2652                             out1, out2, out3, out4, out5, out6, out7, rect2
2653
2654    la.local      t0,       idct_coeffs
2655
2656.ifc \rect2, rect2_lsx
2657    vldrepl.w     vr23,      t0,       0        // 2896
2658.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
2659    rect2_lsx \i, vr23, \i
2660.endr
2661.endif
2662
2663    la.local      t0,       idct_coeffs
2664
2665    vldrepl.w     vr20,     t0,       8            // 1567
2666    vldrepl.w     vr21,     t0,       12           // 3784
2667    vsllwil.w.h   vr22,     \in2,     0
2668    vexth.w.h     vr23,     \in2
2669    vmul.w        vr8,      vr22,     vr20
2670    vmul.w        vr10,     vr23,     vr20
2671    vmul.w        \in2,     vr22,     vr21
2672    vmul.w        vr9,      vr23,     vr21
2673    vssrarni.h.w  vr10,     vr8,      12           // t2
2674    vssrarni.h.w  vr9,      \in2,     12           // t3
2675
2676    vldrepl.w     vr20,     t0,       0            // 2896
2677    vsllwil.w.h   vr22,     \in0,     0
2678    vexth.w.h     vr23,     \in0
2679    vmul.w        vr8,      vr22,     vr20
2680    vmul.w        \in2,     vr23,     vr20
2681    vssrarni.h.w  \in2,     vr8,      12
2682
2683    vsadd.h       vr8,      \in2,     vr9          // c[0]
2684    vssub.h       vr9,      \in2,     vr9          // c[3]
2685    vsadd.h       \in0,     \in2,     vr10         // c[1]
2686    vssub.h       vr10,     \in2,     vr10         // c[2]
2687
2688    // inv_dct8_1d_internal_c tx64
2689    // in1 in3
2690    vldrepl.w     vr20,     t0,       16           // 799
2691    vldrepl.w     vr21,     t0,       20           // 4017
2692
2693    vsllwil.w.h   vr22,     \in1,     0
2694    vexth.w.h     vr23,     \in1
2695    vmul.w        \in2,     vr22,     vr21
2696    vmul.w        \in4,     vr23,     vr21
2697    vmul.w        \in1,     vr22,     vr20
2698    vmul.w        \in6,     vr23,     vr20
2699    vssrarni.h.w  \in4,     \in2,     12           // t7a
2700    vssrarni.h.w  \in6,     \in1,     12           // t4a
2701
2702    vldrepl.w     vr20,     t0,       24           // 3406
2703    vldrepl.w     vr21,     t0,       28           // 2276
2704
2705    vsllwil.w.h   vr22,     \in3,     0
2706    vexth.w.h     vr23,     \in3
2707    vneg.w        vr21,     vr21
2708    vmul.w        \in2,     vr22,     vr20
2709    vmul.w        \in1,     vr23,     vr20
2710    vmul.w        \in3,     vr22,     vr21
2711    vmul.w        \in7,     vr23,     vr21
2712    vssrarni.h.w  \in1,     \in2,     12           // t6a
2713    vssrarni.h.w  \in7,     \in3,     12           // t5a
2714
2715    vsadd.h       \in3,     \in6,     \in7         // t4
2716    vssub.h       \in6,     \in6,     \in7         // t5a
2717    vsadd.h       \in5,     \in4,     \in1         // t7
2718    vssub.h       \in4,     \in4,     \in1         // t6a
2719
2720    vldrepl.w     vr20,     t0,       0            // 2896
2721    vmul_vmadd_w  \in4, \in6, vr20, vr20, vr21, \in1
2722    vmul_vmsub_w  \in4, \in6, vr20, vr20, \in2, \in7
2723    vssrarni.h.w  \in1,     vr21,     12           // t6
2724    vssrarni.h.w  \in7,     \in2,     12           // t5
2725
2726    vsadd.h       \out0,    vr8,      \in5         // c[0]
2727    vssub.h       \out7,    vr8,      \in5         // c[7]
2728    vsadd.h       \out1,    \in0,     \in1         // c[1]
2729    vssub.h       \out6,    \in0,     \in1         // c[6]
2730    vsadd.h       \out2,    vr10,     \in7         // c[2]
2731    vssub.h       \out5,    vr10,     \in7         // c[5]
2732    vsadd.h       \out3,    vr9,      \in3         // c[3]
2733    vssub.h       \out4,    vr9,      \in3         // c[4]
2734.endm
2735
2736/*
2737 * input:  in0,  in1,  in2,  in3,  in4,  in5,  in6,  in7       (fixed)
2738 *         vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
2739 *         in8,  in9,  in10, in11, in12, in13, in14, in15
2740 *         vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
2741 * output: out0, out1, out2, out3, out4, out5, out6, out7      (fixed)
2742 *         vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
2743 *         out8, out9, out10, out11, out12, out13, out14, out15
2744 *         vr27, vr30, vr23,  vr21,  vr29,  vr26,  vr25,  vr24
2745 */
2746.macro dct_8x16_tx64_core_lsx rect2
2747    dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
2748                          vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2
2749
2750    // in1 in3 in5 in7 in9  in11 in13 in15
2751    // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
2752    la.local      t0,       idct_coeffs
2753
2754.ifc \rect2, rect2_lsx
2755    vldrepl.w     vr23,      t0,       0        // 2896
2756.irp i, vr1, vr3, vr5, vr7, vr24, vr26, vr28, vr30
2757    rect2_lsx \i, vr23, \i
2758.endr
2759.endif
2760
2761    vldrepl.w     vr20,     t0,       32           // 401
2762    vldrepl.w     vr21,     t0,       36           // 4076
2763    vsllwil.w.h   vr22,     vr1,      0
2764    vexth.w.h     vr23,     vr1
2765    vmul.w        vr0,      vr22,     vr21
2766    vmul.w        vr10,     vr23,     vr21
2767    vmul.w        vr1,      vr22,     vr20
2768    vmul.w        vr29,     vr23,     vr20
2769    vssrarni.h.w  vr10,     vr0,      12           // t15a
2770    vssrarni.h.w  vr29,     vr1,      12           // t8a
2771
2772    vldrepl.w     vr20,     t0,       40           // 3166 -> 1583
2773    vldrepl.w     vr21,     t0,       44           // 2598 -> 1299
2774    vsllwil.w.h   vr22,     vr7,      0
2775    vexth.w.h     vr23,     vr7
2776    vneg.w        vr21,     vr21
2777    vmul.w        vr0,      vr22,     vr20
2778    vmul.w        vr30,     vr23,     vr20
2779    vmul.w        vr7,      vr22,     vr21
2780    vmul.w        vr31,     vr23,     vr21
2781    vssrarni.h.w  vr30,     vr0,      12           // t14a
2782    vssrarni.h.w  vr31,     vr7,      12           // t9a
2783
2784    vldrepl.w     vr20,     t0,       48           // 1931
2785    vldrepl.w     vr21,     t0,       52           // 3612
2786    vsllwil.w.h   vr22,     vr5,      0
2787    vexth.w.h     vr23,     vr5
2788    vmul.w        vr0,      vr22,     vr21
2789    vmul.w        vr24,     vr23,     vr21
2790    vmul.w        vr5,      vr22,     vr20
2791    vmul.w        vr25,     vr23,     vr20
2792    vssrarni.h.w  vr24,     vr0,      12           // t13a
2793    vssrarni.h.w  vr25,     vr5,      12           // t10a
2794
2795    vldrepl.w     vr20,     t0,       56           // 3920
2796    vldrepl.w     vr21,     t0,       60           // 1189
2797    vsllwil.w.h   vr22,     vr3,      0
2798    vexth.w.h     vr23,     vr3
2799    vneg.w        vr21,     vr21
2800    vmul.w        vr0,      vr22,     vr20
2801    vmul.w        vr26,     vr23,     vr20
2802    vmul.w        vr3,      vr22,     vr21
2803    vmul.w        vr27,     vr23,     vr21
2804    vssrarni.h.w  vr26,     vr0,      12           // t12a
2805    vssrarni.h.w  vr27,     vr3,      12           // t11a
2806
2807    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
2808    vsadd.h       vr28,     vr29,      vr31        // t8
2809    vssub.h       vr19,     vr29,      vr31        // t9
2810    vssub.h       vr29,     vr27,      vr25        // t10
2811    vsadd.h       vr9,      vr27,      vr25        // t11
2812    vsadd.h       vr31,     vr26,      vr24        // t12
2813    vssub.h       vr25,     vr26,      vr24        // t13
2814    vssub.h       vr27,     vr10,      vr30        // t14
2815    vsadd.h       vr24,     vr10,      vr30        // t15
2816
2817    vldrepl.w     vr20,     t0,       8            // 1567
2818    vldrepl.w     vr21,     t0,       12           // 3784
2819    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
2820    vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
2821    vssrarni.h.w  vr26,     vr0,       12          // t14a
2822    vssrarni.h.w  vr30,     vr1,       12          // t9a
2823
2824    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
2825    vneg.w        vr0,      vr0
2826    vneg.w        vr19,     vr19
2827    vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
2828    vssrarni.h.w  vr19,     vr0,       12          // t10a
2829    vssrarni.h.w  vr27,     vr1,       12          // t13a
2830
2831    vsadd.h       vr25,     vr28,     vr9          // t8a
2832    vssub.h       vr29,     vr28,     vr9          // t11a
2833    vssub.h       vr28,     vr24,     vr31         // t12a
2834    vsadd.h       vr10,     vr24,     vr31         // t15a
2835    vsadd.h       vr9,      vr30,     vr19         // t9
2836    vssub.h       vr31,     vr30,     vr19         // t10
2837    vssub.h       vr30,     vr26,     vr27         // t13
2838    vsadd.h       vr24,     vr26,     vr27         // t14
2839
2840    vldrepl.w     vr20,     t0,       0            // 2896
2841    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
2842    vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
2843    vssrarni.h.w  vr26,     vr0,      12           // t13a
2844    vssrarni.h.w  vr27,     vr1,      12           // t10a
2845
2846    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
2847    vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
2848    vssrarni.h.w  vr31,     vr0,      12           // t12
2849    vssrarni.h.w  vr30,     vr1,      12           // t11
2850
2851    // vr11 vr12 ... vr18
2852    vsadd.h       vr28,     vr14,     vr31         // c[3]
2853    vssub.h       vr29,     vr14,     vr31         // c[12]
2854    vsadd.h       vr20,     vr15,     vr30         // c[4]
2855    vssub.h       vr21,     vr15,     vr30         // c[11]
2856    vsadd.h       vr14,     vr16,     vr27         // c[5]
2857    vssub.h       vr23,     vr16,     vr27         // c[10]
2858    vsadd.h       vr15,     vr17,     vr9          // c[6]
2859    vssub.h       vr30,     vr17,     vr9          // c[9]
2860    vsadd.h       vr16,     vr18,     vr25         // c[7]
2861    vssub.h       vr27,     vr18,     vr25         // c[8]
2862    vsadd.h       vr17,     vr13,     vr26         // c[2]
2863    vssub.h       vr26,     vr13,     vr26         // c[13]
2864    vsadd.h       vr18,     vr12,     vr24         // c[1]
2865    vssub.h       vr25,     vr12,     vr24         // c[14]
2866    vsadd.h       vr22,     vr11,     vr10         // c[0]
2867    vssub.h       vr24,     vr11,     vr10         // c[15]
2868.endm // dct_8x16_tx64_core_lsx
2869
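// Helper: widen the 16-bit lanes of \in0 to 32 bit, multiply the low and
// high halves by the scalar vectors \in1 and \in2, then narrow back with a
// rounding, saturating shift.  Per lane this is roughly
//   \out0 = clip16((\in0 * \in1 + 2048) >> 12)
//   \out1 = clip16((\in0 * \in2 + 2048) >> 12)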
2870.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
2871    vsllwil.w.h   vr22,      \in0,     0
2872    vexth.w.h     vr23,      \in0
2873    vmul.w        \tmp0,     vr22,     \in1
2874    vmul.w        \out0,     vr23,     \in1
2875    vmul.w        \tmp1,     vr22,     \in2
2876    vmul.w        \out1,     vr23,     \in2
2877    vssrarni.h.w  \out0,     \tmp0,    12
2878    vssrarni.h.w  \out1,     \tmp1,    12
2879.endm
2880
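// Rotation constants for the odd half (t32..t63) of the 64-point DCT,
// grouped as four blocks of 12 words; each dct64_step1_lsx call consumes
// one block (t0 is advanced by 48 bytes between calls).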
2881const idct64_coeffs, align=4
2882    .word         101, 4095, 2967, -2824
2883    .word         1660, 3745, 3822, -1474
2884    .word         4076, 401, 4017, 799
2885    .word         4036, -700, 2359, 3349
2886    .word         3461, -2191, 897, 3996
2887    .word         -3166, -2598, -799, -4017
2888    .word         501, 4065, 3229, -2520
2889    .word         2019, 3564, 3948, -1092
2890    .word         3612, 1931, 2276, 3406
2891    .word         4085, -301, 2675, 3102
2892    .word         3659, -1842, 1285, 3889
2893    .word         -3920, -1189, -3406, -2276
2894endconst
2895
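// First butterfly stage for one group of four odd-index input rows held in
// vr0..vr3: produces eight t32..t63 intermediates and stores them as eight
// vectors at t6 with a 16-byte stride.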
2896.macro dct64_step1_lsx
2897    vldrepl.w     vr20,     t0,       0            // 101
2898    vldrepl.w     vr21,     t0,       4            // 4095
2899    vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9    // vr8 t32a vr9 t63a
2900    vldrepl.w     vr20,     t0,       8            // 2967
2901    vldrepl.w     vr21,     t0,       12           // -2824
2902    vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11  // vr10 t62a vr11 t33a
2903    vldrepl.w     vr20,     t0,       16           // 1660
2904    vldrepl.w     vr21,     t0,       20           // 3745
2905    vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13  // vr12 t34a vr13 t61a
2906    vldrepl.w     vr20,     t0,       24           // 3822
2907    vldrepl.w     vr21,     t0,       28           // -1474
2908    vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15  // vr14 t60a vr15 t35a
2909
2910    vsadd.h       vr0,      vr8,      vr11         // t32
2911    vssub.h       vr1,      vr8,      vr11         // t33
2912    vssub.h       vr2,      vr15,     vr12         // t34
2913    vsadd.h       vr3,      vr15,     vr12         // t35
2914    vsadd.h       vr4,      vr14,     vr13         // t60
2915    vssub.h       vr5,      vr14,     vr13         // t61
2916    vssub.h       vr6,      vr9,      vr10         // t62
2917    vsadd.h       vr7,      vr9,      vr10         // t63
2918
2919    vldrepl.w     vr20,     t0,       32           // 4076
2920    vldrepl.w     vr21,     t0,       36           // 401
2921    vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
2922    vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
2923    vssrarni.h.w  vr10,     vr9,      12           // t62a
2924    vssrarni.h.w  vr11,     vr13,     12           // t33a
2925
2926    vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
2927    vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
2928    vneg.w        vr9,      vr9
2929    vneg.w        vr1,      vr1
2930    vssrarni.h.w  vr6,      vr13,     12           // t61a
2931    vssrarni.h.w  vr1,      vr9,      12           // t34a
2932
2933    vsadd.h       vr2,      vr0,      vr3          // t32a
2934    vssub.h       vr5,      vr0,      vr3          // t35a
2935    vsadd.h       vr9,      vr11,     vr1          // t33
2936    vssub.h       vr13,     vr11,     vr1          // t34
2937    vssub.h       vr0,      vr7,      vr4          // t60a
2938    vsadd.h       vr3,      vr7,      vr4          // t63a
2939    vssub.h       vr1,      vr10,     vr6          // t61
2940    vsadd.h       vr11,     vr10,     vr6          // t62
2941
2942    vldrepl.w     vr20,     t0,       40           // 4017
2943    vldrepl.w     vr21,     t0,       44           // 799
2944    vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
2945    vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
2946    vssrarni.h.w  vr4,      vr8,      12           // t61a
2947    vssrarni.h.w  vr7,      vr12,     12           // t34a
2948
2949    vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
2950    vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
2951    vssrarni.h.w  vr6,      vr8,      12           // t60
2952    vssrarni.h.w  vr10,     vr12,     12           // t35
2953
2954    vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
2955.endm // dct64_step1_lsx
2956
2957    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
2958    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
2959    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
2960    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
2961.macro dct64_step2_lsx
2962    vld           vr0,      t5,       0            // t32a
2963    vld           vr2,      t4,       0            // t63a
2964    vld           vr3,      t5,       16*8         // t56a
2965    vld           vr1,      t4,       16*8         // t39a
2966    vld           vr4,      t5,       16*16        // t40a
2967    vld           vr6,      t4,       16*16        // t55a
2968    vld           vr7,      t5,       16*24        // t48a
2969    vld           vr5,      t4,       16*24        // t47a
2970
2971    vsadd.h       vr8,      vr0,      vr1          // t32
2972    vssub.h       vr9,      vr0,      vr1          // t39
2973    vsadd.h       vr10,     vr2,      vr3          // t63
2974    vssub.h       vr11,     vr2,      vr3          // t56
2975    vssub.h       vr12,     vr5,      vr4          // t40
2976    vsadd.h       vr13,     vr5,      vr4          // t47
2977    vsadd.h       vr14,     vr7,      vr6          // t48
2978    vssub.h       vr15,     vr7,      vr6          // t55
2979    vldrepl.w     vr20,     t0,       8            // 1567
2980    vldrepl.w     vr21,     t0,       12           // 3784
2981    vmul_vmadd_w  vr11, vr9, vr21, vr20, vr0, vr2
2982    vmul_vmsub_w  vr11, vr9, vr20, vr21, vr1, vr3
2983    vssrarni.h.w  vr2,      vr0,      12           // t56a
2984    vssrarni.h.w  vr3,      vr1,      12           // t39a
2985    vmul_vmadd_w  vr15, vr12, vr21, vr20, vr0, vr4
2986    vmul_vmsub_w  vr15, vr12, vr20, vr21, vr1, vr5
2987    vneg.w        vr0,      vr0
2988    vneg.w        vr4,      vr4
2989    vssrarni.h.w  vr5,      vr1,      12           // t55a
2990    vssrarni.h.w  vr4,      vr0,      12           // t40a
2991    vsadd.h       vr9,      vr8,      vr13         // t32a
2992    vssub.h       vr11,     vr8,      vr13         // t47a
2993    vsadd.h       vr6,      vr3,      vr4          // t39
2994    vssub.h       vr7,      vr3,      vr4          // t40
2995    vssub.h       vr12,     vr10,     vr14         // t48a
2996    vsadd.h       vr15,     vr10,     vr14         // t63a
2997    vssub.h       vr0,      vr2,      vr5          // t55
2998    vsadd.h       vr1,      vr2,      vr5          // t56
2999
3000    vldrepl.w     vr20,     t0,       0            // 2896
3001    vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
3002    vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
3003    vssrarni.h.w  vr13,     vr8,      12           // t40a
3004    vssrarni.h.w  vr4,      vr3,      12           // t55a
3005    vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
3006    vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
3007    vssrarni.h.w  vr10,     vr8,      12           // t47
3008    vssrarni.h.w  vr14,     vr3,      12           // t48
3009
3010    // t32a t39 t40a t47  t48  t55a t56 t63a
3011    // vr9  vr6 vr13 vr10 vr14 vr4  vr1 vr15
3012    vst           vr9,      t5,       0            // t32a
3013    vst           vr6,      t4,       0            // t39
3014    vst           vr13,     t5,       16*8         // t40a
3015    vst           vr10,     t4,       16*8         // t47
3016    vst           vr14,     t5,       16*16        // t48
3017    vst           vr4,      t4,       16*16        // t55a
3018    vst           vr1,      t5,       16*24        // t56
3019    vst           vr15,     t4,       16*24        // t63a
3020.endm // dct64_step2_lsx
3021
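// Combines the eight partial sums loaded from t3 with the t56..t63 values
// addressed via t5/t4 to produce the outermost output rows c[0..7] and
// c[56..63] of the 64-point column.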
3022.macro dct64_step3_lsx
3023    //                t0   t1   t2   t3   t4    t5    t6    t7
3024    vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17
3025    vld           vr9,      t5,       16*24    // t56
3026    vld           vr6,      t5,       16*24+16 // t57a
3027    vld           vr13,     t5,       16*24+32 // t58
3028    vld           vr10,     t5,       16*24+48 // t59a
3029    vld           vr14,     t4,       16*24-48 // t60
3030    vld           vr4,      t4,       16*24-32 // t61a
3031    vld           vr1,      t4,       16*24-16 // t62
3032    vld           vr15,     t4,       16*24    // t63a
3033    vsadd.h       vr20,     vr2,      vr15     // c[0]
3034    vssub.h       vr21,     vr2,      vr15     // c[63]
3035    vsadd.h       vr22,     vr3,      vr1      // c[1]
3036    vssub.h       vr23,     vr3,      vr1      // c[62]
3037    vsadd.h       vr24,     vr7,      vr4      // c[2]
3038    vssub.h       vr25,     vr7,      vr4      // c[61]
3039    vsadd.h       vr26,     vr8,      vr14     // c[3]
3040    vssub.h       vr27,     vr8,      vr14     // c[60]
3041    vsadd.h       vr28,     vr11,     vr10     // c[4]
3042    vssub.h       vr29,     vr11,     vr10     // c[59]
3043    vsadd.h       vr30,     vr12,     vr13     // c[5]
3044    vssub.h       vr31,     vr12,     vr13     // c[58]
3045    vsadd.h       vr2,      vr16,     vr6      // c[6]
3046    vssub.h       vr15,     vr16,     vr6      // c[57]
3047    vsadd.h       vr1,      vr17,     vr9      // c[7]
3048    vssub.h       vr3,      vr17,     vr9      // c[56]
3049.endm // dct64_step3_lsx
3050
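// Runs dct64_step3_lsx, optionally transposes the two resulting 8x8 blocks
// and/or applies a rounding right shift, then stores both groups of eight
// vectors at t7 using the given start offsets and strides.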
3051.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1
3052    dct64_step3_lsx
3053
3054.ifnb \transpose8x8
3055    LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
3056                       vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
3057                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
3058
3059    LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
3060                       vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
3061                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
3062.endif
3063
3064.ifnb \shift
3065.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
3066     vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3067     vsrari.h     \i,       \i,       \shift
3068.endr
3069.endif
3070
3071    vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
3072
3073    vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3074.endm // dct64_step4_lsx
3075
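// Final add-back for eight rows of one 8-pixel-wide column: load the
// destination pixels, widen to 16 bit, add the coefficients rounded by
// >> 4, saturate back to 8 bit and store two rows per vector.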
3076.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
3077    fld.d         f4,       t0,       0
3078    fldx.d        f5,       t0,       a1
3079    fld.d         f6,       t6,       0
3080    fldx.d        f7,       t6,       a1
3081    alsl.d        t0,       a1,       t0,    2
3082    alsl.d        t6,       a1,       t6,    2
3083    fld.d         f8,       t0,       0
3084    fldx.d        f9,       t0,       a1
3085    fld.d         f10,      t6,       0
3086    fldx.d        f11,      t6,       a1
3087.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
3088    vsllwil.hu.bu   \i,      \i,       0
3089.endr
3090    vsrari.h      vr20,     \in0,     4
3091    vsrari.h      vr22,     \in1,     4
3092    vsrari.h      vr24,     \in2,     4
3093    vsrari.h      vr26,     \in3,     4
3094    vsrari.h      vr28,     \in4,     4
3095    vsrari.h      vr30,     \in5,     4
3096    vsrari.h      vr2,      \in6,     4
3097    vsrari.h      vr1,      \in7,     4
3098    vadd.h        vr4,      vr4,      vr20
3099    vadd.h        vr5,      vr5,      vr22
3100    vadd.h        vr6,      vr6,      vr24
3101    vadd.h        vr7,      vr7,      vr26
3102    vadd.h        vr8,      vr8,      vr28
3103    vadd.h        vr9,      vr9,      vr30
3104    vadd.h        vr10,     vr10,     vr2
3105    vadd.h        vr11,     vr11,     vr1
3106    vssrani.bu.h  vr5,      vr4,      0
3107    vssrani.bu.h  vr7,      vr6,      0
3108    vssrani.bu.h  vr9,      vr8,      0
3109    vssrani.bu.h  vr11,     vr10,     0
3110
3111    vstelm.d      vr5,      t1,       0,     0
3112    vstelm.d      vr5,      t2,       0,     1
3113    alsl.d        t1,       a1,       t1,    1
3114    alsl.d        t2,       a1,       t2,    1
3115    vstelm.d      vr7,      t1,       0,     0
3116    vstelm.d      vr7,      t2,       0,     1
3117    alsl.d        t1,       a1,       t1,    1
3118    alsl.d        t2,       a1,       t2,    1
3119    vstelm.d      vr9,      t1,       0,     0
3120    vstelm.d      vr9,      t2,       0,     1
3121    alsl.d        t1,       a1,       t1,    1
3122    alsl.d        t2,       a1,       t2,    1
3123    vstelm.d      vr11,     t1,       0,     0
3124    vstelm.d      vr11,     t2,       0,     1
3125.endm // dct64_step5_lsx
3126
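// Column pass over the 32 usable coefficient rows of a 64-point transform:
// the 8x16 core handles the first operand block, the second block is turned
// into t16..t31, and the merged 32 vectors are stored at t3.  Source rows
// are zeroed after loading so later passes see cleared coefficients.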
3127.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1, rect2
3128    vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3129
3130    dct_8x16_tx64_core_lsx \rect2
3131
3132    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
3133            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
3134
3135    vxor.v        vr31,     vr31,     vr31
3136    vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3137
3138    vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3139
3140    vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3141
3142    la.local      t0,       idct_coeffs
3143
3144.ifc \rect2, rect2_lsx
3145    vldrepl.w     vr23,      t0,       0        // 2896
3146.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3147    rect2_lsx \i, vr23, \i
3148.endr
3149.endif
3150
3151    vldrepl.w     vr20,     t0,       64           // 201
3152    vldrepl.w     vr21,     t0,       68           // 4091
3153    vsllwil.w.h   vr22,     vr0,      0
3154    vexth.w.h     vr23,     vr0
3155    vmul.w        vr8,      vr22,     vr21
3156    vmul.w        vr9,      vr23,     vr21
3157    vmul.w        vr0,      vr22,     vr20
3158    vmul.w        vr10,     vr23,     vr20
3159    vssrarni.h.w  vr9,      vr8,      12           // t31a
3160    vssrarni.h.w  vr10,     vr0,      12           // t16a
3161
3162    vldrepl.w     vr20,     t0,       72           // 3035
3163    vldrepl.w     vr21,     t0,       76           // 2751
3164    vsllwil.w.h   vr22,     vr7,      0
3165    vexth.w.h     vr23,     vr7
3166    vneg.w        vr21,     vr21
3167    vmul.w        vr8,      vr22,     vr20
3168    vmul.w        vr0,      vr23,     vr20
3169    vmul.w        vr7,      vr22,     vr21
3170    vmul.w        vr30,     vr23,     vr21
3171    vssrarni.h.w  vr0,      vr8,      12           // t30a
3172    vssrarni.h.w  vr30,     vr7,      12           // t17a
3173
3174    vldrepl.w     vr20,     t0,       80           // 1751
3175    vldrepl.w     vr21,     t0,       84           // 3703
3176    vsllwil.w.h   vr22,     vr4,      0
3177    vexth.w.h     vr23,     vr4
3178    vmul.w        vr8,      vr22,     vr21
3179    vmul.w        vr7,      vr23,     vr21
3180    vmul.w        vr4,      vr22,     vr20
3181    vmul.w        vr19,     vr23,     vr20
3182    vssrarni.h.w  vr7,      vr8,      12           // t29a
3183    vssrarni.h.w  vr19,     vr4,      12           // t18a
3184
3185    vldrepl.w     vr20,     t0,       88           // 3857
3186    vldrepl.w     vr21,     t0,       92           // 1380
3187    vsllwil.w.h   vr22,     vr3,      0
3188    vexth.w.h     vr23,     vr3
3189    vneg.w        vr21,     vr21
3190    vmul.w        vr8,      vr22,     vr20
3191    vmul.w        vr4,      vr23,     vr20
3192    vmul.w        vr3,      vr22,     vr21
3193    vmul.w        vr26,     vr23,     vr21
3194    vssrarni.h.w  vr4,      vr8,      12           // t28a
3195    vssrarni.h.w  vr26,     vr3,      12           // t19a
3196
3197    vldrepl.w     vr20,     t0,       96           // 995
3198    vldrepl.w     vr21,     t0,       100          // 3973
3199    vsllwil.w.h   vr22,     vr2,      0
3200    vexth.w.h     vr23,     vr2
3201    vmul.w        vr8,      vr22,     vr21
3202    vmul.w        vr3,      vr23,     vr21
3203    vmul.w        vr2,      vr22,     vr20
3204    vmul.w        vr27,     vr23,     vr20
3205    vssrarni.h.w  vr3,      vr8,      12           // t27a
3206    vssrarni.h.w  vr27,     vr2,      12           // t20a
3207
3208    vldrepl.w     vr20,     t0,       104          // 3513
3209    vldrepl.w     vr21,     t0,       108          // 2106
3210    vsllwil.w.h   vr22,     vr5,      0
3211    vexth.w.h     vr23,     vr5
3212    vneg.w        vr21,     vr21
3213    vmul.w        vr8,      vr22,     vr20
3214    vmul.w        vr2,      vr23,     vr20
3215    vmul.w        vr5,      vr22,     vr21
3216    vmul.w        vr28,     vr23,     vr21
3217    vssrarni.h.w  vr2,      vr8,      12           // t26a
3218    vssrarni.h.w  vr28,     vr5,      12           // t21a
3219
3220    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
3221    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
3222    vsllwil.w.h   vr22,     vr6,      0
3223    vexth.w.h     vr23,     vr6
3224    vmul.w        vr8,      vr22,     vr21
3225    vmul.w        vr5,      vr23,     vr21
3226    vmul.w        vr6,      vr22,     vr20
3227    vmul.w        vr25,     vr23,     vr20
3228    vssrarni.h.w  vr5,      vr8,      12           // t25a
3229    vssrarni.h.w  vr25,     vr6,      12           // t22a
3230
3231    vldrepl.w     vr20,     t0,       120          // 4052
3232    vldrepl.w     vr21,     t0,       124          // 601
3233    vsllwil.w.h   vr22,     vr1,      0
3234    vexth.w.h     vr23,     vr1
3235    vneg.w        vr21,     vr21
3236    vmul.w        vr8,      vr22,     vr20
3237    vmul.w        vr6,      vr23,     vr20
3238    vmul.w        vr1,      vr22,     vr21
3239    vmul.w        vr24,     vr23,     vr21
3240    vssrarni.h.w  vr6,      vr8,      12           // t24a
3241    vssrarni.h.w  vr24,     vr1,      12           // t23a
3242
3243    vsadd.h       vr1,      vr10,     vr30         // t16
3244    vssub.h       vr29,     vr10,     vr30         // t17
3245    vssub.h       vr8,      vr26,     vr19         // t18
3246    vsadd.h       vr31,     vr26,     vr19         // t19
3247    vsadd.h       vr10,     vr27,     vr28         // t20
3248    vssub.h       vr30,     vr27,     vr28         // t21
3249    vssub.h       vr19,     vr24,     vr25         // t22
3250    vsadd.h       vr26,     vr24,     vr25         // t23
3251    vsadd.h       vr27,     vr6,      vr5          // t24
3252    vssub.h       vr28,     vr6,      vr5          // t25
3253    vssub.h       vr24,     vr3,      vr2          // t26
3254    vsadd.h       vr25,     vr3,      vr2          // t27
3255    vsadd.h       vr5,      vr4,      vr7          // t28
3256    vssub.h       vr6,      vr4,      vr7          // t29
3257    vssub.h       vr2,      vr9,      vr0          // t30
3258    vsadd.h       vr3,      vr9,      vr0          // t31
3259
3260    vldrepl.w     vr20,     t0,       16           // 799
3261    vldrepl.w     vr21,     t0,       20           // 4017
3262    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
3263    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
3264    vssrarni.h.w  vr7,      vr4,      12           // t30a
3265    vssrarni.h.w  vr0,      vr11,     12           // t17a
3266    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
3267    vneg.w        vr4,      vr4
3268    vneg.w        vr9,      vr9
3269    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
3270    vssrarni.h.w  vr9,      vr4,      12           // t18a
3271    vssrarni.h.w  vr2,      vr11,     12           // t29a
3272
3273    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
3274    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
3275    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
3276    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
3277    vssrarni.h.w  vr29,     vr4,      12           // t26a
3278    vssrarni.h.w  vr6,      vr11,     12           // t21a
3279
3280    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
3281    vneg.w        vr4,      vr4
3282    vneg.w        vr8,      vr8
3283    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
3284    vssrarni.h.w  vr8,      vr4,      12           // t22a
3285    vssrarni.h.w  vr24,     vr11,     12           // t25a
3286
3287    vsadd.h       vr4,      vr1,      vr31         // t16a
3288    vssub.h       vr30,     vr1,      vr31         // t19a
3289    vsadd.h       vr19,     vr0,      vr9          // t17
3290    vssub.h       vr28,     vr0,      vr9          // t18
3291    vssub.h       vr1,      vr26,     vr10         // t20a
3292    vsadd.h       vr31,     vr26,     vr10         // t23a
3293    vssub.h       vr0,      vr8,      vr6          // t21
3294    vsadd.h       vr9,      vr8,      vr6          // t22
3295    vsadd.h       vr10,     vr27,     vr25         // t24a
3296    vssub.h       vr26,     vr27,     vr25         // t27a
3297    vsadd.h       vr6,      vr24,     vr29         // t25
3298    vssub.h       vr8,      vr24,     vr29         // t26
3299    vssub.h       vr25,     vr3,      vr5          // t28a
3300    vsadd.h       vr27,     vr3,      vr5          // t31a
3301    vssub.h       vr24,     vr7,      vr2          // t29
3302    vsadd.h       vr29,     vr7,      vr2          // t30
3303
3304    vldrepl.w     vr20,     t0,       8            // 1567
3305    vldrepl.w     vr21,     t0,       12           // 3784
3306    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
3307    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
3308    vssrarni.h.w  vr5,      vr3,      12           // t29a
3309    vssrarni.h.w  vr2,      vr11,     12           // t18a
3310
3311    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
3312    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
3313    vssrarni.h.w  vr7,      vr3,      12           // t28
3314    vssrarni.h.w  vr24,     vr11,     12           // t19
3315
3316    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
3317    vneg.w        vr3,      vr3
3318    vneg.w        vr28,     vr28
3319    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
3320    vssrarni.h.w  vr28,     vr3,      12           // t20
3321    vssrarni.h.w  vr25,     vr11,     12           // t27
3322
3323    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
3324    vneg.w        vr3,      vr3
3325    vneg.w        vr30,     vr30
3326    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
3327    vssrarni.h.w  vr30,     vr3,      12           // t21a
3328    vssrarni.h.w  vr1,      vr11,     12           // t26a
3329
3330    vsadd.h       vr3,      vr4,      vr31         // t16
3331    vssub.h       vr26,     vr4,      vr31         // t23
3332    vsadd.h       vr0,      vr19,     vr9          // t17a
3333    vssub.h       vr8,      vr19,     vr9          // t22a
3334    vsadd.h       vr4,      vr2,      vr30         // t18
3335    vssub.h       vr31,     vr2,      vr30         // t21
3336    vsadd.h       vr9,      vr24,     vr28         // t19a
3337    vssub.h       vr19,     vr24,     vr28         // t20a
3338    vssub.h       vr2,      vr27,     vr10         // t24
3339    vsadd.h       vr30,     vr27,     vr10         // t31
3340    vssub.h       vr24,     vr29,     vr6          // t25a
3341    vsadd.h       vr28,     vr29,     vr6          // t30a
3342    vssub.h       vr10,     vr5,      vr1          // t26
3343    vsadd.h       vr27,     vr5,      vr1          // t29
3344    vssub.h       vr6,      vr7,      vr25         // t27a
3345    vsadd.h       vr29,     vr7,      vr25         // t28a
3346
3347    vldrepl.w     vr20,     t0,       0            // 2896
3348    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
3349    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
3350    vssrarni.h.w  vr5,      vr1,      12           // t20
3351    vssrarni.h.w  vr7,      vr11,     12           // t27
3352
3353    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
3354    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
3355    vssrarni.h.w  vr25,     vr1,      12           // t21a
3356    vssrarni.h.w  vr6,      vr11,     12           // t26a
3357
3358    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
3359    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
3360    vssrarni.h.w  vr19,     vr1,      12           // t22
3361    vssrarni.h.w  vr10,     vr11,     12           // t25
3362
3363    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
3364    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
3365    vssrarni.h.w  vr31,     vr1,      12           // t23a
3366    vssrarni.h.w  vr8,      vr11,     12           // t24a
3367
3368    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
3369    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
3370    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
3371
3372    vsadd.h       vr1,      vr11,     vr30         // c[0]
3373    vssub.h       vr2,      vr11,     vr30         // c[31]
3374    vsadd.h       vr24,     vr12,     vr28         // c[1]
3375    vssub.h       vr26,     vr12,     vr28         // c[30]
3376    vsadd.h       vr11,     vr13,     vr27         // c[2]
3377    vssub.h       vr30,     vr13,     vr27         // c[29]
3378    vsadd.h       vr12,     vr14,     vr29         // c[3]
3379    vssub.h       vr28,     vr14,     vr29         // c[28]
3380    vsadd.h       vr13,     vr15,     vr7          // c[4]
3381    vssub.h       vr27,     vr15,     vr7          // c[27]
3382    vsadd.h       vr14,     vr16,     vr6          // c[5]
3383    vssub.h       vr29,     vr16,     vr6          // c[26]
3384    vsadd.h       vr7,      vr17,     vr10         // c[6]
3385    vssub.h       vr15,     vr17,     vr10         // c[25]
3386    vsadd.h       vr6,      vr18,     vr8          // c[7]
3387    vssub.h       vr16,     vr18,     vr8          // c[24]
3388
3389    vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
3390
3391    vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
3392
3393    vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
3394
3395    vsadd.h       vr1,      vr11,     vr31         // c[8]
3396    vssub.h       vr2,      vr11,     vr31         // c[23]
3397    vsadd.h       vr24,     vr12,     vr19         // c[9]
3398    vssub.h       vr26,     vr12,     vr19         // c[22]
3399    vsadd.h       vr11,     vr13,     vr25         // c[10]
3400    vssub.h       vr30,     vr13,     vr25         // c[21]
3401    vsadd.h       vr12,     vr14,     vr5          // c[11]
3402    vssub.h       vr28,     vr14,     vr5          // c[20]
3403    vsadd.h       vr13,     vr15,     vr9          // c[12]
3404    vssub.h       vr27,     vr15,     vr9          // c[19]
3405    vsadd.h       vr14,     vr16,     vr4          // c[13]
3406    vssub.h       vr29,     vr16,     vr4          // c[18]
3407    vsadd.h       vr7,      vr17,     vr0          // c[14]
3408    vssub.h       vr15,     vr17,     vr0          // c[17]
3409    vsadd.h       vr6,      vr18,     vr3          // c[15]
3410    vssub.h       vr16,     vr18,     vr3          // c[16]
3411
3412    vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
3413
3414    vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
3415.endm // dct_8x32_tx64_new_lsx
3416
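// Add eight vectors of 16-bit residuals to one 64-pixel-wide destination
// row held in vr10..vr13, saturating back to unsigned 8 bit.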
3417.macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7
3418    vsllwil.hu.bu vr4,      vr10,     0
3419    vsllwil.hu.bu vr5,      vr11,     0
3420    vsllwil.hu.bu vr6,      vr12,     0
3421    vsllwil.hu.bu vr7,      vr13,     0
3422    vexth.hu.bu   vr10,     vr10
3423    vexth.hu.bu   vr11,     vr11
3424    vexth.hu.bu   vr12,     vr12
3425    vexth.hu.bu   vr13,     vr13
3426    vadd.h        vr4,      vr4,      \in0
3427    vadd.h        vr10,     vr10,     \in1
3428    vadd.h        vr5,      vr5,      \in2
3429    vadd.h        vr11,     vr11,     \in3
3430    vadd.h        vr6,      vr6,      \in4
3431    vadd.h        vr12,     vr12,     \in5
3432    vadd.h        vr7,      vr7,      \in6
3433    vadd.h        vr13,     vr13,     \in7
3434    vssrani.bu.h  vr10,     vr4,      0
3435    vssrani.bu.h  vr11,     vr5,      0
3436    vssrani.bu.h  vr12,     vr6,      0
3437    vssrani.bu.h  vr13,     vr7,      0
3438    vst           vr10,     a0,       0
3439    vst           vr11,     a0,       16
3440    vst           vr12,     a0,       32
3441    vst           vr13,     a0,       48
3442.endm
3443
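// DC-only shortcut for 64-pixel-wide blocks.  vldi 0x8b5 / 0x880 replicate
// 181 and 128 into the 32-bit lanes, so the sequence computes roughly
//   dc = (c[0] * 181 + 128) >> 8        (applied twice for rectangular sizes)
// followed by the optional \shift and a final (dc * 181 + 128 + 2048) >> 12,
// broadcast into vr20 for DST_ADD_W64.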
3444.macro idct_dc_w64 w, h, shift
3445    ld.h          t2,       a2,       0
3446    vldi          vr0,      0x8b5
3447    vreplgr2vr.w  vr1,      t2
3448    vldi          vr20,     0x880
3449    vmul.w        vr2,      vr0,      vr1
3450    st.h          zero,     a2,       0
3451    vsrari.w      vr2,      vr2,      8
3452    vld           vr13,     a0,       48
3453
3454.if (2*\w == \h) || (2*\h == \w)
3455    vmul.w        vr2,      vr2,      vr0
3456    vsrari.w      vr2,      vr2,      8
3457.endif
3458
3459.if \shift>0
3460    vsrari.w      vr2,      vr2,      \shift
3461.endif
3462    vld           vr11,     a0,       16
3463    vmadd.w       vr20,     vr2,      vr0
3464    vld           vr12,     a0,       32
3465    vssrarni.h.w  vr20,     vr20,     12
3466    vld           vr10,     a0,       0
3467.endm
3468
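// 64x64 inverse DCT: when a3 is zero (no coefficients beyond DC) the DC-only
// path is taken; otherwise two full passes run over a stack buffer of
// 64*32*2 bytes plus two 512-byte scratch areas reserved by malloc_space.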
3469function inv_txfm_add_dct_dct_64x64_8bpc_lsx
3470    bnez          a3,       .NO_HAS_DCONLY_64x64
3471
3472    idct_dc_w64 64, 64, 2
3473
3474    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
3475
3476    li.w          t3,       63
3477.loop63:
3478    add.d         a0,       a0,       a1
3479    vld           vr10,     a0,       0
3480    vld           vr11,     a0,       16
3481    vld           vr12,     a0,       32
3482    vld           vr13,     a0,       48
3483    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
3484    addi.d        t3,       t3,       -1
3485    blt           zero,     t3,       .loop63
3486    b             .DCT_DCT_64X64_END
3487.NO_HAS_DCONLY_64x64:
3488
3489    malloc_space  64*32*2+512+512
3490
3491.macro dct64x64_core1_lsx shift, rect2
3492    //addi.d        t2,       a2,       \in0
3493    //addi.d        t7,       t7,       \in1
3494    li.w          t4,       64*32*2+64
3495    add.d         t3,       sp,       t4
3496    addi.d        t6,       t3,       512
3497    add.d         t5,       t6,       zero
3498
3499    dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2
3500
3501    la.local      t0,       idct64_coeffs
3502    vxor.v        vr31,     vr31,     vr31
3503
3504    //addi.d        a4,       a2,       \in2         // 32 ...
3505    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
3506    vld           vr0,      a4,       128*0        // in1
3507    vld           vr1,      a4,       128*15       // in31
3508    vld           vr2,      a4,       128*8        // in17
3509    vld           vr3,      a4,       128*7        // in15
3510    la.local      a6,       idct_coeffs
3511.ifc \rect2, rect2_lsx
3512    vldrepl.w     vr23,      a6,       0        // 2896
3513.irp i, vr0, vr1, vr2, vr3
3514    rect2_lsx \i, vr23, \i
3515.endr
3516.endif
3517    vst           vr31,     a4,       128*0
3518    vst           vr31,     a4,       128*15
3519    vst           vr31,     a4,       128*8
3520    vst           vr31,     a4,       128*7
3521    dct64_step1_lsx
3522
3523    addi.d        t0,       t0,       48
3524    addi.d        t6,       t6,       128
3525    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
3526    vld           vr0,      a4,       128*3        // in7
3527    vld           vr1,      a4,       128*12       // in25
3528    vld           vr2,      a4,       128*11       // in23
3529    vld           vr3,      a4,       128*4        // in9
3530    la.local      a6,       idct_coeffs
3531.ifc \rect2, rect2_lsx
3532    vldrepl.w     vr23,      a6,       0        // 2896
3533.irp i, vr0, vr1, vr2, vr3
3534    rect2_lsx \i, vr23, \i
3535.endr
3536.endif
3537    vst           vr31,     a4,       128*3
3538    vst           vr31,     a4,       128*12
3539    vst           vr31,     a4,       128*11
3540    vst           vr31,     a4,       128*4
3541    dct64_step1_lsx
3542
3543    addi.d        t0,       t0,       48
3544    addi.d        t6,       t6,       128
3545    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
3546    vld           vr0,      a4,       128*2        // in5
3547    vld           vr1,      a4,       128*13       // in27
3548    vld           vr2,      a4,       128*10       // in21
3549    vld           vr3,      a4,       128*5        // in11
3550    la.local      a6,       idct_coeffs
3551.ifc \rect2, rect2_lsx
3552    vldrepl.w     vr23,      a6,      0        // 2896
3553.irp i, vr0, vr1, vr2, vr3
3554    rect2_lsx \i, vr23, \i
3555.endr
3556.endif
3557    vst           vr31,     a4,       128*2
3558    vst           vr31,     a4,       128*13
3559    vst           vr31,     a4,       128*10
3560    vst           vr31,     a4,       128*5
3561    dct64_step1_lsx
3562
3563    addi.d        t0,       t0,       48
3564    addi.d        t6,       t6,       128
3565    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
3566    vld           vr0,      a4,       128*1        // in3
3567    vld           vr1,      a4,       128*14       // in29
3568    vld           vr2,      a4,       128*9        // in19
3569    vld           vr3,      a4,       128*6        // in13
3570    la.local      a6,       idct_coeffs
3571.ifc \rect2, rect2_lsx
3572    vldrepl.w     vr23,      a6,       0        // 2896
3573.irp i, vr0, vr1, vr2, vr3
3574    rect2_lsx \i, vr23, \i
3575.endr
3576.endif
3577    vst           vr31,     a4,       128*1
3578    vst           vr31,     a4,       128*14
3579    vst           vr31,     a4,       128*9
3580    vst           vr31,     a4,       128*6
3581    dct64_step1_lsx
3582
3583    la.local      t0,       idct_coeffs
3584    addi.d        t4,       t5,       16*7
3585    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
3586    dct64_step2_lsx
3587
3588    addi.d        t5,       t5,       16
3589    addi.d        t4,       t4,       -16
3590    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
3591    dct64_step2_lsx
3592
3593    addi.d        t5,       t5,       16
3594    addi.d        t4,       t4,       -16
3595    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
3596    dct64_step2_lsx
3597
3598    addi.d        t5,       t5,       16
3599    addi.d        t4,       t4,       -16
3600    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
3601    dct64_step2_lsx
3602
3603    li.w          t4,       64*32*2+64+512
3604    add.d         t5,       t4,       sp
3605    addi.d        t4,       t5,       16*7
3606    dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128
3607
3608    addi.d        t3,       t3,       128
3609    addi.d        t4,       t4,       -16*8
3610    addi.d        t5,       t5,       -16*8
3611    dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128
3612
3613    addi.d        t5,       t5,       -16*8
3614    addi.d        t4,       t4,       -16*8
3615    addi.d        t3,       t3,       128
3616    dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128
3617
3618    addi.d        t5,       t5,       -16*8
3619    addi.d        t4,       t4,       -16*8
3620    addi.d        t3,       t3,       128
3621    dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128
3622.endm
3623    la.local      t8,       eob_32x32
3624    addi.d        t2,       a2,       0
3625    addi.d        t7,       sp,       64
3626    addi.d        t7,       t7,       0
3627    addi.d        a4,       a2,       64
3628.DCT_DCT_EOB_64x64:
3629    ld.h          a5,       t8,       0
3630    addi.d        t8,       t8,       2
3631    dct64x64_core1_lsx 2, no_rect2
3632    addi.d        t2,       t2,       16
3633    addi.d        t7,       t7,       128*8
3634    addi.d        a4,       a4,       16
3635    bge           a3,       a5,       .DCT_DCT_EOB_64x64
3636
3637    la.local      t8,       eob_32x32
3638    vxor.v        vr31,     vr31,     vr31
3639
3640    ld.h          t7,       t8,       4
3641    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
3642    li.d          t1,       1024*3+64
3643    add.d         t0,       sp,       t1
3644.rept 4
3645    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3646            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3647    addi.d t0, t0, 256
3648.endr
3649
3650    ld.h          t7,       t8,       2
3651    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
3652    li.d          t1,       1024*2+64
3653    add.d         t0,       sp,       t1
3654.rept 4
3655    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3656            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3657    addi.d        t0,       t0,       256
3658.endr
3659    ld.h          t7,       t8,       0
3660    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
3661
3662    li.d          t1,       1024*1+64
3663    add.d         t0,       sp,       t1
3664.rept 4
3665    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3666            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3667    addi.d        t0,       t0,       256
3668.endr
3669
3670.DCT_DCT_EOB_64x64_END:
3671
3672.macro dct64x64_core2_lsx in0, in1, rect2
3673    addi.d        t2,       sp,       64+\in0
3674    addi.d        t7,       sp,       64+\in0
3675    li.w          t4,       64*32*2+64
3676    add.d         t3,       sp,       t4
3677    addi.d        t6,       t3,       512
3678    add.d         t5,       t6,       zero
3679
3680    addi.d        t2,       t2,       1024
3681    addi.d        t2,       t2,       1024
3682    dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2
3683
3684    la.local      t0,       idct64_coeffs
3685    addi.d        t2,       sp,       64+64*2+\in0
3686    addi.d        t4,       t2,       256*7
3687    addi.d        t4,       t4,       256
3688
3689    vld           vr0,      t2,       256*0        // in1
3690    vld           vr1,      t4,       256*7        // in31
3691    vld           vr2,      t4,       256*0        // in17
3692    vld           vr3,      t2,       256*7        // in15
3693    dct64_step1_lsx
3694
3695    addi.d        t0,       t0,       48
3696    addi.d        t6,       t6,       128
3697    vld           vr0,      t2,       256*3        // in7
3698    vld           vr1,      t4,       256*4        // in25
3699    vld           vr2,      t4,       256*3        // in23
3700    vld           vr3,      t2,       256*4        // in9
3701    dct64_step1_lsx
3702
3703    addi.d        t0,        t0,       48
3704    addi.d        t6,        t6,       128
3705    vld           vr0,       t2,       256*2       // in5
3706    vld           vr1,       t4,       256*5       // in27
3707    vld           vr2,       t4,       256*2       // in21
3708    vld           vr3,       t2,       256*5       // in11
3709    dct64_step1_lsx
3710
3711    addi.d        t0,        t0,       48
3712    addi.d        t6,        t6,       128
3713    vld           vr0,       t2,       256*1       // in3
3714    vld           vr1,       t4,       256*6       // in29
3715    vld           vr2,       t4,       256*1       // in19
3716    vld           vr3,       t2,       256*6       // in13
3717    dct64_step1_lsx
3718
3719    la.local      t0,       idct_coeffs
3720    addi.d        t4,       t5,       16*7
3721    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
3722    dct64_step2_lsx
3723
3724    addi.d        t5,       t5,       16
3725    addi.d        t4,       t4,       -16
3726    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
3727    dct64_step2_lsx
3728
3729    addi.d        t5,       t5,       16
3730    addi.d        t4,       t4,       -16
3731    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
3732    dct64_step2_lsx
3733
3734    addi.d        t5,       t5,       16
3735    addi.d        t4,       t4,       -16
3736    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
3737    dct64_step2_lsx
3738
3739    li.w          t4,       64*32*2+64+512
3740    add.d         t5,       t4,       sp
3741    addi.d        t4,       t5,       16*7
3742    addi.d        a0,       a0,       \in1
3743    // 0 - 7, 56 -63
3744    dct64_step3_lsx
3745    li.w          t8,       0
3746    mul.w         t0,       t8,       a1
3747    add.d         t0,       a0,       t0
3748    alsl.d        t6,       a1,       t0,      1
3749    addi.d        t1,       t0,       0
3750    add.d         t2,       t0,       a1
3751    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
3752    li.w          t8,       56
3753    mul.w         t0,       t8,       a1
3754    add.d         t0,       a0,       t0
3755    alsl.d        t6,       a1,       t0,      1
3756    addi.d        t1,       t0,       0
3757    add.d         t2,       t0,       a1
3758    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3759
3760    // 8 - 15, 48 - 55
3761    addi.d        t3,       t3,       128
3762    addi.d        t4,       t4,       -16*8
3763    addi.d        t5,       t5,       -16*8
3764    dct64_step3_lsx
3765    li.w          t8,       8
3766    mul.w         t0,       t8,       a1
3767    add.d         t0,       t0,       a0
3768    alsl.d        t6,       a1,       t0,     1
3769    addi.d        t1,       t0,       0
3770    add.d         t2,       t0,       a1
3771    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
3772    li.w          t8,       48
3773    mul.w         t0,       t8,       a1
3774    add.d         t0,       t0,       a0
3775    alsl.d        t6,       a1,       t0,     1
3776    addi.d        t1,       t0,       0
3777    add.d         t2,       t0,       a1
3778    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3779
3780    // 16 - 23, 40 - 47
3781    addi.d        t3,       t3,       128
3782    addi.d        t4,       t4,       -16*8
3783    addi.d        t5,       t5,       -16*8
3784    dct64_step3_lsx
3785    li.w          t8,       16
3786    mul.w         t0,       t8,       a1
3787    add.d         t0,       t0,       a0
3788    alsl.d        t6,       a1,       t0,     1
3789    addi.d        t1,       t0,       0
3790    add.d         t2,       t0,       a1
3791    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
3792    li.w          t8,       40
3793    mul.w         t0,       t8,       a1
3794    add.d         t0,       t0,       a0
3795    alsl.d        t6,       a1,       t0,     1
3796    addi.d        t1,       t0,       0
3797    add.d         t2,       t0,       a1
3798    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3799
3800    // 24 - 31, 32 - 39
3801    addi.d        t3,       t3,       128
3802    addi.d        t4,       t4,       -16*8
3803    addi.d        t5,       t5,       -16*8
3804    dct64_step3_lsx
3805    li.w          t8,       24
3806    mul.w         t0,       t8,       a1
3807    add.d         t0,       t0,       a0
3808    alsl.d        t6,       a1,       t0,     1
3809    addi.d        t1,       t0,       0
3810    add.d         t2,       t0,       a1
3811    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
3812    li.w          t8,       32
3813    mul.w         t0,       t8,       a1
3814    add.d         t0,       t0,       a0
3815    alsl.d        t6,       a1,       t0,     1
3816    addi.d        t1,       t0,       0
3817    add.d         t2,       t0,       a1
3818    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
3819.endm
3820    dct64x64_core2_lsx 16*0, 0, no_rect2
3821    dct64x64_core2_lsx 16*1, 8, no_rect2
3822    dct64x64_core2_lsx 16*2, 8, no_rect2
3823    dct64x64_core2_lsx 16*3, 8, no_rect2
3824    dct64x64_core2_lsx 16*4, 8, no_rect2
3825    dct64x64_core2_lsx 16*5, 8, no_rect2
3826    dct64x64_core2_lsx 16*6, 8, no_rect2
3827    dct64x64_core2_lsx 16*7, 8, no_rect2
3828
3829    free_space 64*32*2+512+512
3830.DCT_DCT_64X64_END:
3831endfunc
3832
3833function inv_txfm_add_dct_dct_64x32_8bpc_lsx
3834    bnez          a3,       .NO_HAS_DCONLY_64x32
3835
3836    idct_dc_w64 64, 32, 1
3837
3838    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
3839
3840    li.w          t3,       31
3841.loop31:
3842    add.d         a0,       a0,       a1
3843    vld           vr10,     a0,       0
3844    vld           vr11,     a0,       16
3845    vld           vr12,     a0,       32
3846    vld           vr13,     a0,       48
3847    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
3848    addi.d        t3,       t3,       -1
3849    blt           zero,     t3,       .loop31
3850    b             .DCT_DCT_64X32_END
3851.NO_HAS_DCONLY_64x32:
3852    malloc_space  64*32*2+512+512
3853
3854    la.local      t8,       eob_32x32
3855    addi.d        t2,       a2,       0
3856    addi.d        t7,       sp,       64
3857    addi.d        t7,       t7,       0
3858    addi.d        a4,       a2,       64
3859.DCT_DCT_EOB_64x32:
3860    ld.h          a5,       t8,       0
3861    addi.d        t8,       t8,       2
3862    dct64x64_core1_lsx 1, rect2_lsx
3863    addi.d        t2,       t2,       16
3864    addi.d        t7,       t7,       128*8
3865    addi.d        a4,       a4,       16
3866    bge           a3,       a5,       .DCT_DCT_EOB_64x32
3867
3868    la.local      t8,       eob_32x32
3869    vxor.v        vr31,     vr31,     vr31
3870
3871    ld.h          t7,       t8,       4
3872    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
3873    li.d          t1,       1024*3+64
3874    add.d         t0,       sp,       t1
3875.rept 4
3876    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3877            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3878    addi.d t0, t0, 256
3879.endr
3880
3881    ld.h          t7,       t8,       2
3882    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
3883    li.d          t1,       1024*2+64
3884    add.d         t0,       sp,       t1
3885.rept 4
3886    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3887            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3888    addi.d        t0,       t0,       256
3889.endr
3890
3891    ld.h          t7,       t8,       0
3892    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
3893    li.d          t1,       1024*1+64
3894    add.d         t0,       sp,       t1
3895.rept 4
3896    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
3897            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
3898    addi.d        t0,       t0,       256
3899.endr
3900
3901.DCT_DCT_EOB_64x32_END:
3902    addi.d        t2,       sp,       64
3903    li.w          t4,       64*32*2+64
3904    add.d         t3,       sp,       t4
3905    addi.d        t5,       sp,       64
3906    addi.d        t5,       t5,       1024
3907    addi.d        t5,       t5,       1024
3908.rept 8
3909    vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3910
3911    addi.d        t4,       t2,       1024
3912    addi.d        t4,       t4,       1024
3913
3914    vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
3915
3916    inv_dct16_lsx no_rect2
3917
3918    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
3919            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
3920
3921    addi.d        t4,       t2,       128
3922    vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3923
3924    addi.d        t4,       t4,       1024
3925    addi.d        t4,       t4,       1024
3926
3927    vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
3928
3929    dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4
3930
3931    addi.d        t2,       t2,       16
3932    addi.d        t5,       t5,       16
3933    addi.d        t1,       t1,       16
3934.endr
3935    addi.d        t2,       sp,       64
3936    li.w          t3,       32
3937.loop32:
3938    vld           vr10,     a0,       0
3939    vld           vr11,     a0,       16
3940    vld           vr12,     a0,       32
3941    vld           vr13,     a0,       48
3942    vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
3943    DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
3944    add.d         a0,       a0,       a1
3945    addi.d        t2,       t2,       128
3946    addi.d        t3,       t3,       -1
3947    blt           zero,     t3,       .loop32
3948
3949    free_space  64*32*2+512+512
3950.DCT_DCT_64X32_END:
3951endfunc
3952
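// Adds two batches of four coefficient vectors from the column buffer at t3
// to the destination, then resets t3 to sp + \in0 for the next 8x32 chunk.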
3953.macro VLD_DST_ADD_W8_H32 in0
3954    vld           vr4,      t3,       0
3955    vld           vr5,      t3,       16
3956    vld           vr6,      t3,       32
3957    vld           vr7,      t3,       48
3958    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
3959    addi.d        t3,       t3,       64
3960    add.d         a0,       a1,       a0
3961    alsl.d        t2,       a1,       t2,     2
3962    vld           vr4,      t3,       0
3963    vld           vr5,      t3,       16
3964    vld           vr6,      t3,       32
3965    vld           vr7,      t3,       48
3966    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
3967    addi.d        t3,       sp,       \in0
3968    add.d         a0,       a1,       a0
3969    alsl.d        t2,       a1,       t2,     2
3970.endm
3971
3972function inv_txfm_add_dct_dct_8x32_8bpc_lsx
3973    bnez          a3,       .NO_HAS_DCONLY_8x32
3974
3975    idct_dc 8, 32, 2
3976
3977    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
3978.rept 7
3979    add.d         a0,       a1,       a0
3980    alsl.d        t2,       a1,       a0,     1
3981
3982    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
3983.endr
3984    b             .DCT_DCT_8X32_END
3985.NO_HAS_DCONLY_8x32:
3986    malloc_space 512
3987
3988    la.local      t8,       eob_8x32
3989    addi.d        t3,       sp,       64
3990    addi.d        t2,       a2,       0
3991.DCT_DCT_EOB_8x32:
3992    ld.h          t7,       t8,       0
3993    addi.d        t8,       t8,       2
3994
3995    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
3996
3997    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
3998
3999.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
4000    vsrari.h      \i,       \i,       2
4001.endr
4002
4003    vxor.v        vr31,     vr31,     vr31
4004    vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4005
4006    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4007                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4008                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
4009
4010    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
4011
4012    addi.d        a2,       a2,       16
4013    addi.d        t3,       t3,       128
4014    bge           a3,       t7,       .DCT_DCT_EOB_8x32
4015
4016    la.local      t8,       eob_8x32
4017    vxor.v        vr31,     vr31,     vr31
4018    ld.h          t7,       t8,       4
4019    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
4020    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4021
4022    ld.h          t7,       t8,       2
4023    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
4024    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4025
4026    ld.h          t7,       t8,       0
4027    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
4028    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4029.DCT_DCT_EOB_8x32_END:
4030    addi.d        t2,       sp,       64
4031    addi.d        t3,       sp,       64
4032
4033    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4034            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4035
4036    inv_dct16_lsx .8h
4037
4038    vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4039            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4040
4041    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4042            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
4043
4044    dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4
4045
4046    alsl.d        t2,       a1,       a0,     1
4047    addi.d        t3,       sp,       64
4048
4049    VLD_DST_ADD_W8_H32 320
4050    VLD_DST_ADD_W8_H32 448
4051    VLD_DST_ADD_W8_H32 192
4052    VLD_DST_ADD_W8_H32 0
4053
4054    free_space 512
4055.DCT_DCT_8X32_END:
4056endfunc
4057
4058function inv_txfm_add_identity_identity_8x32_8bpc_lsx
4059    la.local      t7,       eob_8x32
4060    alsl.d        t2,       a1,       a0,     1
4061
4062.IDENTITY_IDENTITY_EOB_8x32:
4063    ld.h          t6,       t7,       0
4064    addi.d        t7,       t7,       2
4065    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
4066
4067    vxor.v        vr23,     vr23,     vr23
4068    vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23
4069
4070.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
4071    vsrari.h       \i,       \i,       1
4072.endr
4073
4074    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4075                   vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
4076                   vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4077
4078.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
4079    vsrari.h       \i,       \i,       2
4080.endr
4081    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
4082    add.d         a0,       a1,       a0
4083    alsl.d        t2,       a1,       a0,     1
4084
4085    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
4086    add.d         a0,       a1,       a0
4087    alsl.d        t2,       a1,       a0,     1
4088
4089    addi.d        a2,       a2,       16
4090    bge           a3,       t6,       .IDENTITY_IDENTITY_EOB_8x32
4091endfunc
4092
4093.macro def_fn_16x4_base txfm
4094functionl inv_txfm_\txfm\()add_16x4_lsx
4095    vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
4096
4097.ifc \txfm, identity_
4098    li.w          t0,       1697
4099    vreplgr2vr.w  vr20,     t0
4100.irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
4101    inv_identity16_lsx \i, vr20, \i, \i, .8h
4102.endr
4103
4104    vilvh.d       vr1,      vr0,      vr0
4105    vilvh.d       vr3,      vr2,      vr2
4106    vilvh.d       vr5,      vr4,      vr4
4107    vilvh.d       vr7,      vr6,      vr6
4108    vilvh.d       vr9,      vr8,      vr8
4109    vilvh.d       vr11,     vr10,     vr10
4110    vilvh.d       vr13,     vr12,     vr12
4111    vilvh.d       vr15,     vr14,     vr14
4112.else
4113    vilvh.d       vr1,      vr0,      vr0
4114    vilvh.d       vr3,      vr2,      vr2
4115    vilvh.d       vr5,      vr4,      vr4
4116    vilvh.d       vr7,      vr6,      vr6
4117    vilvh.d       vr9,      vr8,      vr8
4118    vilvh.d       vr11,     vr10,     vr10
4119    vilvh.d       vr13,     vr12,     vr12
4120    vilvh.d       vr15,     vr14,     vr14
4121
4122    move          t6,       ra
4123    jirl          ra,       t7,       0
4124    move          ra,       t6
4125.endif
4126
4127    vxor.v        vr23,     vr23,     vr23
4128    vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23
4129
4130    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
4131                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21
4132
4133    LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \
4134                       vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21
4135
4136    vsrari.h      vr0,      vr0,      1
4137    vsrari.h      vr1,      vr1,      1
4138    vsrari.h      vr2,      vr2,      1
4139    vsrari.h      vr3,      vr3,      1
4140    move          t6,       ra
4141    jirl          ra,       t8,       0
4142    move          ra,       t6
4143
4144    vsrari.h      vr8,      vr0,      4
4145    vsrari.h      vr9,      vr1,      4
4146    vsrari.h      vr10,     vr2,      4
4147    vsrari.h      vr11,     vr3,      4
4148    vsrari.h      vr0,      vr4,      1
4149    vsrari.h      vr1,      vr5,      1
4150    vsrari.h      vr2,      vr6,      1
4151    vsrari.h      vr3,      vr7,      1
4152
4153    move          t6,       ra
4154    jirl          ra,       t8,       0
4155    move          ra,       t6
4156
4157    vsrari.h      vr16,     vr0,      4
4158    vsrari.h      vr17,     vr1,      4
4159    vsrari.h      vr18,     vr2,      4
4160    vsrari.h      vr19,     vr3,      4
4161
4162    alsl.d        t2,       a1,       a0,    1
4163    VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19
4164endfuncl
4165.endm
4166
4167def_fn_16x4_base identity_
4168def_fn_16x4_base
4169
4170.macro fn_16x4 txfm1, txfm2
4171function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx
4172.ifc \txfm1\()_\txfm2, dct_dct
4173    bnez          a3,       .NO_HAS_DCONLY_16x4
4174
4175    idct_dc 16, 4, 1
4176
4177    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
4178                vr20, vr20, vr20, vr20, vr20
4179    b             .\txfm1\()_\txfm2\()_16x4_END
4180.NO_HAS_DCONLY_16x4:
4181.endif
4182
4183.ifnc \txfm1, identity
4184    la.local     t7,    inv_\txfm1\()_4h_x16_lsx
4185.endif
4186    la.local     t8,    inv_\txfm2\()_8h_x4_lsx
4187
4188.ifc \txfm1, identity
4189    b            inv_txfm_identity_add_16x4_lsx
4190.else
4191    b            inv_txfm_add_16x4_lsx
4192.endif
4193.\txfm1\()_\txfm2\()_16x4_END:
4194endfunc
4195.endm
4196
4197fn_16x4 dct, dct
4198fn_16x4 identity, identity
4199fn_16x4 adst, dct
4200
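// Reload a 16x8 block of second-pass output from the two 512-byte stack
// halves addressed by t3 and t5, round it by 4, accumulate it into the
// destination, then rewind t3/t5 to sp+\in0 and sp+\in0+512 for the next
// group of rows.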
4201.macro VLD_DST_ADD_W16_H32 in0
4202    vld           vr14,     t3,       0
4203    vld           vr15,     t3,       16
4204    vld           vr16,     t3,       32
4205    vld           vr17,     t3,       48
4206    vld           vr18,     t5,       0
4207    vld           vr19,     t5,       16
4208    vld           vr20,     t5,       32
4209    vld           vr21,     t5,       48
4210    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
4211                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
4212    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
4213    alsl.d        a0,       a1,       a0,    2
4214    alsl.d        t2,       a1,       t2,    2
4215    addi.d        t3,       t3,       64
4216    addi.d        t5,       t5,       64
4217    vld           vr14,     t3,       0
4218    vld           vr15,     t3,       16
4219    vld           vr16,     t3,       32
4220    vld           vr17,     t3,       48
4221    vld           vr18,     t5,       0
4222    vld           vr19,     t5,       16
4223    vld           vr20,     t5,       32
4224    vld           vr21,     t5,       48
4225    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
4226                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
4227    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
4228    alsl.d        a0,       a1,       a0,    2
4229    alsl.d        t2,       a1,       t2,    2
4230    addi.d        t3,       sp,       \in0
4231    addi.d        t5,       sp,       \in0+512
4232.endm
4233
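// dct_dct 16x32: when a3 (eob) is 0 only the dc-only path runs.  Otherwise
// the first pass applies inv_dct16_lsx to rect2-scaled (x2896 >> 12) input
// blocks, storing >>1-rounded results into a 1024-byte stack buffer; the
// eob_16x32 table both ends this loop early and selects which buffer rows
// still need to be zeroed.  The second pass computes the 32-point DCT on each
// 512-byte half (inv_dct16_lsx plus dct_8x32_core_lsx) before
// VLD_DST_ADD_W16_H32 adds the result to the destination.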
4234function inv_txfm_add_dct_dct_16x32_8bpc_lsx
4235    bnez          a3,       .NO_HAS_DCONLY_16x32
4236
4237    idct_dc 16, 32, 1
4238
4239    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
4240                    vr20, vr20, vr20, vr20, vr20
4241.rept 7
4242    alsl.d        a0,       a1,       a0,     2
4243    alsl.d        t2,       a1,       a0,     1
4244
4245    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
4246.endr
4247    b             .DCT_DCT_16x32_END
4248.NO_HAS_DCONLY_16x32:
4249    malloc_space 512+512
4250
4251    addi.d        t3,       sp,       64
4252    la.local      t8,       eob_16x32
4253
4254.DCT_DCT_EOB_16x32:
4255    ld.h          t7,       t8,       0
4256    addi.d        t8,       t8,       2
4257    vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4258            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4259
4260    vxor.v        vr31,     vr31,     vr31
4261.irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960
4262    vst           vr31,     a2,       \i
4263.endr
4264
4265    li.w          t0,       2896
4266    vreplgr2vr.w  vr23,     t0
4267.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4268     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4269    rect2_lsx   \i, vr23, \i
4270.endr
4271
4272    inv_dct16_lsx .8h
4273
4274    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4275                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4276                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
4277
4278    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
4279                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
4280                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
4281
4282.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4283    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4284    vsrari.h       \i,       \i,       1
4285.endr
4286
4287    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
4288    vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4289
4290    addi.d        a2,       a2,       16
4291    addi.d        t3,       t3,       128
4292    bge           a3,       t7,       .DCT_DCT_EOB_16x32
4293
4294    la.local      t8,       eob_16x32
4295    vxor.v        vr31,     vr31,     vr31
4296
4297    ld.h          t7,       t8,       4
4298    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
4299    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4300    vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4301
4302    ld.h          t7,       t8,       2
4303    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
4304    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4305    vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4306
4307    ld.h          t7,       t8,       0
4308    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
4309    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4310    vst_x8 sp, 64+512+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
4311
4312.DCT_DCT_EOB_16x32_END:
4313    addi.d      t7,   sp,    64
4314.rept 2
4315    vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4316            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4317
4318    inv_dct16_lsx .8h
4319
4320    vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4321            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
4322
4323    vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
4324            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
4325
4326    dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, ,
4327
4328    addi.d        t7,       t7,       512
4329.endr
4330    alsl.d        t2,       a1,       a0,    1
4331    addi.d        t3,       sp,       64
4332    addi.d        t5,       sp,       512+64
4333
4334    VLD_DST_ADD_W16_H32 320
4335    VLD_DST_ADD_W16_H32 448
4336    VLD_DST_ADD_W16_H32 192
4337    VLD_DST_ADD_W16_H32 0
4338
4339    free_space 512+512
4340.DCT_DCT_16x32_END:
4341endfunc
4342
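// Widening multiply-accumulate: out0 = in0*in2 + in1*in3 on the even 16-bit
// lanes and out1 the same on the odd lanes, both as 32-bit results.  This is
// the rotation primitive used by inv_adst16_lasx below; callers re-interleave
// the even/odd halves with xvilvl.w/xvilvh.w and narrow with xvssrarni.h.w.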
4343.macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
4344    xvmulwev.w.h   \out0,    \in0,     \in2
4345    xvmulwod.w.h   \out1,    \in0,     \in2
4346    xvmaddwev.w.h  \out0,    \in1,     \in3
4347    xvmaddwod.w.h  \out1,    \in1,     \in3
4348.endm
4349
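// Rounding right shift of sixteen .h vectors by \shift.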
4350.macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
4351                     in11, in12, in13, in14, in15, out0, out1, out2, out3, \
4352                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
4353                     out13, out14, out15, shift
4354    xvsrari.h  \out0,       \in0,     \shift
4355    xvsrari.h  \out1,       \in1,     \shift
4356    xvsrari.h  \out2,       \in2,     \shift
4357    xvsrari.h  \out3,       \in3,     \shift
4358    xvsrari.h  \out4,       \in4,     \shift
4359    xvsrari.h  \out5,       \in5,     \shift
4360    xvsrari.h  \out6,       \in6,     \shift
4361    xvsrari.h  \out7,       \in7,     \shift
4362    xvsrari.h  \out8,       \in8,     \shift
4363    xvsrari.h  \out9,       \in9,     \shift
4364    xvsrari.h  \out10,      \in10,    \shift
4365    xvsrari.h  \out11,      \in11,    \shift
4366    xvsrari.h  \out12,      \in12,    \shift
4367    xvsrari.h  \out13,      \in13,    \shift
4368    xvsrari.h  \out14,      \in14,    \shift
4369    xvsrari.h  \out15,      \in15,    \shift
4370.endm
4371
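// Recombine 128-bit halves of two register pairs: out0/out1 keep the low
// halves of in0/in1 and take the low halves of in2/in3 as their high halves,
// while out2/out3 are built from the remaining high halves.  tmp0/tmp1 save
// in0/in1 first because the outputs may alias the inputs.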
4372.macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
4373    xvor.v      \tmp0,      \in0,     \in0
4374    xvor.v      \tmp1,      \in1,     \in1
4375    xvpermi.q   \out0,      \in2,     0x02
4376    xvpermi.q   \out1,      \in3,     0x02
4377    xvpermi.q   \out2,      \tmp0,    0x31
4378    xvpermi.q   \out3,      \tmp1,    0x31
4379.endm
4380
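// Add four rows of 16-bit residuals (in4-in7) to four rows of destination
// pixels (in0-in3), saturate back to 8 bit and store them at a0, a0+a1, t2
// and t2+a1.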
4381.macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7
4382    vext2xv.hu.bu xr0,      \in0
4383    vext2xv.hu.bu xr1,      \in1
4384    vext2xv.hu.bu xr2,      \in2
4385    vext2xv.hu.bu xr3,      \in3
4386    xvadd.h       xr0,      xr0,      \in4
4387    xvadd.h       xr1,      xr1,      \in5
4388    xvadd.h       xr2,      xr2,      \in6
4389    xvadd.h       xr3,      xr3,      \in7
4390    xvssrani.bu.h xr1,      xr0,      0
4391    xvssrani.bu.h xr3,      xr2,      0
4392    xvpermi.d     xr0,      xr1,      0b11011000
4393    xvpermi.d     xr2,      xr3,      0b11011000
4394    xvpermi.d     xr1,      xr0,      0b00001110
4395    xvpermi.d     xr3,      xr2,      0b00001110
4396    vst           vr0,      a0,       0
4397    vstx          vr1,      a0,       a1
4398    vst           vr2,      t2,       0
4399    vstx          vr3,      t2,       a1
4400.endm
4401
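// Load four destination rows and pass them to DST_ADD_W16_LASX together with
// the residuals in \in0-\in3.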
4402.macro XVLD_DST_ADD_W16 in0, in1, in2, in3
4403    vld           vr0,      a0,       0
4404    vldx          vr1,      a0,       a1
4405    vld           vr2,      t2,       0
4406    vldx          vr3,      t2,       a1
4407    DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3
4408.endm
4409
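// 16-point inverse ADST over sixteen .h lanes held in xr0-xr15.  Each
// rotation uses coefficients from iadst16_coeffs_h or idct_coeffs_h via
// xvmulev_xvmaddod_lasx, an even/odd interleave and a >>12 narrowing.  The
// results stay in the registers noted by the out[n] comments; the
// odd-numbered outputs are produced negated before the final narrowing.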
4410.macro inv_adst16_lasx
4411    la.local      t0,       iadst16_coeffs_h
4412
4413    xvldrepl.h    xr20,     t0,       0        // 4091
4414    xvldrepl.h    xr21,     t0,       2        // 201
4415    xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18
4416    xvneg.h       xr20,     xr20
4417    xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19
4418    xvilvl.w      xr15,     xr18,     xr16
4419    xvilvl.w      xr0,      xr19,     xr17
4420    xvilvh.w      xr18,     xr18,     xr16
4421    xvilvh.w      xr19,     xr19,     xr17
4422    xvssrarni.h.w xr18,     xr15,     12       // t0
4423    xvssrarni.h.w xr19,     xr0,      12       // t1
4424
4425    xvldrepl.h    xr20,     t0,       4        // 3973
4426    xvldrepl.h    xr21,     t0,       6        // 995
4427    xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0
4428    xvneg.h       xr20,     xr20
4429    xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15
4430    xvilvl.w      xr13,     xr0,      xr16
4431    xvilvl.w      xr2,      xr15,     xr17
4432    xvilvh.w      xr0,      xr0,      xr16
4433    xvilvh.w      xr15,     xr15,     xr17
4434    xvssrarni.h.w xr0,      xr13,     12       // t2
4435    xvssrarni.h.w xr15,     xr2,      12       // t3
4436
4437    xvldrepl.h    xr20,     t0,        8       // 3703
4438    xvldrepl.h    xr21,     t0,        10      // 1751
4439    xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2
4440    xvneg.h       xr20,     xr20
4441    xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13
4442    xvilvl.w      xr11,     xr2,       xr16
4443    xvilvl.w      xr4,      xr13,      xr17
4444    xvilvh.w      xr2,      xr2,       xr16
4445    xvilvh.w      xr13,     xr13,      xr17
4446    xvssrarni.h.w xr2,      xr11,      12       // t4
4447    xvssrarni.h.w xr13,     xr4,       12       // t5
4448
4449    xvldrepl.h    xr20,     t0,        12       // 3290 -> 1645
4450    xvldrepl.h    xr21,     t0,        14       // 2440 -> 1220
4451    xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4
4452    xvneg.h       xr20,     xr20
4453    xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11
4454    xvilvl.w      xr9,      xr4,       xr16
4455    xvilvl.w      xr6,      xr11,      xr17
4456    xvilvh.w      xr4,      xr4,       xr16
4457    xvilvh.w      xr11,     xr11,      xr17
4458    xvssrarni.h.w xr4,      xr9,       12       // t6
4459    xvssrarni.h.w xr11,     xr6,       12       // t7
4460
4461    xvldrepl.h    xr20,     t0,        16       // 2751
4462    xvldrepl.h    xr21,     t0,        18       // 3035
4463    xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6
4464    xvneg.h       xr20,     xr20
4465    xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9
4466    xvilvl.w      xr7,      xr6,       xr16
4467    xvilvl.w      xr8,      xr9,       xr17
4468    xvilvh.w      xr6,      xr6,       xr16
4469    xvilvh.w      xr9,      xr9,       xr17
4470    xvssrarni.h.w xr6,      xr7,       12       // t8
4471    xvssrarni.h.w xr9,      xr8,       12       // t9
4472
4473    xvldrepl.h    xr20,     t0,        20       // 2106
4474    xvldrepl.h    xr21,     t0,        22       // 3513
4475    xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7
4476    xvneg.h       xr20,     xr20
4477    xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8
4478    xvilvl.w      xr5,      xr7,       xr16
4479    xvilvl.w      xr10,     xr8,       xr17
4480    xvilvh.w      xr7,      xr7,       xr16
4481    xvilvh.w      xr8,      xr8,       xr17
4482    xvssrarni.h.w xr7,      xr5,       12       // t10
4483    xvssrarni.h.w xr8,      xr10,      12       // t11
4484
4485    xvldrepl.h    xr20,     t0,        24       // 1380
4486    xvldrepl.h    xr21,     t0,        26       // 3857
4487    xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5
4488    xvneg.h       xr20,     xr20
4489    xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10
4490    xvilvl.w      xr3,      xr5,       xr16
4491    xvilvl.w      xr12,     xr10,      xr17
4492    xvilvh.w      xr5,      xr5,       xr16
4493    xvilvh.w      xr10,     xr10,      xr17
4494    xvssrarni.h.w xr5,      xr3,       12       // t12
4495    xvssrarni.h.w xr10,     xr12,      12       // t13
4496
4497    xvldrepl.h    xr20,     t0,        28       // 601
4498    xvldrepl.h    xr21,     t0,        30       // 4052
4499    xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3
4500    xvneg.h       xr20,     xr20
4501    xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12
4502    xvilvl.w      xr1,      xr3,       xr16
4503    xvilvl.w      xr14,     xr12,      xr17
4504    xvilvh.w      xr3,      xr3,       xr16
4505    xvilvh.w      xr12,     xr12,      xr17
4506    xvssrarni.h.w xr3,      xr1,       12       // t14
4507    xvssrarni.h.w xr12,     xr14,      12       // t15
4508
4509    xvsadd.h      xr1,      xr18,      xr6      // t0a
4510    xvssub.h      xr14,     xr18,      xr6      // t8a
4511    xvsadd.h      xr16,     xr19,      xr9      // t1a
4512    xvssub.h      xr17,     xr19,      xr9      // t9a
4513    xvsadd.h      xr6,      xr0,       xr7      // t2a
4514    xvssub.h      xr18,     xr0,       xr7      // t10a
4515    xvsadd.h      xr9,      xr15,      xr8      // t3a
4516    xvssub.h      xr19,     xr15,      xr8      // t11a
4517    xvsadd.h      xr0,      xr2,       xr5      // t4a
4518    xvssub.h      xr7,      xr2,       xr5      // t12a
4519    xvsadd.h      xr8,      xr13,      xr10     // t5a
4520    xvssub.h      xr15,     xr13,      xr10     // t13a
4521    xvsadd.h      xr2,      xr4,       xr3      // t6a
4522    xvssub.h      xr5,      xr4,       xr3      // t14a
4523    xvsadd.h      xr10,     xr11,      xr12     // t7a
4524    xvssub.h      xr13,     xr11,      xr12     // t15a
4525
4526    la.local      t0,       idct_coeffs_h
4527
4528    xvldrepl.h    xr20,     t0,        8        // 799
4529    xvldrepl.h    xr21,     t0,        10       // 4017
4530    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr3, xr11
4531    xvneg.h       xr21,     xr21
4532    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12
4533    xvilvl.w      xr14,     xr11,      xr3
4534    xvilvl.w      xr17,     xr12,      xr4
4535    xvilvh.w      xr11,     xr11,      xr3
4536    xvilvh.w      xr12,     xr12,      xr4
4537    xvssrarni.h.w xr11,     xr14,      12       // t8
4538    xvssrarni.h.w xr12,     xr17,      12       // t9
4539
4540    xvneg.h       xr21,     xr21
4541    xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14
4542    xvneg.h       xr20,     xr20
4543    xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17
4544    xvilvl.w      xr15,     xr14,      xr3
4545    xvilvl.w      xr7,      xr17,      xr4
4546    xvilvh.w      xr14,     xr14,      xr3
4547    xvilvh.w      xr17,     xr17,      xr4
4548    xvssrarni.h.w xr14,     xr15,      12       // t13
4549    xvssrarni.h.w xr17,     xr7,       12       // t12
4550
4551    xvldrepl.h    xr20,     t0,        12       // 3406
4552    xvldrepl.h    xr21,     t0,        14       // 2276
4553    xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7
4554    xvneg.h       xr21,     xr21
4555    xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15
4556    xvilvl.w      xr18,     xr7,       xr3
4557    xvilvl.w      xr19,     xr15,      xr4
4558    xvilvh.w      xr7,      xr7,       xr3
4559    xvilvh.w      xr15,     xr15,      xr4
4560    xvssrarni.h.w xr7,      xr18,      12       // t10
4561    xvssrarni.h.w xr15,     xr19,      12       // t11
4562
4563    xvneg.h       xr21,     xr21
4564    xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18
4565    xvneg.h       xr20,     xr20
4566    xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19
4567    xvilvl.w      xr13,     xr18,      xr3
4568    xvilvl.w      xr5,      xr19,      xr4
4569    xvilvh.w      xr18,     xr18,      xr3
4570    xvilvh.w      xr19,     xr19,      xr4
4571    xvssrarni.h.w xr18,     xr13,      12       // t15
4572    xvssrarni.h.w xr19,     xr5,       12       // t14
4573
4574    xvsadd.h      xr5,      xr1,       xr0      // t0
4575    xvssub.h      xr13,     xr1,       xr0      // t4
4576    xvsadd.h      xr3,      xr16,      xr8      // t1
4577    xvssub.h      xr4,      xr16,      xr8      // t5
4578    xvsadd.h      xr0,      xr6,       xr2      // t2
4579    xvssub.h      xr1,      xr6,       xr2      // t6
4580    xvsadd.h      xr8,      xr9,       xr10     // t3
4581    xvssub.h      xr16,     xr9,       xr10     // t7
4582    xvsadd.h      xr2,      xr11,      xr17     // t8a
4583    xvssub.h      xr6,      xr11,      xr17     // t12a
4584    xvsadd.h      xr9,      xr12,      xr14     // t9a
4585    xvssub.h      xr10,     xr12,      xr14     // t13a
4586    xvsadd.h      xr11,     xr7,       xr19     // t10a
4587    xvssub.h      xr17,     xr7,       xr19     // t14a
4588    xvsadd.h      xr12,     xr15,      xr18     // t11a
4589    xvssub.h      xr14,     xr15,      xr18     // t15a
4590
4591    la.local      t0,       idct_coeffs_h
4592
4593    xvldrepl.h    xr20,     t0,        4        // 1567
4594    xvldrepl.h    xr21,     t0,        6        // 3784
4595    xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18
4596    xvneg.h       xr21,     xr21
4597    xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19
4598    xvilvl.w      xr13,     xr18,      xr7
4599    xvilvl.w      xr4,      xr19,      xr15
4600    xvilvh.w      xr18,     xr18,      xr7
4601    xvilvh.w      xr19,     xr19,      xr15
4602    xvssrarni.h.w xr18,     xr13,      12       // t4a
4603    xvssrarni.h.w xr19,     xr4,       12       // t5a
4604
4605    xvneg.h       xr21,     xr21
4606    xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4
4607    xvneg.h       xr20,     xr20
4608    xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13
4609    xvilvl.w      xr16,     xr4,       xr7
4610    xvilvl.w      xr1,      xr13,      xr15
4611    xvilvh.w      xr4,      xr4,       xr7
4612    xvilvh.w      xr13,     xr13,      xr15
4613    xvssrarni.h.w xr4,      xr16,      12       // t7a
4614    xvssrarni.h.w xr13,     xr1,       12       // t6a
4615
4616    xvneg.h       xr20,     xr20
4617    xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1
4618    xvneg.h       xr21,     xr21
4619    xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16
4620    xvilvl.w      xr6,      xr1,       xr7
4621    xvilvl.w      xr10,     xr16,      xr15
4622    xvilvh.w      xr1,      xr1,       xr7
4623    xvilvh.w      xr16,     xr16,      xr15
4624    xvssrarni.h.w xr1,      xr6,       12       // t12
4625    xvssrarni.h.w xr16,     xr10,      12       // t13
4626
4627    xvneg.h       xr21,     xr21
4628    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6
4629    xvneg.h       xr20,     xr20
4630    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10
4631    xvilvl.w      xr14,     xr6,       xr7
4632    xvilvl.w      xr17,     xr10,      xr15
4633    xvilvh.w      xr6,      xr6,       xr7
4634    xvilvh.w      xr10,     xr10,      xr15
4635    xvssrarni.h.w xr6,      xr14,      12       // t15
4636    xvssrarni.h.w xr10,     xr17,      12       // t14
4637
4638    xvsadd.h       xr14,     xr5,       xr0      // out[0]
4639    xvssub.h       xr17,     xr5,       xr0      // t2a
4640    xvssub.h       xr7,      xr3,       xr8      // t3a
4641    xvsadd.h       xr15,     xr3,       xr8      // out[15]
4642    xvsllwil.w.h   xr22,     xr15,      0
4643    xvexth.w.h     xr15,     xr15
4644    xvneg.w        xr22,     xr22
4645    xvneg.w        xr15,     xr15
4646    xvssrarni.h.w  xr15,     xr22,      0        // out[15]
4647    xvssub.h       xr7,      xr3,       xr8      // t3a
4648
4649    xvsadd.h       xr3,      xr19,      xr4      // out[12]
4650    xvssub.h       xr8,      xr19,      xr4      // t7
4651    xvssub.h       xr0,      xr18,      xr13     // t6
4652    xvsadd.h       xr5,      xr18,      xr13     // out[3]
4653    xvsllwil.w.h   xr22,     xr5,       0
4654    xvexth.w.h     xr5,      xr5
4655    xvneg.w        xr22,     xr22
4656    xvneg.w        xr5,      xr5
4657    xvssrarni.h.w  xr5,      xr22,      0        // out[3]
4658
4659    xvsadd.h       xr13,     xr9,       xr12     // out[14]
4660    xvssub.h       xr19,     xr9,       xr12     // t11
4661    xvssub.h       xr4,      xr2,       xr11     // t10
4662    xvsadd.h       xr18,     xr2,       xr11     // out[1]
4663    xvsllwil.w.h   xr22,     xr18,      0
4664    xvexth.w.h     xr18,     xr18
4665    xvneg.w        xr22,     xr22
4666    xvneg.w        xr18,     xr18
4667    xvssrarni.h.w  xr18,     xr22,      0        // out[1]
4668
4669    xvsadd.h       xr2,      xr1,       xr10     // out[2]
4670    xvssub.h       xr11,     xr1,       xr10     // t14a
4671    xvssub.h       xr12,     xr16,      xr6      // t15a
4672    xvsadd.h       xr9,      xr16,      xr6      // out[13]
4673    xvsllwil.w.h   xr22,     xr9,       0
4674    xvexth.w.h     xr9,      xr9
4675    xvneg.w        xr22,     xr22
4676    xvneg.w        xr9,      xr9
4677    xvssrarni.h.w  xr9,      xr22,      0        // out[13]
4678
4679    xvldrepl.h     xr20,     t0,        0        // 2896
4680    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10
4681    xvneg.h        xr21,     xr20
4682    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1
4683    xvilvl.w       xr17,     xr10,      xr6
4684    xvilvl.w       xr7,      xr1,       xr16
4685    xvilvh.w       xr10,     xr10,      xr6
4686    xvilvh.w       xr1,      xr1,       xr16
4687    xvssrarni.h.w  xr1,      xr7,       12       // out[8]
4688    xvsrari.w      xr17,     xr17,      12
4689    xvsrari.w      xr10,     xr10,      12
4690    xvneg.w        xr17,     xr17
4691    xvneg.w        xr10,     xr10
4692    xvssrarni.h.w  xr10,     xr17,      0        // out[7]
4693
4694    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17
4695    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7
4696    xvilvl.w       xr0,      xr17,      xr16
4697    xvilvl.w       xr8,      xr7,       xr6
4698    xvilvh.w       xr17,     xr17,      xr16
4699    xvilvh.w       xr7,      xr7,       xr6
4700    xvssrarni.h.w  xr7,      xr8,       12       // out[4]
4701    xvsrari.w      xr0,      xr0,       12
4702    xvsrari.w      xr17,     xr17,      12
4703    xvneg.w        xr0,      xr0
4704    xvneg.w        xr17,     xr17
4705    xvssrarni.h.w xr17,      xr0,       0        // out[11]
4706
4707    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0
4708    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8
4709    xvilvl.w       xr4,      xr0,       xr16
4710    xvilvl.w       xr19,     xr8,       xr6
4711    xvilvh.w       xr0,      xr0,       xr16
4712    xvilvh.w       xr8,      xr8,       xr6
4713    xvssrarni.h.w  xr8,      xr19,      12       // out[6]
4714    xvsrari.w      xr4,      xr4,       12
4715    xvsrari.w      xr0,      xr0,       12
4716    xvneg.w        xr4,      xr4
4717    xvneg.w        xr0,      xr0
4718    xvssrarni.h.w  xr0,      xr4,       0        // out[9]
4719    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4
4720    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19
4721    xvilvl.w       xr11,     xr4,       xr6
4722    xvilvl.w       xr12,     xr19,      xr16
4723    xvilvh.w       xr4,      xr4,       xr6
4724    xvilvh.w       xr19,     xr19,      xr16
4725    xvssrarni.h.w  xr19,     xr12,      12       // out[10]
4726    xvsrari.w      xr11,     xr11,      12
4727    xvsrari.w      xr4,      xr4,       12
4728    xvneg.w        xr11,     xr11
4729    xvneg.w        xr4,      xr4
4730    xvssrarni.h.w  xr4,      xr11,      0        // out[5]
4731.endm
4732
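// adst_adst 16x16 (LASX): inv_adst16_lasx is used for both passes.  Between
// the passes the 16x16 block is transposed as 8x8 tiles (the two
// LASX_TRANSPOSE8x8_H calls plus xvpermi_q_x2) and rounded by 2; after the
// second pass the output is rounded by 4, the coefficient buffer at a2 is
// cleared and the result is added to the destination four rows at a time.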
4733function inv_txfm_add_adst_adst_16x16_8bpc_lasx
4734    PUSH_REG
4735    xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
4736             xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15
4737
4738    inv_adst16_lasx
4739
4740    LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \
4741                        xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
4742                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27
4743
4744    LASX_TRANSPOSE8x8_H xr1,  xr0,  xr19, xr17, xr3, xr9, xr13, xr15, \
4745                        xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
4746                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27
4747
4748    xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
4749                  xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
4750                  xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
4751                  xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2
4752
4753    xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21
4754    xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21
4755    xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21
4756    xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21
4757
4758    inv_adst16_lasx
4759
4760    xvsrari_h_x16 xr14, xr18, xr2,  xr5,  xr7,  xr4, xr8,  xr10, \
4761                  xr1,  xr0,  xr19, xr17, xr3,  xr9, xr13, xr15, \
4762                  xr14, xr18, xr11, xr5,  xr7,  xr4, xr8,  xr10, \
4763                  xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4
4764
4765    xvxor.v       xr23,     xr23,     xr23
4766.irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480
4767    xvst          xr23,     a2,       \i
4768.endr
4769    alsl.d        t2,       a1,       a0,    1
4770    XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5
4771    alsl.d        a0,       a1,       a0,    2
4772    alsl.d        t2,       a1,       a0,    1
4773    XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10
4774    alsl.d        a0,       a1,       a0,    2
4775    alsl.d        t2,       a1,       a0,    1
4776    XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17
4777    alsl.d        a0,       a1,       a0,    2
4778    alsl.d        t2,       a1,       a0,    1
4779    XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15
4780    POP_REG
4781endfunc
4782