xref: /aosp_15_r20/external/libdav1d/src/x86/itx16_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; Copyright © 2021, Matthias Dressel
4; All rights reserved.
5;
6; Redistribution and use in source and binary forms, with or without
7; modification, are permitted provided that the following conditions are met:
8;
9; 1. Redistributions of source code must retain the above copyright notice, this
10;    list of conditions and the following disclaimer.
11;
12; 2. Redistributions in binary form must reproduce the above copyright notice,
13;    this list of conditions and the following disclaimer in the documentation
14;    and/or other materials provided with the distribution.
15;
16; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27%include "config.asm"
28%include "ext/x86/x86inc.asm"
29
30%if ARCH_X86_64
31
; Read-only lookup tables; the section is requested with 32-byte alignment.
32SECTION_RODATA 32
; Dword table used as vpermd indices by the 4x4 code, which then shifts it
; right by 4 bits (psrld) and reuses the result as a pshufb byte mask.
33itx4_shuf:       dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
34                 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
; Dword index tables. idct4_12_shuf/idct4_12_shuf2 are used as vpermd indices
; by idct_4x4_internal_12bpc; the other *_12_shuf tables are presumably used
; the same way by the larger 12 bpc transforms - confirm at their use sites.
35idct4_12_shuf:   dd 0, 2, 4, 6, 1, 3, 5, 7
36idct4_12_shuf2:  dd 2, 0, 6, 4, 3, 1, 7, 5
37iadst8_12_shuf:  dd 0, 4, 1, 5, 2, 6, 3, 7
38idct16_12_shuf:  dd 0, 4, 1, 5, 3, 7, 2, 6
39iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5
; Word multipliers: four times +2048 followed by four times -2048.
40pw_2048_m2048:   dw  2048,  2048,  2048,  2048, -2048, -2048, -2048, -2048
; Byte shuffle masks (pshufb operands); each entry pair selects one word.
41idct4_shuf:   db  0,  1,  4,  5, 12, 13,  8,  9,  2,  3,  6,  7, 14, 15, 10, 11
42idct32_shuf:  db  0,  1,  8,  9,  4,  5, 12, 13,  2,  3, 10, 11,  6,  7, 14, 15
43
; COEF_PAIR a, b[, neg]
; Emits the four dwords a, a, b, b under the label pd_a_b and defines pd_a and
; pd_b as the addresses of the first and third dword, so that each value can
; be loaded on its own (vpbroadcastd) or as a 128-bit pair (vbroadcasti128).
; When the optional third argument is non-zero, two more dwords -b, -b are
; appended and pd_b_mb is defined as an alias of pd_b, i.e. a 128-bit load
; from pd_b_mb yields b, b, -b, -b.
44%macro COEF_PAIR 2-3 0
45pd_%1_%2: dd %1, %1, %2, %2
46%define pd_%1 (pd_%1_%2 + 4*0)
47%define pd_%2 (pd_%1_%2 + 4*2)
48%if %3
49dd -%2, -%2
50%define pd_%2_m%2 pd_%2
51%endif
52%endmacro
53
; Coefficient pairs. Each line creates pd_<a>_<b>, pd_<a> and pd_<b>.
; The two 2896 pairs pass 1 as third argument and therefore also get the
; negated tail (-b, -b) and the pd_<b>_m<b> alias; both of them define pd_2896.
54COEF_PAIR  201,  995
55COEF_PAIR  401, 1931
56COEF_PAIR  799, 3406
57COEF_PAIR 1380,  601
58COEF_PAIR 1751, 2440
59COEF_PAIR 2598, 1189
60COEF_PAIR 2751, 2106
61COEF_PAIR 2896, 1567, 1
62COEF_PAIR 2896, 3784, 1
63COEF_PAIR 3035, 3513
64COEF_PAIR 3166, 3920
65COEF_PAIR 3703, 3290
66COEF_PAIR 3857, 4052
67COEF_PAIR 4017, 2276
68COEF_PAIR 4076, 3612
69COEF_PAIR 4091, 3973
70
; Single-dword constants, meant to be splatted with vpbroadcastd.
; The trailing comments on some entries show how a combined rounding/bias
; value is composed.
71pd_8:      dd     8
72pd_m601:   dd  -601
73pd_m1189:  dd -1189
74pd_m1380:  dd -1380
75pd_m2106:  dd -2106
76pd_m2598:  dd -2598
77pd_m2751:  dd -2751
78pd_m3344:  dd -3344
79pd_1024:   dd  1024
80pd_1321:   dd  1321
81pd_1448:   dd  1448
82pd_1697:   dd  1697
83pd_2482:   dd  2482
84pd_3072:   dd  3072 ; 1024 + 2048
85pd_3803:   dd  3803
86pd_5119:   dd  5119 ; 1024 + 4096 - 1
87pd_5120:   dd  5120 ; 1024 + 4096
88pd_5793:   dd  5793
89pd_6144:   dd  6144 ; 2048 + 4096
90pd_17408:  dd 17408 ; 1024 + 16384
91
; Word pairs (one dword each), also loaded with vpbroadcastd.
; pixel_*_max is the upper clamp applied to output pixels.
92pixel_10bpc_max: times 2 dw 0x03ff
93pixel_12bpc_max: times 2 dw 0x0fff
; Bias used by the .dconly paths: it is added with signed saturation (paddsw)
; and removed again with an unsigned saturating subtract (psubusw).
; Note 0x7c00 = 0x7fff - 0x03ff and 0x7000 = 0x7fff - 0x0fff.
94dconly_10bpc:    times 2 dw 0x7c00
95dconly_12bpc:    times 2 dw 0x7000
; Signed 18-bit and 20-bit limits for intermediate dword values
; (operands of pmaxsd/pminsd).
96clip_18b_min:  dd -0x20000
97clip_18b_max:  dd  0x1ffff
98clip_20b_min:  dd -0x80000
99clip_20b_max:  dd  0x7ffff
100
; Dword multiplier table, 4 rows of 12 entries, declared through the `const`
; macro (from x86inc; presumably this makes the symbol visible to other
; object files - confirm). Not referenced by the 4x4/4x8 code; judging by the
; name it serves the 64-point IDCT.
101const idct64_mul_16bpc
102dd 4095,  101, 2967, -2824,  3745, 1660, 3822, -1474,   401,  4076,   799,  4017
103dd -700, 4036, 2359,  3349, -2191, 3461,  897,  3996, -2598, -3166, -4017,  -799
104dd 4065,  501, 3229, -2520,  3564, 2019, 3948, -1092,  1931,  3612,  3406,  2276
105dd -301, 4085, 2675,  3102, -1842, 3659, 1285,  3889, -1189, -3920, -2276, -3406
106
107cextern deint_shuf
108cextern idct64_mul
109cextern pw_1697x8
110cextern pw_1697x16
111cextern pw_1567_3784
112cextern pw_m1567_m3784
113cextern pw_m3784_1567
114cextern pw_2896_2896
115cextern pw_m2896_2896
116cextern pw_5
117cextern pw_2048
118cextern pw_4096
119cextern pw_8192
120cextern pw_16384
121cextern pw_2896x8
122cextern pd_2048
123
124cextern idct_4x8_internal_8bpc_avx2.main
125cextern idct_4x16_internal_8bpc_avx2.main
126cextern idct_8x8_internal_8bpc_avx2.main
127cextern idct_8x16_internal_8bpc_avx2.main
128cextern idct_16x4_internal_8bpc_avx2.main
129cextern idct_16x8_internal_8bpc_avx2.main
130cextern idct_16x16_internal_8bpc_avx2.main
131cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
132cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
133cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
134cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
135cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
136cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
137
138cextern iadst_4x4_internal_8bpc_avx2.main
139cextern iadst_4x8_internal_8bpc_avx2.main_pass2
140cextern iadst_4x16_internal_8bpc_avx2.main2
141cextern iadst_8x4_internal_8bpc_avx2.main
142cextern iadst_8x8_internal_8bpc_avx2.main_pass2
143cextern iadst_8x16_internal_8bpc_avx2.main
144cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
145cextern iadst_16x4_internal_8bpc_avx2.main
146cextern iadst_16x8_internal_8bpc_avx2.main
147cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
148cextern iadst_16x16_internal_8bpc_avx2.main
149cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
150
151SECTION .text
152
; m(x): mangled symbol name for function x, built by concatenating
; private_prefix, "_", x and SUFFIX. private_prefix, SUFFIX and mangle are
; not defined in this file (they presumably come from x86inc/config).
153%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
154
; WRAP_XMM <statement>
; Expands the given statement (an instruction or macro call, taken greedily
; via 1+) between INIT_XMM and INIT_YMM, i.e. with the m# names temporarily
; set up for xmm use, and switches back to ymm mode afterwards.
155%macro WRAP_XMM 1+
156    INIT_XMM cpuname
157    %1
158    INIT_YMM cpuname
159%endmacro
160
; IWHT4_1D_PACKED
; One 1-D pass of the 4-point inverse WHT on dword data packed two inputs per
; register. Takes no arguments; registers are fixed:
;   in:  m0 = in0 in2, m1 = in1 in3
;   out: m0 = ____ out0, m2 = out3 ____, m3 = t1 t3 (per the lane comments)
; Overwrites m0-m3.
161%macro IWHT4_1D_PACKED 0
162    ; m0 = in0 in2, m1 = in1 in3
163    psubd                m2, m0, m1 ; t2
164    paddd               xm0, xm1    ; t0
; vpermq duplicates/reorders 64-bit quarters so the halves line up below
165    vpermq               m2, m2, q3322
166    vpermq               m0, m0, q1100
167    vpermq               m1, m1, q3120
168    psubd                m3, m0, m2
169    psrad                m3, 1
170    psubd                m3, m1     ; t1 t3
171    psubd                m0, m3     ; ____ out0
172    paddd                m2, m3     ; out3 ____
173%endmacro
174
; inv_txfm_add_wht_wht_4x4_16bpc(dst, stride, c, eob, bdmax)
; 4x4 inverse WHT in both directions, added to the destination pixels.
; Reads four 16-byte rows of dword coefficients from cq, zeroes that buffer,
; shifts the inputs right by 2, runs IWHT4_1D_PACKED twice with an unpack
; based re-interleave in between, packs to words, adds the four 4-pixel dst
; rows (16-bit pixels) and clamps the sums to [0, bdmax].
; eob is named but not read in this function.
175INIT_YMM avx2
176cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
177    mova                xm0, [cq+16*0]
178    vinserti128          m0, [cq+16*2], 1
179    mova                xm1, [cq+16*1]
180    vinserti128          m1, [cq+16*3], 1
; m4 = 0: clears the coefficient buffer and later serves as the lower clamp
181    pxor                 m4, m4
182    mova          [cq+32*0], m4
183    mova          [cq+32*1], m4
; r6 = address of dst row 2
184    lea                  r6, [dstq+strideq*2]
185    psrad                m0, 2
186    psrad                m1, 2
187    IWHT4_1D_PACKED
; rearrange the first-pass outputs into the input layout of the second pass
188    punpckhdq            m0, m3
189    punpckldq            m3, m2
190    punpckhqdq           m1, m0, m3
191    punpcklqdq           m0, m3
192    IWHT4_1D_PACKED
193    vpblendd             m0, m2, 0x33
194    packssdw             m0, m3
195    vextracti128        xm2, m0, 1
196    punpckhdq           xm1, xm0, xm2 ; out2 out1
197    punpckldq           xm0, xm2      ; out3 out0
; load dst rows in the same order as the outputs: xm2 = row3 row0,
; xm3 = row2 row1
198    movq                xm2, [r6  +strideq*1]
199    movhps              xm2, [dstq+strideq*0]
200    movq                xm3, [r6  +strideq*0]
201    movhps              xm3, [dstq+strideq*1]
; xm5 = bdmax broadcast to every word (upper clamp)
202%ifidn bdmaxd, bdmaxm
203    movd                xm5, bdmaxd
204    vpbroadcastw        xm5, xm5
205%else   ; win64: load from stack
206    vpbroadcastw        xm5, bdmaxm
207%endif
208    paddsw              xm0, xm2
209    paddsw              xm1, xm3
210    pmaxsw              xm0, xm4
211    pmaxsw              xm1, xm4
212    pminsw              xm0, xm5
213    pminsw              xm1, xm5
214    movhps [dstq+strideq*0], xm0
215    movhps [dstq+strideq*1], xm1
216    movq   [r6  +strideq*0], xm1
217    movq   [r6  +strideq*1], xm0
218    RET
219
; ITX_MULSUB_2D: dword multiply/add/subtract of a register pair by a
; coefficient pair.
220; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
221; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
222; flags: 1 = packed, 2 = inv_dst2
223; skip round/shift if rnd is not a number
; Arguments are register indices (m%N). A coefficient argument below 32 is
; taken as the index of a register that already holds the coefficient;
; otherwise it is the suffix of a pd_* constant, loaded with vbroadcasti128
; when flag 1 is set and with vpbroadcastd otherwise.
; With flag 2, dst2 = rnd - src1 * coef2 - src2 * coef1 (shifted like the
; other result when rnd is a number); this path reads m%6 unconditionally.
; tmp1-tmp3 (%3-%5) are overwritten.
224%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
; tmp2 = src1 * coef2, tmp1 = src2 * coef2
225%if %8 < 32
226    pmulld              m%4, m%1, m%8
227    pmulld              m%3, m%2, m%8
228%else
229%if %9 & 1
230    vbroadcasti128      m%3, [pd_%8]
231%else
232    vpbroadcastd        m%3, [pd_%8]
233%endif
234    pmulld              m%4, m%1, m%3
235    pmulld              m%3, m%2
236%endif
; src1 *= coef1, src2 *= coef1 (in place)
237%if %7 < 32
238    pmulld              m%1, m%7
239    pmulld              m%2, m%7
240%else
241%if %9 & 1
242    vbroadcasti128      m%5, [pd_%7]
243%else
244    vpbroadcastd        m%5, [pd_%7]
245%endif
246    pmulld              m%1, m%5
247    pmulld              m%2, m%5
248%endif
; dst2 (left in m%2): inverted form with flag 2, plain sum otherwise
249%if %9 & 2
250    psubd               m%4, m%6, m%4
251    psubd               m%2, m%4, m%2
252%else
253%ifnum %6
254    paddd               m%4, m%6
255%endif
256    paddd               m%2, m%4
257%endif
; dst1 (left in m%1) = src1 * coef1 [+ rnd] - src2 * coef2
258%ifnum %6
259    paddd               m%1, m%6
260%endif
261    psubd               m%1, m%3
262%ifnum %6
263    psrad               m%2, 12
264    psrad               m%1, 12
265%endif
266%endmacro
267
; INV_TXFM_FN type1, type2, eob_offset, size[, bitdepth = 10]
; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_<bpc>bpc.
; It loads the address of the second transform's .pass2 label into tx2q and
; transfers control to the first transform's internal function (%%p1).
;  - dct_dct: jumps to %%p1 only when eob != 0; for eob == 0 execution falls
;    through into whatever the invoking macro emits next (the .dconly code).
;  - other pairs: optionally adds eob_offset to eobd, then emits `jmp %%p1`
;    zero or one times depending on the sign of (%%end - %%p1), so that no
;    jump is emitted when %%p1 directly follows this stub.
267%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth
268cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2
269    %define %%p1 m(i%1_%4_internal_%5bpc)
270    ; Jump to the 1st txfm function if we're not taking the fast path, which
271    ; in turn performs an indirect jump to the 2nd txfm function.
272    lea tx2q, [m(i%2_%4_internal_%5bpc).pass2]
273%ifidn %1_%2, dct_dct
274    test               eobd, eobd
275    jnz %%p1
276%else
277%if %3
278    add                eobd, %3
279%endif
280    ; jump to the 1st txfm function unless it's located directly after this
281    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
282ALIGN function_align
283%%end:
284%endif
285%endmacro
287
; INV_TXFM_4X4_FN type1, type2[, bitdepth = 10]
; Emits the public 4x4 entry point through INV_TXFM_FN. For dct_dct it also
; emits the code reached when eob == 0: xm2 is loaded with the bit-depth
; specific dconly bias, and then
;  - the 10 bpc instance contains the shared DC-only code with the labels
;    .dconly / .dconly2 / .dconly3 / .dconly_loop,
;  - every other bit depth jumps to the 10 bpc .dconly.
; DC-only code: r6d holds the scaled DC value, r3d the number of rows left
; (r3d is presumably the eob register, 4th cglobal argument - confirm), and
; xm2 the bias. Each loop iteration updates two 4-pixel rows.
288%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth
289    INV_TXFM_FN          %1, %2, 0, 4x4, %3
290%ifidn %1_%2, dct_dct
291    vpbroadcastd        xm2, [dconly_%3bpc]
292%if %3 = 10
293.dconly:
294    imul                r6d, [cq], 181
295    mov                [cq], eobd ; 0
; r3d was 0 on this path, so this sets the row count to 4
296    or                  r3d, 4
297.dconly2:
298    add                 r6d, 128
299    sar                 r6d, 8
300.dconly3:
; second multiply by 181; note 2176 = 2048 + 128
301    imul                r6d, 181
302    add                 r6d, 2176
303    sar                 r6d, 12
304    movd                xm0, r6d
; fold the bias into the DC value before broadcasting it to all words
305    paddsw              xm0, xm2
306    vpbroadcastw        xm0, xm0
307.dconly_loop:
308    movq                xm1, [dstq+strideq*0]
309    movhps              xm1, [dstq+strideq*1]
; saturating add of (dc + bias), then unsigned saturating removal of the bias
310    paddsw              xm1, xm0
311    psubusw             xm1, xm2
312    movq   [dstq+strideq*0], xm1
313    movhps [dstq+strideq*1], xm1
314    lea                dstq, [dstq+strideq*2]
315    sub                 r3d, 2
316    jg .dconly_loop
317    WRAP_XMM RET
318%else
319    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly
320%endif
321%endif
322%endmacro
323
; IDCT4_1D_PACKED: 4-point inverse DCT on dword data, two values per
; register. The rotation/butterfly products come from one packed
; ITX_MULSUB_2D using the 128-bit pairs pd_2896_1567 and pd_2896_3784;
; the final add/sub yields m%1 = out0 out1 and m%2 = out3 out2.
; %6 is the register holding the rounding constant; %3-%5 are scratch.
324%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
325    ITX_MULSUB_2D        %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
326    punpckhqdq          m%3, m%2, m%1 ; t3 t2
327    punpcklqdq          m%2, m%1      ; t0 t1
328    paddd               m%1, m%2, m%3 ; out0 out1
329    psubd               m%2, m%3      ; out3 out2
330%endmacro
331
; IDCT4_1D_PACKED_WORD: 4-point inverse DCT on 16-bit data.
; Interleaves the words of the two inputs, forms the four products with
; pmaddwd against the externally defined pw_* coefficient pairs, adds the
; rounding register m%6, shifts right by 12, packs back to words and does the
; final saturating add/sub: m%1 = out0 out1, m%2 = out3 out2.
; m%3-m%5 are scratch; m%6 is only read.
332%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
333    vpbroadcastd        m%5, [pw_m3784_1567]
334    punpckhwd           m%3, m%2, m%1
335    vpbroadcastd        m%4, [pw_1567_3784]
336    punpcklwd           m%2, m%1
337    vpbroadcastd        m%1, [pw_m2896_2896]
338    pmaddwd             m%5, m%3
339    pmaddwd             m%3, m%4
340    vpbroadcastd        m%4, [pw_2896_2896]
341    pmaddwd             m%1, m%2
342    pmaddwd             m%2, m%4
343    REPX     {paddd x, m%6}, m%5, m%3, m%1, m%2
344    REPX     {psrad x, 12 }, m%5, m%3, m%1, m%2
345    packssdw            m%3, m%5      ; t3 t2
346    packssdw            m%2, m%1      ; t0 t1
347    paddsw              m%1, m%2, m%3 ; out0 out1
348    psubsw              m%2, m%3      ; out3 out2
349%endmacro
350
; Public 10 bpc 4x4 entry points whose first pass is the DCT. The dct_dct
; instance also carries the shared .dconly code.
351INV_TXFM_4X4_FN dct, dct
352INV_TXFM_4X4_FN dct, identity
353INV_TXFM_4X4_FN dct, adst
354INV_TXFM_4X4_FN dct, flipadst
355
; idct_4x4_internal_10bpc
; First pass (entry): 4-point IDCT on the dword coefficients at cq via .main,
; result packed to words and shuffled with idct4_shuf into m0, then an
; indirect jump to the second transform's .pass2 held in tx2q.
; .pass2: expects the packed first-pass words in m0 and pd_2048 still in m5;
;         runs the word IDCT, scales with pmulhrsw by 2048, adds the four dst
;         rows, clamps to [0, pixel_10bpc_max], zeroes the 64-byte
;         coefficient buffer and returns.
; .main/.main2: callable helper; .main loads and permutes the coefficients
;         and sets m5 = pd_2048, .main2 expects m0, m1 and m5 already set up.
356cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
357    call .main
358    vbroadcasti128       m2, [idct4_shuf]
359    packssdw             m0, m1
360    pshufb               m0, m2
361    jmp                tx2q
362.pass2:
363    vextracti128        xm1, m0, 1
364    WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
365    packssdw            xm5, xm5 ; pw_2048
366    pmulhrsw            xm0, xm5
367    pmulhrsw            xm1, xm5
; xm2 = dst rows 0 1, xm3 = dst rows 3 2 (matches the out3 out2 order of xm1)
368    movq                xm2, [dstq+strideq*0]
369    movhps              xm2, [dstq+strideq*1]
370    lea                  r6, [dstq+strideq*2]
371    movq                xm3, [r6  +strideq*1]
372    movhps              xm3, [r6  +strideq*0]
373    vpbroadcastd        xm5, [pixel_10bpc_max]
; m4 = 0: clears the coefficients and is the lower clamp
374    pxor                 m4, m4
375    mova          [cq+32*0], m4
376    mova          [cq+32*1], m4
377    paddw               xm0, xm2
378    paddw               xm1, xm3
379    pmaxsw              xm0, xm4
380    pmaxsw              xm1, xm4
381    pminsw              xm0, xm5
382    pminsw              xm1, xm5
383    movq   [dstq+strideq*0], xm0
384    movhps [dstq+strideq*1], xm0
385    movhps [r6  +strideq*0], xm1
386    movq   [r6  +strideq*1], xm1
387    RET
388ALIGN function_align
389.main:
390    vpermq               m0, [cq+32*0], q3120
391    vpermq               m1, [cq+32*1], q3120
392    vpbroadcastd         m5, [pd_2048]
393.main2:
394    IDCT4_1D_PACKED       0, 1, 2, 3, 4, 5
395    ret
396
; Public 10 bpc 4x4 entry points whose first pass is the ADST
; (they transfer control to iadst_4x4_internal_10bpc).
397INV_TXFM_4X4_FN adst, dct
398INV_TXFM_4X4_FN adst, adst
399INV_TXFM_4X4_FN adst, flipadst
400INV_TXFM_4X4_FN adst, identity
401
; IADST4_1D: 4-point inverse ADST on dwords, one input row per register.
; Takes no arguments; registers are fixed:
;   in:  m0 = in0, m1 = in1, m2 = in2, m3 = in3
;   out: m4 = out0, m6 = out1, m2 = out2, m3 = out3, all unrounded
;        (the caller adds the rounding constant and shifts)
; Overwrites m0-m7. pd_3803 is obtained as 1321 + 2482 by adding the two
; broadcast registers instead of loading it.
402%macro IADST4_1D 0
403    vpbroadcastd         m5, [pd_1321]
404    vpbroadcastd         m7, [pd_2482]
405    pmulld               m4, m0, m5    ; 1321*in0
406    pmulld               m6, m3, m7    ; 2482*in3
407    paddd                m4, m6        ; 1321*in0 + 2482*in3
408    pmulld               m6, m0, m7    ; 2482*in0
409    paddd                m0, m3        ; in0 + in3
410    paddd                m7, m5        ; pd_3803
411    pmulld               m5, m2        ; 1321*in2
412    pmulld               m3, m7        ; 3803*in3
413    pmulld               m7, m2        ; 3803*in2
414    psubd                m2, m0        ; in2 - in0 - in3
415    vpbroadcastd         m0, [pd_m3344]
416    pmulld               m1, m0        ; -t3
417    pmulld               m2, m0        ; out2 (unrounded)
418    psubd                m6, m5        ; 2482*in0 - 1321*in2
419    paddd                m4, m7        ;  t0
420    psubd                m6, m3        ;  t1
421    paddd                m3, m4, m6
422    psubd                m4, m1        ; out0 (unrounded)
423    psubd                m6, m1        ; out1 (unrounded)
424    paddd                m3, m1        ; out3 (unrounded)
425%endmacro
426
; iadst_4x4_internal_10bpc
; First pass (entry): 4-point inverse ADST on the dword coefficients (.main),
; outputs gathered as m0 = out0|out1, m1 = out2|out3.
; .pass1_end: shared with the flipadst variant; adds pd_2048, shifts right by
;         12, packs to words, reorders with itx4_shuf (vpermd, then pshufb
;         with the same table shifted right by 4 bits) and jumps to tx2q.
; .pass2: second pass on words, done by the 8 bpc ADST helper (r6 is set to
;         deint_shuf+128 before the call, presumably the base that helper
;         expects - confirm), then falls into .end.
; .end:   scales with pmulhrsw by 2048, adds the four dst rows, clamps to
;         [0, pixel_10bpc_max], zeroes the coefficient buffer and returns.
; .main/.main2: callable helper; loads four 16-byte rows into xm0-xm3 and
;         runs IADST4_1D in xmm mode (results in m4, m6, m2, m3).
; WIN64: IADST4_1D overwrites xmm6/xmm7 while cglobal declares only 6 vector
; registers, so they are spilled by hand in .main and reloaded in .pass1_end.
427cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
428    call .main
429    vinserti128          m0, m4, xm6, 1
430    vinserti128          m1, m2, xm3, 1
431.pass1_end:
432    vpbroadcastd         m5, [pd_2048]
433    mova                 m2, [itx4_shuf]
434    paddd                m0, m5
435    paddd                m1, m5
436    psrad                m0, 12
437    psrad                m1, 12
438    packssdw             m0, m1
439    vpermd               m0, m2, m0
440    psrld                m2, 4
441    pshufb               m0, m2
; offsets are 8 lower than at the save in .main because the return address
; pushed by `call .main` has been popped again
442%if WIN64
443    movaps             xmm6, [rsp+ 8]
444    movaps             xmm7, [rsp+24]
445%endif
446    jmp                tx2q
447.pass2:
448    lea                  r6, [deint_shuf+128]
449    vextracti128        xm1, m0, 1
450    call m(iadst_4x4_internal_8bpc).main
451.end:
452    vpbroadcastd        xm4, [pw_2048]
; xm2 = dst rows 0 1, xm3 = dst rows 2 3
453    movq                xm2, [dstq+strideq*0]
454    movhps              xm2, [dstq+strideq*1]
455    lea                  r6, [dstq+strideq*2]
456    movq                xm3, [r6  +strideq*0]
457    movhps              xm3, [r6  +strideq*1]
458    vpbroadcastd        xm5, [pixel_10bpc_max]
459    pmulhrsw            xm0, xm4
460    pmulhrsw            xm1, xm4
; m4 = 0: clears the coefficients and is the lower clamp
461    pxor                 m4, m4
462    mova          [cq+32*0], m4
463    mova          [cq+32*1], m4
464    paddw               xm0, xm2
465    paddw               xm1, xm3
466    pmaxsw              xm0, xm4
467    pmaxsw              xm1, xm4
468    pminsw              xm0, xm5
469    pminsw              xm1, xm5
470    movq   [dstq+strideq*0], xm0
471    movhps [dstq+strideq*1], xm0
472    movq   [r6  +strideq*0], xm1
473    movhps [r6  +strideq*1], xm1
474    RET
475ALIGN function_align
476.main:
477    mova                xm0, [cq+16*0]
478    mova                xm1, [cq+16*1]
479    mova                xm2, [cq+16*2]
480    mova                xm3, [cq+16*3]
; spill xmm6/xmm7 above the return address (see .pass1_end for the reload)
481%if WIN64
482    movaps         [rsp+16], xmm6
483    movaps         [rsp+32], xmm7
484%endif
485.main2:
486    WRAP_XMM IADST4_1D
487    ret
488
; Public 10 bpc 4x4 entry points whose first pass is the flipped ADST.
489INV_TXFM_4X4_FN flipadst, dct
490INV_TXFM_4X4_FN flipadst, adst
491INV_TXFM_4X4_FN flipadst, flipadst
492INV_TXFM_4X4_FN flipadst, identity
493
; iflipadst_4x4_internal_10bpc
; First pass: same ADST kernel as iadst_4x4_internal_10bpc (.main), but the
; outputs are gathered in reverse order (m0 = out3|out2, m1 = out1|out0)
; before continuing in the shared .pass1_end.
; .pass2: same 8 bpc ADST helper as the non-flipped variant; the flip is done
;         by pairing the results with the dst rows in reverse order: xm0 is
;         combined with rows 3 and 2, xm1 with rows 1 and 0. The results are
;         clamped to [0, pixel_10bpc_max] and the coefficients are zeroed.
494cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
495    call m(iadst_4x4_internal_10bpc).main
496    vinserti128          m0, m3, xm2, 1
497    vinserti128          m1, m6, xm4, 1
498    jmp m(iadst_4x4_internal_10bpc).pass1_end
499.pass2:
500    lea                  r6, [deint_shuf+128]
501    vextracti128        xm1, m0, 1
502    call m(iadst_4x4_internal_8bpc).main
503    vpbroadcastd        xm4, [pw_2048]
; xm3 = dst rows 1 0, xm2 = dst rows 3 2
504    movq                xm3, [dstq+strideq*1]
505    movhps              xm3, [dstq+strideq*0]
506    lea                  r6, [dstq+strideq*2]
507    movq                xm2, [r6  +strideq*1]
508    movhps              xm2, [r6  +strideq*0]
509    vpbroadcastd        xm5, [pixel_10bpc_max]
510    pmulhrsw            xm0, xm4
511    pmulhrsw            xm1, xm4
512    pxor                 m4, m4
513    mova          [cq+32*0], m4
514    mova          [cq+32*1], m4
515    paddw               xm0, xm2
516    paddw               xm1, xm3
517    pmaxsw              xm0, xm4
518    pmaxsw              xm1, xm4
519    pminsw              xm0, xm5
520    pminsw              xm1, xm5
521    movhps [dstq+strideq*0], xm1
522    movq   [dstq+strideq*1], xm1
523    movhps [r6  +strideq*0], xm0
524    movq   [r6  +strideq*1], xm0
525    RET
526
; Public 10 bpc 4x4 entry points whose first pass is the identity transform.
527INV_TXFM_4X4_FN identity, dct
528INV_TXFM_4X4_FN identity, adst
529INV_TXFM_4X4_FN identity, flipadst
530INV_TXFM_4X4_FN identity, identity
531
; iidentity_4x4_internal_10bpc
; First pass: every dword coefficient is scaled as (x * 5793 + 2048) >> 12,
; packed to words and reordered with itx4_shuf, then control goes to tx2q.
; .pass2: x += pmulhrsw(x, pw_1697x8) with saturation, then pmulhrsw by 2048,
;         add the four dst rows, clamp to [0, pixel_10bpc_max] and zero the
;         coefficient buffer. Relies on m5 still holding pd_2048 from the
;         first pass (packed to words here, see the pw_2048 comment).
532cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2
533    vpbroadcastd         m1, [pd_5793]
534    pmulld               m0, m1, [cq+32*0]
535    pmulld               m1,     [cq+32*1]
536    vpbroadcastd         m5, [pd_2048]
537    mova                 m3, [itx4_shuf]
538    paddd                m0, m5
539    paddd                m1, m5
540    psrad                m0, 12
541    psrad                m1, 12
542    packssdw             m0, m1
543    vpermd               m0, m3, m0
544    psrld                m3, 4
545    pshufb               m0, m3
546    jmp                tx2q
547.pass2:
548    vpbroadcastd         m1, [pw_1697x8]
549    movq                xm2, [dstq+strideq*0]
550    movhps              xm2, [dstq+strideq*1]
551    lea                  r6, [dstq+strideq*2]
552    pmulhrsw             m1, m0
553    paddsw               m0, m1
554    movq                xm3, [r6  +strideq*0]
555    movhps              xm3, [r6  +strideq*1]
556    vpbroadcastd        xm4, [pixel_10bpc_max]
557    packssdw             m5, m5 ; pw_2048
558    pmulhrsw             m0, m5
; m5 = 0 from here on: clears the coefficients and is the lower clamp
559    pxor                 m5, m5
560    mova          [cq+32*0], m5
561    mova          [cq+32*1], m5
562    vextracti128        xm1, m0, 1
563    paddw               xm0, xm2
564    paddw               xm1, xm3
565    pmaxsw              xm0, xm5
566    pmaxsw              xm1, xm5
567    pminsw              xm0, xm4
568    pminsw              xm1, xm4
569    movq   [dstq+strideq*0], xm0
570    movhps [dstq+strideq*1], xm0
571    movq   [r6  +strideq*0], xm1
572    movhps [r6  +strideq*1], xm1
573    RET
574
; Public 12 bpc 4x4 entry points whose first pass is the DCT. The dct_dct
; instance loads dconly_12bpc and jumps into the 10 bpc .dconly code.
575INV_TXFM_4X4_FN dct, dct,      12
576INV_TXFM_4X4_FN dct, identity, 12
577INV_TXFM_4X4_FN dct, adst,     12
578INV_TXFM_4X4_FN dct, flipadst, 12
579
; idct_4x4_internal_12bpc
; Unlike the 10 bpc version, the intermediate between the passes stays in
; dwords (m0/m1).
; First pass: reuses the 10 bpc .main, permutes the two result registers with
;         the idct4_12_shuf tables (vpermd) and continues in the 12 bpc ADST
;         function's .pass1_end2 (re-interleave, 18-bit clamp, jump to tx2q).
; .pass2: expects dwords in m0/m1; runs the 10 bpc .main2 kernel with
;         m5 = pd_2048 and finishes in the shared 12 bpc .end.
580cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
581    call m(idct_4x4_internal_10bpc).main
582    mova                 m3, [idct4_12_shuf]
583    mova                 m4, [idct4_12_shuf2]
584    vpermd               m2, m4, m1
585    vpermd               m1, m3, m0
586    jmp m(iadst_4x4_internal_12bpc).pass1_end2
587.pass2:
588    vpbroadcastd         m5, [pd_2048]
589    vpermq               m0, m0, q3120
590    vpermq               m1, m1, q3120
591    call m(idct_4x4_internal_10bpc).main2
592    vpermq               m0, m0, q3120
593    vpermq               m1, m1, q2031
594    jmp m(iadst_4x4_internal_12bpc).end
595
; Public 12 bpc 4x4 entry points whose first pass is the ADST.
596INV_TXFM_4X4_FN adst, dct,      12
597INV_TXFM_4X4_FN adst, adst,     12
598INV_TXFM_4X4_FN adst, flipadst, 12
599INV_TXFM_4X4_FN adst, identity, 12
600
; iadst_4x4_internal_12bpc
; Hosts the labels shared by all 12 bpc 4x4 transforms:
; .pass1_end:  in m1/m2 = unrounded ADST outputs. Shifts right by 1, reorders
;              with itx4_shuf (vpermd), adds 1024 and shifts right by 11.
; .pass1_end2: in m1/m2. Re-interleaves the qwords into m0/m1, clamps to
;              [clip_18b_min, clip_18b_max] and jumps to tx2q.
; .pass2:      second-pass ADST on the dword intermediate (.main_pass2).
; .pass2_end:  in m0/m1 = unrounded outputs; adds 2048, shifts right by 12.
; .end/.end2:  in m0/m1 = dword results. Shifts right by 3, packs to words,
;              applies pmulhrsw by 16384 (a rounded shift right by 1), adds
;              the dst rows (order 0 2 | 1 3), clamps to
;              [0, pixel_12bpc_max], zeroes the coefficients and returns.
; .main_pass2: moves the halves of m0/m1 into xm0-xm3 and tail-jumps into the
;              10 bpc ADST kernel at .main2.
601cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
602    call m(iadst_4x4_internal_10bpc).main
603    vinserti128          m1, m4, xm6, 1
604    vinserti128          m2, xm3, 1
605.pass1_end:
606    mova                 m3, [itx4_shuf]
607    vpbroadcastd         m5, [pd_1024]
608    psrad                m1, 1
609    psrad                m2, 1
610    vpermd               m1, m3, m1
611    vpermd               m2, m3, m2
612    paddd                m1, m5
613    paddd                m2, m5
614    psrad                m1, 11
615    psrad                m2, 11
616.pass1_end2:
617    vpbroadcastd         m3, [clip_18b_min]
618    vpbroadcastd         m4, [clip_18b_max]
619    punpcklqdq           m0, m1, m2
620    punpckhqdq           m1, m2
621    pmaxsd               m0, m3
622    pmaxsd               m1, m3
623    pminsd               m0, m4
624    pminsd               m1, m4
625    jmp                tx2q
626.pass2:
627    call .main_pass2
628    vinserti128          m0, m4, xm6, 1
629    vinserti128          m1, m2, xm3, 1
630.pass2_end:
631    vpbroadcastd         m5, [pd_2048]
632    paddd                m0, m5
633    paddd                m1, m5
634    psrad                m0, 12
635    psrad                m1, 12
636.end:
; NOTE(review): restores the saved xmm registers here and lowers
; xmm_regs_used to 6, presumably so that the RET below does not restore them
; a second time - confirm against x86inc.
637%if WIN64
638    WIN64_RESTORE_XMM_INTERNAL
639    %assign xmm_regs_used 6
640%endif
641.end2:
642    vpbroadcastd         m4, [pw_16384]
643    movq                xm2, [dstq+strideq*0]
644    movq                xm3, [dstq+strideq*1]
645    lea                  r6, [dstq+strideq*2]
646    movhps              xm2, [r6  +strideq*0]   ; dst0 dst2
647    movhps              xm3, [r6  +strideq*1]   ; dst1 dst3
648    vpbroadcastd         m5, [pixel_12bpc_max]
649    vinserti128          m2, xm3, 1
650    psrad                m0, 3
651    psrad                m1, 3
652    packssdw             m0, m1     ; t0 t2 t1 t3
653    pmulhrsw             m0, m4
; m4 = 0: clears the coefficients and is the lower clamp
654    pxor                 m4, m4
655    mova          [cq+32*0], m4
656    mova          [cq+32*1], m4
657    paddw                m0, m2     ; out0 out2 out1 out3
658    pmaxsw               m0, m4
659    pminsw               m0, m5
660    vextracti128        xm1, m0, 1  ; out1 out3
661    movq   [dstq+strideq*0], xm0
662    movq   [dstq+strideq*1], xm1
663    movhps [r6  +strideq*0], xm0
664    movhps [r6  +strideq*1], xm1
665    RET
666.main_pass2:
; xm0 = low half of m0 (left in place), xm1 = high half of m0,
; xm2 = low half of m1, xm3 = high half of m1
667    vextracti128        xm3, m1, 1
668    mova                xm2, xm1
669    vextracti128        xm1, m0, 1
670    jmp m(iadst_4x4_internal_10bpc).main2
671
; Public 12 bpc 4x4 entry points whose first pass is the flipped ADST.
672INV_TXFM_4X4_FN flipadst, dct,      12
673INV_TXFM_4X4_FN flipadst, adst,     12
674INV_TXFM_4X4_FN flipadst, flipadst, 12
675INV_TXFM_4X4_FN flipadst, identity, 12
676
; iflipadst_4x4_internal_12bpc
; Both passes run the same ADST kernels as iadst_4x4_internal_12bpc and only
; differ in how the kernel outputs (m4, m6, m2, m3 = out0..out3) are
; gathered: here in reverse order (out3|out2, out1|out0) before jumping to
; the shared .pass1_end / .pass2_end of the non-flipped function.
677cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
678    call m(iadst_4x4_internal_10bpc).main
679    vinserti128          m1, m3, xm2, 1
680    vinserti128          m2, m6, xm4, 1
681    jmp m(iadst_4x4_internal_12bpc).pass1_end
682.pass2:
683    call m(iadst_4x4_internal_12bpc).main_pass2
684    vinserti128          m0, m3, xm2, 1
685    vinserti128          m1, m6, xm4, 1
686    jmp m(iadst_4x4_internal_12bpc).pass2_end
687
; Public 12 bpc 4x4 entry points whose first pass is the identity transform.
688INV_TXFM_4X4_FN identity, dct,      12
689INV_TXFM_4X4_FN identity, adst,     12
690INV_TXFM_4X4_FN identity, flipadst, 12
691INV_TXFM_4X4_FN identity, identity, 12
692
; iidentity_4x4_internal_12bpc
; First pass: coefficients are reordered with itx4_shuf while loading, then
;         scaled as x + ((x * 1697 + 2048) >> 12); continues in the shared
;         .pass1_end2 (re-interleave, 18-bit clamp, jump to tx2q).
; .pass2: scales the dword intermediate as (x * 5793 + 2048) >> 12 and
;         finishes in the shared 12 bpc .end.
693cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2
694    mova                 m2, [itx4_shuf]
695    vpbroadcastd         m3, [pd_1697]
696    vpermd               m0, m2, [cq+32*0]
697    vpermd               m2, m2, [cq+32*1]
698    vpbroadcastd         m5, [pd_2048]
699    pmulld               m1, m3, m0
700    pmulld               m3, m2
701    paddd                m1, m5
702    paddd                m3, m5
703    psrad                m1, 12
704    psrad                m3, 12
; add the unscaled input back: results end up in m1 and m2
705    paddd                m1, m0
706    paddd                m2, m3
707    jmp m(iadst_4x4_internal_12bpc).pass1_end2
708.pass2:
709    ; m0 = in0 in1
710    ; m1 = in2 in3
711    vpbroadcastd         m3, [pd_5793]
712    vpbroadcastd         m5, [pd_2048]
713    pmulld               m0, m3
714    pmulld               m1, m3
715    paddd                m0, m5 ; 2048
716    paddd                m1, m5
717    psrad                m0, 12
718    psrad                m1, 12
719    jmp m(iadst_4x4_internal_12bpc).end
720
; INV_TXFM_4X8_FN type1, type2[, bitdepth = 10]
; Emits the public 4x8 entry point through INV_TXFM_FN. For dct_dct it also
; emits the eob == 0 path: xm2 is loaded with the dconly bias, and then
;  - the 10 bpc instance (label .dconly) sets the row count to 8, applies one
;    extra (x * 181 + 128) >> 8 step followed by another multiply by 181, and
;    continues at .dconly2 of the 4x4 10 bpc function, which does the
;    remaining rounding and the store loop,
;  - every other bit depth jumps to the 4x8 10 bpc .dconly.
721%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth
722    INV_TXFM_FN          %1, %2, 0, 4x8, %3
723%ifidn %1_%2, dct_dct
724    vpbroadcastd        xm2, [dconly_%3bpc]
725%if %3 = 10
726.dconly:
727    imul                r6d, [cq], 181
728    mov                [cq], eobd ; 0
; r3d was 0 on this path, so this sets the row count to 8
729    or                  r3d, 8
730    add                 r6d, 128
731    sar                 r6d, 8
732    imul                r6d, 181
733    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2
734%else
735    jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly
736%endif
737%endif
738%endmacro
739
; IDCT4_1D: 4-point inverse DCT on dwords, one input per register.
;   %1-%4: input registers, overwritten with the four results
;   %5-%7: scratch registers
;   %8:    register holding the rounding constant
; The odd part (t2, t3) is a rotation of %2/%4 by 1567/3784 via
; ITX_MULSUB_2D. The even part multiplies %1/%3 by 2896 and forms
; t0 = (a + b + rnd) >> 12 and t1 = (a - b + rnd) >> 12. The four add/sub
; lines at the end combine t0/t1 with t2/t3 into the results.
740%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
741    ITX_MULSUB_2D        %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
742    vpbroadcastd        m%5, [pd_2896]
743    pmulld              m%1, m%5
744    pmulld              m%3, m%5
; the rounding constant is added once and shared by the sum and difference
745    paddd               m%1, m%8
746    paddd               m%5, m%1, m%3
747    psubd               m%1, m%3
748    psrad               m%5, 12 ; t0
749    psrad               m%1, 12 ; t1
750    psubd               m%3, m%1, m%2
751    paddd               m%2, m%1
752    paddd               m%1, m%5, m%4
753    psubd               m%4, m%5, m%4
754%endmacro
755
; Public 10 bpc 4x8 entry points whose first pass is the DCT. The dct_dct
; instance also carries the 4x8 .dconly code.
756INV_TXFM_4X8_FN dct, dct
757INV_TXFM_4X8_FN dct, identity
758INV_TXFM_4X8_FN dct, adst
759INV_TXFM_4X8_FN dct, flipadst
760
; idct_4x8_internal_10bpc
; First pass (.pass1): the four 32-byte coefficient rows are pre-scaled as
;         (x * 2896 + 2048) >> 12, then a 4-point IDCT (IDCT4_1D) runs on
;         m0-m3 with m7 = pd_2048 as rounding; the dword results stay in
;         m0-m3 and control goes to tx2q.
; .pass2: packs m0-m3 to words, re-interleaves them into row pairs (see the
;         lane comments) and runs the 8 bpc 4x8 IDCT helper, with r6 set to
;         deint_shuf+128 before the call (presumably the base that helper
;         expects - confirm). The result is scaled with pmulhrsw by 2048,
;         added to eight 4-pixel dst rows, clamped to [0, pixel_10bpc_max]
;         and stored; the 128-byte coefficient buffer is zeroed.
;         xm1 and xm3 hold their row pairs in swapped order (3 2 and 7 6),
;         which is why the dst loads/stores for them use r3 first.
761cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
762.pass1:
763    vpbroadcastd         m3, [pd_2896]
764    pmulld               m0, m3, [cq+32*0]
765    pmulld               m1, m3, [cq+32*1]
766    pmulld               m2, m3, [cq+32*2]
767    pmulld               m3, m3, [cq+32*3]
768    vpbroadcastd         m7, [pd_2048]
769    REPX      {paddd x, m7}, m0, m1, m2, m3
770    REPX      {psrad x, 12}, m0, m1, m2, m3
771    IDCT4_1D              0, 1, 2, 3, 4, 5, 6, 7
772    jmp                tx2q
773.pass2:
774    packssdw             m0, m2
775    packssdw             m1, m3
776    lea                  r6, [deint_shuf+128]
777    punpckhwd            m2, m0, m1
778    punpcklwd            m0, m1
779    punpckhdq            m1, m0, m2 ; 2 3
780    punpckldq            m0, m2     ; 0 1
781    vextracti128        xm2, m0, 1  ; 4 5
782    vextracti128        xm3, m1, 1  ; 6 7
783    call m(idct_4x8_internal_8bpc).main
784    vpbroadcastd        xm4, [pw_2048]
785    REPX  {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
; r3 = 3 * stride, r6 = address of dst row 4
786    lea                  r3, [strideq*3]
787    lea                  r6, [dstq+strideq*4]
788    movq                xm4, [dstq+strideq*0]
789    movhps              xm4, [dstq+strideq*1]
790    movq                xm5, [dstq+r3       ]
791    movhps              xm5, [dstq+strideq*2]
792    movq                xm6, [r6  +strideq*0]
793    movhps              xm6, [r6  +strideq*1]
794    movq                xm7, [r6  +r3       ]
795    movhps              xm7, [r6  +strideq*2]
796    paddw               xm0, xm4 ; 0 1
797    paddw               xm1, xm5 ; 3 2
798    paddw               xm2, xm6 ; 4 5
799    paddw               xm3, xm7 ; 7 6
800    vpbroadcastd        xm5, [pixel_10bpc_max]
; m4 = 0: clears the coefficients and is the lower clamp
801    pxor                 m4, m4
802    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
803    REPX    {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
804    REPX    {pminsw x, xm5}, xm0, xm1, xm2, xm3
805    movq   [dstq+strideq*0], xm0
806    movhps [dstq+strideq*1], xm0
807    movhps [dstq+strideq*2], xm1
808    movq   [dstq+r3       ], xm1
809    movq   [r6  +strideq*0], xm2
810    movhps [r6  +strideq*1], xm2
811    movhps [r6  +strideq*2], xm3
812    movq   [r6  +r3       ], xm3
813    RET
814
815INV_TXFM_4X8_FN adst, dct
816INV_TXFM_4X8_FN adst, adst
817INV_TXFM_4X8_FN adst, flipadst
818INV_TXFM_4X8_FN adst, identity
819
820cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
821    call m(iadst_8x4_internal_10bpc).main
822    vpbroadcastd         m5, [pd_2048]
823    paddd                m0, m5, m4
824    paddd                m1, m5, m6
825    paddd                m2, m5
826    paddd                m3, m5
827.pass1_end:
828    REPX      {psrad x, 12}, m0, m1, m2, m3
829    jmp                tx2q
830.pass2:
831    call .pass2_main
832    mova                xm4, [pw_2048_m2048]
833    REPX  {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
834.end:
835    lea                  r3, [strideq*3]
836    lea                  r6, [dstq+strideq*4]
837    movq                xm4, [dstq+strideq*0]
838    movhps              xm4, [dstq+strideq*1]
839    movq                xm5, [dstq+strideq*2]
840    movhps              xm5, [dstq+r3       ]
841    movq                xm6, [r6  +strideq*0]
842    movhps              xm6, [r6  +strideq*1]
843    movq                xm7, [r6  +strideq*2]
844    movhps              xm7, [r6  +r3       ]
845    paddw               xm0, xm4 ; 0 1
846    paddw               xm1, xm5 ; 2 3
847    paddw               xm2, xm6 ; 4 5
848    paddw               xm3, xm7 ; 6 7
849    vpbroadcastd        xm5, [pixel_10bpc_max]
850    pxor                 m4, m4
851    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
852    REPX    {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
853    REPX    {pminsw x, xm5}, xm0, xm1, xm2, xm3
854    movq   [dstq+strideq*0], xm0
855    movhps [dstq+strideq*1], xm0
856    movq   [dstq+strideq*2], xm1
857    movhps [dstq+r3       ], xm1
858    movq   [r6  +strideq*0], xm2
859    movhps [r6  +strideq*1], xm2
860    movq   [r6  +strideq*2], xm3
861    movhps [r6  +r3       ], xm3
862    RET
863ALIGN function_align
864.pass2_main:
865    packssdw             m0, m2
866    packssdw             m1, m3
867    lea                  r6, [deint_shuf+128]
868    punpcklwd            m4, m0, m1
869    punpckhwd            m0, m1
870    punpckhdq            m5, m4, m0
871    punpckldq            m4, m0
872    vextracti128        xm2, m4, 1      ; 4 5
873    vextracti128        xm3, m5, 1      ; 6 7
874    pshufd              xm4, xm4, q1032 ; 1 0
875    pshufd              xm5, xm5, q1032 ; 3 2
876    jmp m(iadst_4x8_internal_8bpc).main_pass2
; .main / .main2 / .main3 -- three entry points into the same 32-bit kernel.
;   .main   loads the clip bounds into m8/m9, then falls through.
;   .main2  loads eight 16-byte coefficient rows from cq, pairs them as shown
;           in the annotations, and scales each by pd_2896 with pd_2048
;           rounding and a 12-bit right shift; m7 keeps pd_2048 afterwards.
;   .main3  expects m0-m3 = paired inputs, m7 = pd_2048, m8/m9 = clip min/max
;           (it is also reached from iadst_4x8_internal_12bpc.pass2_main).
; Out (per the annotations on the final instructions):
;   m0 = out0 out7, m4 = out6 out1, m2 = out4 -out5, m5 = -out3 out2,
;   m6 = pw_2048_m2048 sign pattern, which callers reuse with psignd.
; Clobbers m1 and m3.
; ITX_MULSUB_2D is defined outside this view; its operands are documented there.
877ALIGN function_align
878.main:
879    vpbroadcastd         m8, [clip_18b_min]
880    vpbroadcastd         m9, [clip_18b_max]
881.main2:
882    vbroadcasti128       m0, [cq+16*0]
883    vbroadcasti128       m2, [cq+16*2]
884    vbroadcasti128       m3, [cq+16*5]
885    vbroadcasti128       m1, [cq+16*7]
886    vpbroadcastd         m6, [pd_2896]
887    shufpd               m0, m2, 0x0c ; 0 2
888    shufpd               m1, m3, 0x0c ; 7 5
889    vbroadcasti128       m2, [cq+16*4]
890    vbroadcasti128       m4, [cq+16*6]
891    vbroadcasti128       m5, [cq+16*1]
892    vbroadcasti128       m3, [cq+16*3]
893    vpbroadcastd         m7, [pd_2048]
894    shufpd               m2, m4, 0x0c ; 4 6
895    shufpd               m3, m5, 0x0c ; 3 1
    ; x = (x * pd_2896 + pd_2048) >> 12 for all four registers.
896    REPX {pmulld x, m6}, m0, m1, m2, m3
897    REPX {paddd  x, m7}, m0, m1, m2, m3
898    REPX {psrad  x, 12}, m0, m1, m2, m3
899.main3:
900    ITX_MULSUB_2D         1, 0, 4, 5, 6, 7,  401_1931, 4076_3612, 1
901    ITX_MULSUB_2D         3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
902    psubd                m4, m0, m2   ; t4  t6
903    paddd                m0, m2       ; t0  t2
904    psubd                m2, m1, m3   ; t5  t7
905    paddd                m1, m3       ; t1  t3
    ; Clamp the intermediates to the [m8, m9] range.
906    REPX     {pmaxsd x, m8}, m4, m2, m0, m1
907    REPX     {pminsd x, m9}, m4, m2, m0, m1
    ; m5 = -m4, so the blend below can pick up -t6.
908    pxor                 m5, m5
909    psubd                m5, m4
910    vpblendd             m4, m2, 0xcc ; t4  t7
911    vpblendd             m2, m5, 0xcc ; t5 -t6
912    ITX_MULSUB_2D         4, 2, 3, 5, 6, 7, 1567, 3784
913    vpbroadcastd         m5, [pd_2896]
914    vbroadcasti128       m6, [pw_2048_m2048] ; + + - -
915    punpckhqdq           m3, m0, m1
916    punpcklqdq           m0, m1
917    psubd                m1, m0, m3   ; t2  t3
918    paddd                m0, m3       ;  out0 -out7
919    punpckhqdq           m3, m4, m2   ; t7a t6a
920    punpcklqdq           m4, m2       ; t5a t4a
921    psubd                m2, m4, m3   ; t7  t6
922    paddd                m4, m3       ;  out6 -out1
923    REPX     {pmaxsd x, m8}, m1, m2
924    REPX     {pminsd x, m9}, m1, m2
925    vpblendd             m3, m1, m2, 0xcc
926    shufpd               m1, m2, 0x05
927    pmulld               m3, m5
928    pmulld               m5, m1
    ; psignd negates the lanes where the m6 pattern is negative.
929    psignd               m0, m6       ;  out0  out7
930    psignd               m4, m6       ;  out6  out1
    ; Rounding (m7) is added once to m3 and thus carried into both results.
931    paddd                m3, m7
932    psubd                m2, m3, m5
933    paddd                m5, m3
934    psrad                m2, 12       ;  out4 -out5
935    psrad                m5, 12       ; -out3  out2
936    ret
937
; iflipadst_4x8_internal_10bpc -- flipadst 4x8 kernel, 10-bit.
; The INV_TXFM_4X8_FN lines instantiate the entry points for the four
; flipadst/<type2> combinations (macro defined outside this view).
; Pass 1: runs the shared iadst_8x4 .main kernel, adds pd_2048 while moving the
;         results from m3/m2/m6/m4 into m0-m3, and continues at the shared
;         .pass1_end of iadst_4x8_internal_10bpc (outside this view).
; Pass 2: runs the shared .pass2_main, scales with pw_2048_m2048, adds the
;         result to eight destination rows, clamps to [0, pixel_10bpc_max],
;         and clears the 128-byte coefficient block at cq.
938INV_TXFM_4X8_FN flipadst, dct
939INV_TXFM_4X8_FN flipadst, adst
940INV_TXFM_4X8_FN flipadst, flipadst
941INV_TXFM_4X8_FN flipadst, identity
942
943cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
944    call m(iadst_8x4_internal_10bpc).main
945    vpbroadcastd         m5, [pd_2048]
946    paddd                m0, m5, m3
947    paddd                m1, m5, m2
948    paddd                m2, m5, m6
949    paddd                m3, m5, m4
950    jmp m(iadst_4x8_internal_10bpc).pass1_end
951.pass2:
952    call m(iadst_4x8_internal_10bpc).pass2_main
953    mova                xm4, [pw_2048_m2048]
954    REPX  {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
    ; r3 = 3*stride, r6 = address of row 4.
955    lea                  r3, [strideq*3]
956    lea                  r6, [dstq+strideq*4]
    ; Each register takes the odd row in its low half and the even row in its
    ; high half, matching the "1 0" / "3 2" / ... order of the results.
957    movq                xm4, [dstq+strideq*1]
958    movhps              xm4, [dstq+strideq*0]
959    movq                xm5, [dstq+r3       ]
960    movhps              xm5, [dstq+strideq*2]
961    movq                xm6, [r6  +strideq*1]
962    movhps              xm6, [r6  +strideq*0]
963    movq                xm7, [r6  +r3       ]
964    movhps              xm7, [r6  +strideq*2]
965    paddw               xm3, xm4 ; 1 0
966    paddw               xm2, xm5 ; 3 2
967    paddw               xm1, xm6 ; 5 4
968    paddw               xm0, xm7 ; 7 6
969    vpbroadcastd        xm5, [pixel_10bpc_max]
    ; m4 = 0: used both to clear the coefficients and as the lower clamp bound.
970    pxor                 m4, m4
971    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
972    REPX    {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
973    REPX    {pminsw x, xm5}, xm3, xm2, xm1, xm0
974    movhps [dstq+strideq*0], xm3
975    movq   [dstq+strideq*1], xm3
976    movhps [dstq+strideq*2], xm2
977    movq   [dstq+r3       ], xm2
978    movhps [r6  +strideq*0], xm1
979    movq   [r6  +strideq*1], xm1
980    movhps [r6  +strideq*2], xm0
981    movq   [r6  +r3       ], xm0
982    RET
983
; iidentity_4x8_internal_10bpc -- identity 4x8 kernel, 10-bit.
; The INV_TXFM_4X8_FN lines instantiate the entry points for the four
; identity/<type2> combinations (macro defined outside this view).
; .pass1     reads four 32-byte coefficient rows from cq, applies two
;            multiply/round/shift stages and jumps to the handler in tx2q.
;            Also used by the 12bpc variant.
; .pass2     10-bit second pass: sets the clamp maximum, then .pass2_end.
; .pass2_end shared add-to-destination tail.
;            In:  m0-m3 = 32-bit data, m6 = pixel maximum (words).
;            Clobbers m0-m5, r3, r6; clears 128 bytes at cq.
984INV_TXFM_4X8_FN identity, dct
985INV_TXFM_4X8_FN identity, adst
986INV_TXFM_4X8_FN identity, flipadst
987INV_TXFM_4X8_FN identity, identity
988
989cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2
990.pass1:
991    vpbroadcastd         m3, [pd_2896]
992    pmulld               m0, m3, [cq+32*0]
993    pmulld               m1, m3, [cq+32*1]
994    pmulld               m2, m3, [cq+32*2]
995    pmulld               m3,     [cq+32*3]
996    vpbroadcastd         m5, [pd_2048]
997    vpbroadcastd         m4, [pd_5793]
    ; Stage 1: x = (x*pd_2896 + pd_2048) >> 12 (multiply already done above).
998    REPX     {paddd  x, m5}, m0, m1, m2, m3
999    REPX     {psrad  x, 12}, m0, m1, m2, m3
    ; Stage 2: x = (x*pd_5793 + pd_2048) >> 12.
1000    REPX     {pmulld x, m4}, m0, m1, m2, m3
1001    REPX     {paddd  x, m5}, m0, m1, m2, m3
1002    REPX     {psrad  x, 12}, m0, m1, m2, m3
1003    jmp                tx2q
1004.pass2:
1005    vpbroadcastd         m6, [pixel_10bpc_max]
1006    call .pass2_end
1007    RET
1008ALIGN function_align
1009.pass2_end:
1010    vpbroadcastd         m4, [pw_4096]
    ; Narrow to saturated 16-bit words, then interleave so that m0/m1 hold
    ; whole output rows in the order given by the annotations.
1011    packssdw             m0, m2
1012    packssdw             m1, m3
1013    punpckhwd            m2, m0, m1
1014    punpcklwd            m0, m1
1015    pmulhrsw             m2, m4
1016    pmulhrsw             m0, m4
1017    punpckhdq            m1, m0, m2 ; 2 3 6 7
1018    punpckldq            m0, m2     ; 0 1 4 5
1019    lea                  r3, [strideq*3]
1020    lea                  r6, [dstq+strideq*4]
    ; Gather destination rows 0,1,4,5 into m2 and rows 2,3,6,7 into m3: the low
    ; lane is loaded directly, the high lane is blended in from broadcasts.
1021    movq                xm2, [dstq+strideq*0]
1022    movhps              xm2, [dstq+strideq*1]
1023    vpbroadcastq         m4, [r6  +strideq*0]
1024    vpbroadcastq         m5, [r6  +strideq*1]
1025    movq                xm3, [dstq+strideq*2]
1026    movhps              xm3, [dstq+r3       ]
1027    vpblendd             m2, m4, 0x30
1028    vpblendd             m2, m5, 0xc0
1029    vpbroadcastq         m4, [r6  +strideq*2]
1030    vpbroadcastq         m5, [r6  +r3       ]
1031    vpblendd             m3, m4, 0x30
1032    vpblendd             m3, m5, 0xc0
    ; m4 = 0: clears the coefficients and serves as the lower clamp bound.
1033    pxor                 m4, m4
1034    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
1035    paddw                m0, m2 ; out0 out1 out4 out5
1036    paddw                m1, m3 ; out2 out3 out6 out7
1037    pmaxsw               m0, m4
1038    pmaxsw               m1, m4
1039    pminsw               m0, m6
1040    pminsw               m1, m6
1041    vextracti128        xm2, m0, 1  ; out4 out5
1042    vextracti128        xm3, m1, 1  ; out6 out7
1043    movq   [dstq+strideq*0], xm0
1044    movhps [dstq+strideq*1], xm0
1045    movq   [dstq+strideq*2], xm1
1046    movhps [dstq+r3       ], xm1
1047    movq   [r6  +strideq*0], xm2
1048    movhps [r6  +strideq*1], xm2
1049    movq   [r6  +strideq*2], xm3
1050    movhps [r6  +r3       ], xm3
1051    ret
1052
; idct_4x8_internal_12bpc -- dct 4x8 kernel, 12-bit.
; The INV_TXFM_4X8_FN lines instantiate the 12-bit entry points for the four
; dct/<type2> combinations (macro defined outside this view).
; Pass 1 is shared with the 10-bit function. Pass 2 stays in 32-bit precision:
; it clamps the inputs to the clip_18b range, transposes them, runs the
; idct_8x4 .main kernel, forms the final sums/differences and finishes in the
; shared .end tail of iadst_4x8_internal_12bpc.
1053INV_TXFM_4X8_FN dct, dct,      12
1054INV_TXFM_4X8_FN dct, identity, 12
1055INV_TXFM_4X8_FN dct, adst,     12
1056INV_TXFM_4X8_FN dct, flipadst, 12
1057
1058cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
1059    jmp m(idct_4x8_internal_10bpc).pass1
1060.pass2:
1061    vpbroadcastd         m8, [clip_18b_min]
1062    vpbroadcastd         m9, [clip_18b_max]
1063    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
1064    REPX     {pminsd x, m9}, m0, m1, m2, m3
1065    ; transpose & interleave
1066    pshufd               m0, m0, q1320
1067    pshufd               m1, m1, q1320
1068    pshufd               m2, m2, q1320
1069    pshufd               m3, m3, q1320
1070    punpckldq            m4, m0, m1
1071    punpckhdq            m0, m1
1072    punpckldq            m5, m2, m3
1073    punpckhdq            m2, m3
1074    vpermq               m0, m0, q3102
1075    vpermq               m2, m2, q3102
1076    vperm2i128           m1, m0, m2, 0x31   ; 1 5 (interleaved)
1077    vperm2i128           m3, m0, m2, 0x20   ; 7 3 (interleaved)
1078    vperm2i128           m0, m4, m5, 0x20   ; 0 2 (interleaved)
1079    vperm2i128           m2, m4, m5, 0x31   ; 4 6 (interleaved)
    ; NOTE(review): m7 = pd_2048 is presumably the rounding constant the called
    ; kernel expects, alongside m8/m9 as clip bounds -- confirm at its definition.
1080    vpbroadcastd         m7, [pd_2048]
1081    call m(idct_8x4_internal_10bpc).main
1082    psubd                m3, m0, m4  ; out7 out6
1083    paddd                m0, m4      ; out0 out1
1084    paddd                m1, m2, m5  ; out3 out2
1085    psubd                m2, m5      ; out4 out5
    ; Swap 64-bit halves so m1/m3 become "out2 out3" / "out6 out7".
1086    pshufd               m1, m1, q1032
1087    pshufd               m3, m3, q1032
1088    jmp m(iadst_4x8_internal_12bpc).end
1089
; iadst_4x8_internal_12bpc -- adst 4x8 kernel, 12-bit.
; The INV_TXFM_4X8_FN lines instantiate the 12-bit entry points for the four
; adst/<type2> combinations (macro defined outside this view).
; Pass 1:      iadst_8x4 .main, then a 1-bit shift while moving m4/m6 into
;              m0/m1, then .pass1_end.
; .pass1_end:  x = (x + pd_1024) >> 11, then jump to the handler in tx2q.
;              Shared with iflipadst_4x8_internal_12bpc.
; .pass2:      clamps to the clip_18b range, runs .pass2_main, reorders/signs
;              the outputs and falls into .end.
; .end:        shared output tail (also used by the 12-bit dct and flipadst
;              functions). In: m0-m3 = out0 out1 / out2 out3 / out4 out5 /
;              out6 out7 as 32-bit lanes. Adds to eight destination rows, clamps
;              to [0, pixel_12bpc_max], clears 128 bytes at cq and returns from
;              the function.
; .pass2_main: transposes m0-m3, loads m7 = pd_2048 and tail-jumps into the
;              10-bit .main3 kernel (expects m8/m9 already set by the caller).
1090INV_TXFM_4X8_FN adst, dct,      12
1091INV_TXFM_4X8_FN adst, adst,     12
1092INV_TXFM_4X8_FN adst, flipadst, 12
1093INV_TXFM_4X8_FN adst, identity, 12
1094
1095cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
1096    call m(iadst_8x4_internal_10bpc).main
1097    psrad                m0, m4, 1
1098    psrad                m1, m6, 1
1099    psrad                m2, 1
1100    psrad                m3, 1
1101.pass1_end:
1102    vpbroadcastd         m5, [pd_1024]
1103    REPX      {paddd x, m5}, m0, m1, m2, m3
1104    REPX      {psrad x, 11}, m0, m1, m2, m3
1105    jmp                tx2q
1106.pass2:
1107    vpbroadcastd         m8, [clip_18b_min]
1108    vpbroadcastd         m9, [clip_18b_max]
1109    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
1110    REPX     {pminsd x, m9}, m0, m1, m2, m3
1111    call .pass2_main
    ; The kernel returns m0 = out0 out7, m4 = out6 out1, m2 = out4 -out5,
    ; m5 = -out3 out2 and the sign pattern in m6; regroup into row order.
1112    vpblendd             m3, m0, m4, 0x33 ; out6 out7
1113    vpblendd             m0, m4, 0xcc     ; out0 out1
1114    pshufd               m1, m5, q1032
1115    psignd               m2, m6           ; out4 out5
1116    psignd               m1, m6           ; out2 out3
1117.end:
1118    vpbroadcastd         m4, [pw_16384]
1119    REPX       {psrad x, 3}, m0, m1, m2, m3
1120    packssdw             m0, m2     ; 0 1 4 5 (interleaved)
1121    packssdw             m1, m3     ; 2 3 6 7 (interleaved)
    ; Undo the interleaving left by the packs with a dword permute.
1122    mova                 m2, [iadst8_12_shuf]
1123    vpermd               m0, m2, m0 ; 0 1 4 5
1124    vpermd               m1, m2, m1 ; 2 3 6 7
1125    pmulhrsw             m0, m4
1126    pmulhrsw             m1, m4
1127    lea                  r3, [strideq*3]
1128    lea                  r6, [dstq+strideq*4]
    ; Load rows 0-3 into the low lanes and rows 4-7 into the high lanes of m4/m5.
1129    movq                xm4, [dstq+strideq*0]
1130    movhps              xm4, [dstq+strideq*1]
1131    movq                xm5, [dstq+strideq*2]
1132    movhps              xm5, [dstq+r3       ]
1133    movq                xm6, [r6  +strideq*0]
1134    movhps              xm6, [r6  +strideq*1]
1135    vinserti128          m4, xm6, 1
1136    movq                xm7, [r6  +strideq*2]
1137    movhps              xm7, [r6  +r3       ]
1138    vinserti128          m5, xm7, 1
1139    paddw                m0, m4 ; 0 1 4 5
1140    paddw                m1, m5 ; 2 3 6 7
1141    vpbroadcastd         m5, [pixel_12bpc_max]
    ; m4 = 0: clears the coefficients and serves as the lower clamp bound.
1142    pxor                 m4, m4
1143    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
1144    REPX    {pmaxsw x,  m4}, m0, m1
1145    REPX    {pminsw x,  m5}, m0, m1
1146    vextracti128        xm2, m0, 1  ; out4 out5
1147    vextracti128        xm3, m1, 1  ; out6 out7
1148    movq   [dstq+strideq*0], xm0
1149    movhps [dstq+strideq*1], xm0
1150    movq   [dstq+strideq*2], xm1
1151    movhps [dstq+r3       ], xm1
1152    movq   [r6  +strideq*0], xm2
1153    movhps [r6  +strideq*1], xm2
1154    movq   [r6  +strideq*2], xm3
1155    movhps [r6  +r3       ], xm3
1156    RET
1157ALIGN function_align
1158.pass2_main:
1159    ; transpose & interleave
1160    pshufd               m0, m0, q1320
1161    pshufd               m1, m1, q1320
1162    pshufd               m2, m2, q1320
1163    pshufd               m3, m3, q1320
1164    punpckldq            m4, m0, m1
1165    punpckhdq            m0, m1
1166    punpckldq            m5, m2, m3
1167    punpckhdq            m2, m3
1168    vperm2i128           m1, m0, m2, 0x31   ; 7 5 (interleaved)
1169    vperm2i128           m3, m0, m2, 0x20   ; 3 1 (interleaved)
1170    vperm2i128           m0, m4, m5, 0x20   ; 0 2 (interleaved)
1171    vperm2i128           m2, m4, m5, 0x31   ; 4 6 (interleaved)
    ; m7 is the rounding constant .main3 adds before its 12-bit shift.
1172    vpbroadcastd         m7, [pd_2048]
1173    jmp m(iadst_4x8_internal_10bpc).main3
1174
; iflipadst_4x8_internal_12bpc -- flipadst 4x8 kernel, 12-bit.
; The INV_TXFM_4X8_FN lines instantiate the 12-bit entry points for the four
; flipadst/<type2> combinations (macro defined outside this view).
; Pass 1: iadst_8x4 .main, then a 1-bit shift that moves the results from
;         m3/m2/m6/m4 into m0-m3, followed by the adst function's .pass1_end.
; Pass 2: clamps to the clip_18b range, runs the adst function's .pass2_main,
;         regroups the results into m0-m3 and finishes in its shared .end.
1175INV_TXFM_4X8_FN flipadst, dct,      12
1176INV_TXFM_4X8_FN flipadst, adst,     12
1177INV_TXFM_4X8_FN flipadst, flipadst, 12
1178INV_TXFM_4X8_FN flipadst, identity, 12
1179
1180cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
1181    call m(iadst_8x4_internal_10bpc).main
1182    psrad                m0, m3, 1
1183    psrad                m1, m2, 1
1184    psrad                m2, m6, 1
1185    psrad                m3, m4, 1
1186    jmp m(iadst_4x8_internal_12bpc).pass1_end
1187.pass2:
1188    vpbroadcastd         m8, [clip_18b_min]
1189    vpbroadcastd         m9, [clip_18b_max]
1190    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
1191    REPX     {pminsd x, m9}, m0, m1, m2, m3
1192    call m(iadst_4x8_internal_12bpc).pass2_main
1193    shufpd               m3, m4, m0, 0x05 ; out1 out0
1194    shufpd               m0, m4, 0x05     ; out7 out6
    ; m6 is the sign pattern left by the kernel; after its halves are swapped
    ; it is used to fix the signs of m5.
1195    psignd               m2, m6
1196    pshufd               m6, m6, q1032
1197    pshufd               m1, m2, q1032    ; out5 out4
1198    psignd               m2, m5, m6       ; out3 out2
1199    jmp m(iadst_4x8_internal_12bpc).end
1200
; iidentity_4x8_internal_12bpc -- identity 4x8 kernel, 12-bit.
; The INV_TXFM_4X8_FN lines instantiate the 12-bit entry points for the four
; identity/<type2> combinations (macro defined outside this view).
; Both passes reuse the 10-bit code; the only difference is the clamp maximum
; passed to the shared .pass2_end in m6.
1201INV_TXFM_4X8_FN identity, dct,      12
1202INV_TXFM_4X8_FN identity, adst,     12
1203INV_TXFM_4X8_FN identity, flipadst, 12
1204INV_TXFM_4X8_FN identity, identity, 12
1205
1206cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
1207    jmp m(iidentity_4x8_internal_10bpc).pass1
1208.pass2:
1209    ; m0 = in0 in1
1210    ; m1 = in2 in3
1211    ; m2 = in4 in5
1212    ; m3 = in6 in7
1213    vpbroadcastd         m6, [pixel_12bpc_max]
1214    call m(iidentity_4x8_internal_10bpc).pass2_end
1215    RET
1216
; INV_TXFM_4X16_FN type1, type2[, bitdepth=10]
; Instantiates a 4x16 entry point through INV_TXFM_FN (defined outside this
; view). For the dct_dct combination it additionally emits a DC-only path that
; scales the first coefficient and continues in the 4x4 function's .dconly3.
1217%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth
1218    INV_TXFM_FN          %1, %2, 0, 4x16, %3
1219%ifidn %1_%2, dct_dct
    ; r6d = ([cq] * 181 + 384) >> 9, computed across the next few instructions.
1220    imul                r6d, [cq], 181
1221    vpbroadcastd        xm2, [dconly_%3bpc]
1222    mov                [cq], eobd ; 0
    ; NOTE(review): r3d presumably carries the row count (16) for the shared
    ; DC-only loop -- confirm at .dconly3.
1223    or                  r3d, 16
1224    add                 r6d, 384
1225    sar                 r6d, 9
1226    jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3
1227%endif
1228%endmacro
1229
; idct_4x16_internal_10bpc -- dct 4x16 kernel, 10-bit.
; The INV_TXFM_4X16_FN lines instantiate the entry points for the four
; dct/<type2> combinations.
; .pass1       32-bit first pass over eight 32-byte coefficient rows at cq;
;              also used by the 12bpc variant. Ends with jmp tx2q.
; .pass2       packs to 16-bit, transposes, runs the 8bpc .main kernel and
;              writes the result through .pass2_end.
; .pass1_main  rotates the m1/m3 and m5/m7 pairs; leaves m6 = pd_1448 and
;              m9 = pd_2048 for the code that follows.
; .pass1_main2 rounds/shifts the even part and performs the final butterflies;
;              expects m10 = rounding constant. Results in m0-m7.
; .pass2_end   In: m0-m3 = word data (4 rows each), m8 = pixel maximum,
;              m9 = pmulhrsw scale. Writes 16 rows via four .write_4x4 calls.
; .write_4x4   In: m0 = four rows in the order row0 row1 | row3 row2,
;              m7 = 0, m8 = pixel maximum, r6 = 3*stride.
;              Adds to four destination rows, clamps, clears 64 bytes at cq,
;              then advances cq by 64 and dstq by four rows.
1230INV_TXFM_4X16_FN dct, dct
1231INV_TXFM_4X16_FN dct, identity
1232INV_TXFM_4X16_FN dct, adst
1233INV_TXFM_4X16_FN dct, flipadst
1234
1235cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
1236.pass1:
1237    vpbroadcastd        m10, [pd_3072]
1238    mova                 m1, [cq+32*2]
1239    mova                 m3, [cq+32*6]
1240    mova                 m5, [cq+32*3]
1241    mova                 m7, [cq+32*7]
1242    call .pass1_main
    ; m6 was loaded with pd_1448 by .pass1_main.
1243    pmulld               m0, m6, [cq+32*0]
1244    pmulld               m2, m6, [cq+32*4]
1245    pmulld               m4, m6, [cq+32*1]
1246    pmulld               m6,     [cq+32*5]
1247    call .pass1_main2
1248    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
1249    jmp                tx2q
1250.pass2:
1251    packssdw             m0, m4
1252    packssdw             m1, m5
1253    packssdw             m2, m6
1254    packssdw             m3, m7
    ; NOTE(review): r6 is presumably the constant-table base that the 8bpc
    ; kernel addresses its coefficients from -- confirm against the 8bpc source.
1255    lea                  r6, [deint_shuf+128]
1256    punpcklwd            m4, m2, m3
1257    punpckhwd            m2, m3
1258    punpckhwd            m5, m0, m1
1259    punpcklwd            m0, m1
1260    punpckhdq            m1, m0, m4     ; 2 3
1261    punpckldq            m0, m4         ; 0 1
1262    punpckldq            m4, m5, m2     ; 8 9
1263    punpckhdq            m5, m2         ; a b
1264    vextracti128        xm2, m0, 1      ; 4 5
1265    vextracti128        xm3, m1, 1      ; 6 7
1266    vextracti128        xm6, m4, 1      ; c d
1267    vextracti128        xm7, m5, 1      ; e f
1268    call m(idct_4x16_internal_8bpc).main
1269    vpbroadcastd         m9, [pw_2048]
    ; Merge the eight xmm results into four ymm registers, four rows each.
1270    vinserti128          m0, m0, xm1, 1 ; 0 1   3 2
1271    vinserti128          m1, m2, xm3, 1 ; 4 5   7 6
1272    vinserti128          m2, m4, xm5, 1 ; 8 9   b a
1273    vinserti128          m3, m6, xm7, 1 ; c d   f e
1274    vpbroadcastd         m8, [pixel_10bpc_max]
1275    call .pass2_end
1276    RET
1277ALIGN function_align
1278.pass1_main:
1279    vpbroadcastd         m4, [pd_3784]
1280    vpbroadcastd         m8, [pd_1567]
1281    vpbroadcastd         m9, [pd_2048]
1282    vpbroadcastd         m6, [pd_1448]
    ; NOTE(review): the trailing "8, 4" operands presumably name registers
    ; m8/m4 holding the coefficients loaded above -- confirm against the
    ; ITX_MULSUB_2D definition.
1283    ITX_MULSUB_2D         1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
1284    ITX_MULSUB_2D         5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
1285    ret
1286ALIGN function_align
1287.pass1_main2:
    ; The rounding constant is added once per pair; it then carries through
    ; both the sum and the difference below.
1288    paddd                m0, m10
1289    paddd                m4, m10
1290    paddd                m8, m0, m2
1291    psubd                m0, m2
1292    paddd                m9, m4, m6
1293    psubd                m4, m6
1294    REPX      {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
1295    psubd                m2, m0, m1
1296    paddd                m1, m0
1297    psubd                m6, m4, m5
1298    paddd                m5, m4
1299    paddd                m0, m8, m3
1300    psubd                m3, m8, m3
1301    paddd                m4, m9, m7
1302    psubd                m7, m9, m7
1303    ret
1304ALIGN function_align
1305.pass2_end:
1306    lea                  r6, [strideq*3]
1307    pxor                 m7, m7
1308    pmulhrsw             m0, m9
1309    call .write_4x4
1310    pmulhrsw             m0, m1, m9
1311    call .write_4x4
1312    pmulhrsw             m0, m2, m9
1313    call .write_4x4
1314    pmulhrsw             m0, m3, m9
1315    call .write_4x4
1316    ret
1317ALIGN function_align
1318.write_4x4:
    ; Gather four destination rows as row0 row1 | row3 row2.
1319    movq                xm4, [dstq+strideq*0]
1320    movhps              xm4, [dstq+strideq*1]
1321    vpbroadcastq         m5, [dstq+strideq*2]
1322    vpbroadcastq         m6, [dstq+r6       ]
1323    mova          [cq+32*0], m7
1324    mova          [cq+32*1], m7
1325    add                  cq, 32*2
1326    vpblendd             m4, m5, 0xc0
1327    vpblendd             m4, m6, 0x30
1328    paddw                m4, m0
1329    pmaxsw               m4, m7
1330    pminsw               m4, m8
1331    vextracti128        xm5, m4, 1
1332    movq   [dstq+strideq*0], xm4
1333    movhps [dstq+strideq*1], xm4
1334    movhps [dstq+strideq*2], xm5
1335    movq   [dstq+r6       ], xm5
1336    lea                dstq, [dstq+strideq*4]
1337    ret
1338
; iadst_4x16_internal_10bpc -- adst 4x16 kernel, 10-bit.
; The INV_TXFM_4X16_FN lines instantiate the entry points for the four
; adst/<type2> combinations.
; Pass 1:      iadst_16x4 .main and .main_end (both outside this view), then a
;              13-bit shift that collects the results into m0-m7; jmp tx2q.
; .pass2:      .pass2_main, then sign-corrected scaling and four .write_4x4
;              calls that add 16 rows to the destination.
; .write_4x4   In: m0 = four rows ordered row3 row0 | row2 row1, m7 = 0,
;              m8 = pixel maximum, r6 = 3*stride. Clears 64 bytes at cq, then
;              advances cq by 64 and dstq by four rows.
; .pass2_main  shared with the flipadst variant: packs to 16-bit, transposes,
;              runs the 8bpc .main2 kernel and scales the middle outputs.
; .main/.main2 32-bit 16-point kernel; not called from within this function in
;              the code visible here. .main loads sixteen 16-byte rows from cq;
;              .main2 expects them already paired in m0-m7.
;              .main2 requires m12/m13 = clip min/max (see the note at the
;              label). Results are returned in m0-m3 and m5-m8 as annotated;
;              m4 and m9-m11 are clobbered.
1339INV_TXFM_4X16_FN adst, dct
1340INV_TXFM_4X16_FN adst, adst
1341INV_TXFM_4X16_FN adst, flipadst
1342INV_TXFM_4X16_FN adst, identity
1343
1344cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
1345    call m(iadst_16x4_internal_10bpc).main
    ; NOTE(review): m6 = pd_6144 is presumably the rounding term consumed by
    ; .main_end ahead of the 13-bit shifts below -- confirm at its definition.
1346    vpbroadcastd         m6, [pd_6144]
1347    call m(iadst_16x4_internal_10bpc).main_end
1348    psrad                m0, m4, 13
1349    psrad                m1, m5, 13
1350    psrad                m2, 13
1351    psrad                m3, 13
1352    psrad                m4, m8, 13
1353    psrad                m5, m9, 13
1354    psrad                m6, 13
1355    psrad                m7, 13
1356    jmp                tx2q
1357.pass2:
1358    call .pass2_main
1359    vpbroadcastd         m5, [pw_2048]
1360    vpbroadcastd         m8, [pixel_10bpc_max]
1361    lea                  r6, [strideq*3]
1362    vpblendd             m4, m3, m0, 0xcc ; -out3   out0   out2  -out1
1363    pshufd               m2, m2, q1032    ; -out11  out8   out10 -out9
1364    vpblendd             m3, m0, 0x33     ; -out15  out12  out14 -out13
    ; Build a per-row multiplier that both scales and fixes the signs:
    ; m9 starts as -m5 everywhere, then the middle two rows take +m5.
1365    pxor                 m7, m7
1366    psubw                m9, m7, m5
1367    vpblendd             m9, m5, 0x3c     ; -2048   2048   2048  -2048
1368    pmulhrsw             m0, m4, m9
1369    call .write_4x4
1370    pmulhrsw             m0, m1, m9
1371    call .write_4x4
1372    pmulhrsw             m0, m2, m9
1373    call .write_4x4
1374    pmulhrsw             m0, m3, m9
1375    call .write_4x4
1376    RET
1377ALIGN function_align
1378.write_4x4:
    ; Gather four destination rows as row3 row0 | row2 row1.
1379    movq                xm4, [dstq+r6       ]
1380    movhps              xm4, [dstq+strideq*0]
1381    vpbroadcastq         m5, [dstq+strideq*1]
1382    vpbroadcastq         m6, [dstq+strideq*2]
1383    mova          [cq+32*0], m7
1384    mova          [cq+32*1], m7
1385    add                  cq, 32*2
1386    vpblendd             m4, m5, 0xc0
1387    vpblendd             m4, m6, 0x30
1388    paddw                m4, m0
1389    pmaxsw               m4, m7
1390    pminsw               m4, m8
1391    vextracti128        xm5, m4, 1
1392    movhps [dstq+strideq*0], xm4
1393    movhps [dstq+strideq*1], xm5
1394    movq   [dstq+strideq*2], xm5
1395    movq   [dstq+r6       ], xm4
1396    lea                dstq, [dstq+strideq*4]
1397    ret
1398ALIGN function_align
1399.pass2_main:
1400    packssdw             m0, m4
1401    packssdw             m1, m5
1402    packssdw             m2, m6
1403    packssdw             m3, m7
    ; NOTE(review): r6 is presumably the constant-table base that the 8bpc
    ; kernel addresses its coefficients from -- confirm against the 8bpc source.
1404    lea                  r6, [deint_shuf+128]
1405    punpcklwd            m4, m2, m3
1406    punpckhwd            m2, m3
1407    punpckhwd            m5, m0, m1
1408    punpcklwd            m0, m1
1409    punpckhdq            m1, m0, m4
1410    punpckldq            m0, m4
1411    punpckldq            m4, m5, m2
1412    punpckhdq            m5, m2
1413    vpblendd             m3, m0, m1, 0x33
1414    vpblendd             m0, m1, 0xcc
1415    shufpd               m2, m5, m4, 0x05
1416    shufpd               m4, m5, 0x05
1417    vperm2i128           m1, m0, m3, 0x31 ; 4 7   6 5
1418    vinserti128          m0, xm3, 1       ; 0 3   2 1
    ; NOTE(review): the original author marked the row order on the next line
    ; as uncertain ("????"); verify it before relying on the annotation.
1419    vperm2i128           m3, m2, m4, 0x31 ; c f   e d ; ????
1420    vinserti128          m2, xm4, 1       ; b 8   9 a
1421    call m(iadst_4x16_internal_8bpc).main2
1422    vpbroadcastd         m5, [pw_2896x8]
1423    paddsw               m1, m2, m4
1424    psubsw               m2, m4
1425    pmulhrsw             m1, m5           ; -out7   out4   out6  -out5
1426    pmulhrsw             m2, m5           ;  out8  -out11 -out9   out10
1427    ret
1428ALIGN function_align
1429.main:
    ; Broadcast each 16-byte row to both lanes, then shufpd 0x0c combines two
    ; rows into one register as annotated.
1430    vbroadcasti128       m0, [cq+16* 0]
1431    vbroadcasti128       m4, [cq+16* 2]
1432    vbroadcasti128       m1, [cq+16*15]
1433    vbroadcasti128       m5, [cq+16*13]
1434    vbroadcasti128       m2, [cq+16* 4]
1435    vbroadcasti128       m6, [cq+16* 6]
1436    vbroadcasti128       m3, [cq+16*11]
1437    vbroadcasti128       m7, [cq+16* 9]
1438    shufpd               m0, m4, 0x0c ;  0  2
1439    shufpd               m1, m5, 0x0c ; 15 13
1440    shufpd               m2, m6, 0x0c ;  4  6
1441    shufpd               m3, m7, 0x0c ; 11  9
1442    vbroadcasti128       m4, [cq+16* 8]
1443    vbroadcasti128       m6, [cq+16*10]
1444    vbroadcasti128       m5, [cq+16* 7]
1445    vbroadcasti128       m7, [cq+16* 5]
1446    shufpd               m4, m6, 0x0c ;  8 10
1447    shufpd               m5, m7, 0x0c ;  7  5
1448    vbroadcasti128       m6, [cq+16*12]
1449    vbroadcasti128       m7, [cq+16*14]
1450    shufpd               m6, m7, 0x0c ; 12 14
1451    vbroadcasti128       m7, [cq+16* 3]
1452    vbroadcasti128       m8, [cq+16* 1]
1453    shufpd               m7, m8, 0x0c ;  3  1
1454.main2:
1455    ; expects: m12 = clip_min   m13 = clip_max
1456    vpbroadcastd        m11, [pd_2048]
1457    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  201_995,  4091_3973, 1
1458    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
1459    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
1460    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 3857_4052, 1380_601,  1
1461    psubd                m8, m0, m4 ; t8a  t10a
1462    paddd                m0, m4     ; t0a  t2a
1463    psubd                m4, m1, m5 ; t9a  t11a
1464    paddd                m1, m5     ; t1a  t3a
1465    psubd                m5, m2, m6 ; t12a t14a
1466    paddd                m2, m6     ; t4a  t6a
1467    psubd                m6, m3, m7 ; t13a t15a
1468    paddd                m3, m7     ; t5a  t7a
1469    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8
1470    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8
    ; NOTE(review): a bare "10" as coefficient presumably tells the macro to
    ; reuse the value the preceding invocation left in m10 -- confirm against
    ; the ITX_MULSUB_2D definition. The same pattern recurs further down.
1471    ITX_MULSUB_2D         8, 4, 7, 9, 10, 11,  799_3406, 4017_2276, 1
1472    ITX_MULSUB_2D         6, 5, 7, 9, 10, 11, 4017_2276, 10,        1
1473    psubd                m7, m0, m2 ; t4   t6
1474    paddd                m0, m2     ; t0   t2
1475    psubd                m2, m1, m3 ; t5   t7
1476    paddd                m1, m3     ; t1   t3
1477    psubd                m3, m4, m6 ; t12a t14a
1478    paddd                m4, m6     ; t8a  t10a
1479    psubd                m6, m8, m5 ; t13a t15a
1480    paddd                m8, m5     ; t9a  t11a
1481    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8
1482    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8
1483    punpcklqdq           m5, m3, m7 ; t12a t4
1484    punpckhqdq           m3, m7     ; t14a t6
1485    punpckhqdq           m7, m6, m2 ; t15a t7
1486    punpcklqdq           m6, m2     ; t13a t5
1487    ITX_MULSUB_2D         7, 3, 2, 9, 10, 11, 3784, 1567
1488    ITX_MULSUB_2D         5, 6, 2, 9, 10, 11, 1567, 10
1489    vpbroadcastd        m10, [pd_2896]
1490    vbroadcasti128       m9, [pw_2048_m2048] ; + + - -
1491    punpckhqdq           m2, m4, m0 ; t10a t2
1492    punpcklqdq           m4, m0     ; t8a  t0
1493    punpckhqdq           m0, m8, m1 ; t11a t3
1494    punpcklqdq           m8, m1     ; t9a  t1
1495    paddd                m1, m6, m7 ; out2   -out3
1496    psubd                m6, m7     ; t14a t6
1497    paddd                m7, m5, m3 ; -out13  out12
1498    psubd                m5, m3     ; t15a t7
1499    psubd                m3, m8, m0 ; t11  t3a
1500    paddd                m8, m0     ; out14  -out15
1501    paddd                m0, m4, m2 ; -out1   out0
1502    psubd                m4, m2     ; t10  t2a
1503    REPX    {pmaxsd x, m12}, m6, m5, m3, m4
1504    REPX    {pminsd x, m13}, m6, m5, m3, m4
1505    REPX    {pmulld x, m10}, m6, m5, m3, m4
    ; Rounding (m11) is added to one operand of each pair and so carries into
    ; both the sum and the difference.
1506    paddd                m6, m11
1507    paddd                m4, m11
1508    paddd                m2, m6, m5 ; -out5   out4
1509    psubd                m6, m5     ;  out10 -out11
1510    psubd                m5, m4, m3 ; -out9   out8
1511    paddd                m3, m4     ;  out6  -out7
1512    REPX     {psrad  x, 12}, m2, m3, m5, m6
    ; Apply the sign pattern to half of the results, then its half-swapped
    ; form to the other half.
1513    REPX     {psignd x, m9}, m1, m8, m3, m6
1514    pshufd               m9, m9, q1032
1515    REPX     {psignd x, m9}, m0, m7, m2, m5
1516    ret
1517
; iflipadst_4x16_internal_10bpc -- flipadst 4x16 kernel, 10-bit.
; The INV_TXFM_4X16_FN lines instantiate the entry points for the four
; flipadst/<type2> combinations.
; .pass1:    same kernel calls as the adst variant, but the 13-bit shifts
;            collect the results into m0-m7 from a different register order.
; .pass2:    runs the adst function's .pass2_main, then scales with a signed
;            multiplier and writes 16 rows via four .write_4x4 calls (note the
;            m4, m2, m1, m3 source order).
; .write_4x4 In: m0 = four rows ordered row0 row3 | row1 row2, m7 = 0,
;            m8 = pixel maximum, r6 = 3*stride. Clears 64 bytes at cq, then
;            advances cq by 64 and dstq by four rows.
1518INV_TXFM_4X16_FN flipadst, dct
1519INV_TXFM_4X16_FN flipadst, adst
1520INV_TXFM_4X16_FN flipadst, flipadst
1521INV_TXFM_4X16_FN flipadst, identity
1522
1523cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
1524.pass1:
1525    call m(iadst_16x4_internal_10bpc).main
1526    vpbroadcastd         m6, [pd_6144]
1527    call m(iadst_16x4_internal_10bpc).main_end
1528    psrad                m0, m3, 13
1529    psrad                m1, m2, 13
1530    psrad                m2, m5, 13
1531    psrad                m3, m4, 13
1532    psrad                m4, m7, 13
1533    psrad                m5, m6, 13
1534    psrad                m6, m9, 13
1535    psrad                m7, m8, 13
1536    jmp                tx2q
1537.pass2:
1538    call m(iadst_4x16_internal_10bpc).pass2_main
1539    vpbroadcastd         m5, [pw_2048]
1540    vpbroadcastd         m8, [pixel_10bpc_max]
1541    lea                  r6, [strideq*3]
1542    vpblendd             m4, m3, m0, 0x33 ; -out0   out3   out1  -out2
1543    pshufd               m2, m2, q1032    ; -out11  out8   out10 -out9
1544    vpblendd             m3, m0, 0xcc     ; -out12  out15  out13 -out14
    ; m9 = -m5 in the outer rows and +m5 in the middle two: one pmulhrsw then
    ; both scales the data and corrects the signs.
1545    pxor                 m7, m7
1546    psubw                m9, m7, m5
1547    vpblendd             m9, m5, 0x3c     ; -2048   2048   2048  -2048
1548    pmulhrsw             m0, m4, m9
1549    call .write_4x4
1550    pmulhrsw             m0, m2, m9
1551    call .write_4x4
1552    pmulhrsw             m0, m1, m9
1553    call .write_4x4
1554    pmulhrsw             m0, m3, m9
1555    call .write_4x4
1556    RET
1557ALIGN function_align
1558.write_4x4:
    ; Gather four destination rows as row0 row3 | row1 row2.
1559    movq                xm4, [dstq+strideq*0]
1560    movhps              xm4, [dstq+r6       ]
1561    vpbroadcastq         m5, [dstq+strideq*1]
1562    vpbroadcastq         m6, [dstq+strideq*2]
1563    mova          [cq+32*0], m7
1564    mova          [cq+32*1], m7
1565    add                  cq, 32*2
1566    vpblendd             m4, m5, 0x30
1567    vpblendd             m4, m6, 0xc0
1568    paddw                m4, m0
1569    pmaxsw               m4, m7
1570    pminsw               m4, m8
1571    vextracti128        xm5, m4, 1
1572    movq   [dstq+strideq*0], xm4
1573    movq   [dstq+strideq*1], xm5
1574    movhps [dstq+strideq*2], xm5
1575    movhps [dstq+r6       ], xm4
1576    lea                dstq, [dstq+strideq*4]
1577    ret
1578
; iidentity_4x16_internal_10bpc -- identity 4x16 kernel, 10-bit.
; The INV_TXFM_4X16_FN lines instantiate the entry points for the four
; identity/<type2> combinations.
; Pass 1:       x = (x * pd_5793 + pd_6144) >> 13 over eight 32-byte rows at
;               cq, then jmp tx2q.
; .pass2:       packs to 16-bit, computes 2*x + pmulhrsw(x, pw_1697x16) with
;               saturating adds, then writes through .pass2_end.
; .pass2_end    In: m0-m3 = word data, m4 = pixel maximum, m8 = pmulhrsw
;               scale. Transposes and writes 16 rows via .write_2x4x2.
; .write_2x4x2  In: m0 = rows n, n+1 | n+4, n+5; m3 = 0; m4 = pixel maximum;
;               r6 = 5*stride. Adds to those four destination rows, clamps,
;               clears 64 bytes at cq, then advances cq by 64 and dstq by two
;               rows.
1579INV_TXFM_4X16_FN identity, dct
1580INV_TXFM_4X16_FN identity, adst
1581INV_TXFM_4X16_FN identity, flipadst
1582INV_TXFM_4X16_FN identity, identity
1583
1584cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2
1585    vpbroadcastd         m7, [pd_5793]
    ; Even rows land in m0-m3 and odd rows in m4-m7, matching the packssdw
    ; pairing used in .pass2.
1586    pmulld               m0, m7, [cq+32*0]
1587    pmulld               m4, m7, [cq+32*1]
1588    pmulld               m1, m7, [cq+32*2]
1589    pmulld               m5, m7, [cq+32*3]
1590    pmulld               m2, m7, [cq+32*4]
1591    pmulld               m6, m7, [cq+32*5]
1592    pmulld               m3, m7, [cq+32*6]
1593    pmulld               m7,     [cq+32*7]
1594    vpbroadcastd         m8, [pd_6144]
1595    REPX      {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
1596    REPX      {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
1597    jmp                tx2q
1598.pass2:
1599    packssdw             m0, m4
1600    packssdw             m1, m5
1601    packssdw             m2, m6
1602    packssdw             m3, m7
1603    vpbroadcastd         m7, [pw_1697x16]
1604    vpbroadcastd         m8, [pw_2048]
1605    pmulhrsw             m4, m7, m0
1606    pmulhrsw             m5, m7, m1
1607    pmulhrsw             m6, m7, m2
1608    pmulhrsw             m7, m3
    ; x = 2*x + (scaled x), both steps saturating.
1609    REPX      {paddsw x, x}, m0, m1, m2, m3
1610    paddsw               m0, m4
1611    paddsw               m1, m5
1612    paddsw               m2, m6
1613    paddsw               m3, m7
1614    vpbroadcastd         m4, [pixel_10bpc_max]
1615    call .pass2_end
1616    RET
1617ALIGN function_align
1618.pass2_end:
1619    punpckhwd            m7, m0, m1
1620    punpcklwd            m0, m1
1621    punpckhwd            m1, m2, m3
1622    punpcklwd            m2, m3
1623    lea                  r6, [strideq*5]
    ; m3 = 0: used by .write_2x4x2 to clear coefficients and as lower clamp.
1624    pxor                 m3, m3
1625    punpckhdq            m5, m0, m2 ; 2 3   6 7
1626    punpckldq            m0, m2     ; 0 1   4 5
1627    punpckldq            m6, m7, m1 ; 8 9   c d
1628    punpckhdq            m7, m1     ; a b   e f
1629    pmulhrsw             m0, m8
1630    call .write_2x4x2
1631    pmulhrsw             m0, m5, m8
1632    call .write_2x4x2
1633    pmulhrsw             m0, m6, m8
    ; dstq is at row 4 here; rows 4-7 were already written by the two calls
    ; above, so skip ahead to row 8.
1634    lea                dstq, [dstq+strideq*4]
1635    call .write_2x4x2
1636    pmulhrsw             m0, m7, m8
1637    call .write_2x4x2
1638    ret
1639ALIGN function_align
1640.write_2x4x2:
    ; Gather destination rows 0, 1 (low lane) and 4, 5 (high lane) relative
    ; to dstq.
1641    movq                xm1, [dstq+strideq*0]
1642    movhps              xm1, [dstq+strideq*1]
1643    vpbroadcastq         m2, [dstq+strideq*4]
1644    vpblendd             m1, m2, 0x30
1645    vpbroadcastq         m2, [dstq+r6       ]
1646    vpblendd             m1, m2, 0xc0
1647    mova          [cq+32*0], m3
1648    mova          [cq+32*1], m3
1649    add                  cq, 32*2
1650    paddw                m1, m0
1651    pmaxsw               m1, m3
1652    pminsw               m1, m4
1653    vextracti128        xm2, m1, 1
1654    movq   [dstq+strideq*0], xm1
1655    movhps [dstq+strideq*1], xm1
1656    movq   [dstq+strideq*4], xm2
1657    movhps [dstq+r6       ], xm2
1658    lea                dstq, [dstq+strideq*2]
1659    ret
1660
; Instantiate the 4x16 12bpc wrappers whose first transform type is dct.
; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt; its
; arguments presumably follow the type1, type2, bitdepth order used by the
; other INV_TXFM_*_FN macros in this file.
1661INV_TXFM_4X16_FN dct, dct,      12
1662INV_TXFM_4X16_FN dct, identity, 12
1663INV_TXFM_4X16_FN dct, adst,     12
1664INV_TXFM_4X16_FN dct, flipadst, 12
1665
; 12bpc DCT for 4x16 blocks. The first pass is shared with the 10bpc
; implementation; only the second pass (.pass2) is specific to 12bpc.
1666cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
1667    jmp m(idct_4x16_internal_10bpc).pass1
; Second pass: operates on dword coefficients in m0-m7.
1668.pass2:
    ; transpose the dwords (the trailing comments give the resulting rows)
1669    punpckldq            m8, m0, m1
1670    punpckhdq            m0, m1
1671    punpckldq            m9, m2, m3
1672    punpckhdq            m2, m3
1673    punpckldq            m1, m4, m5
1674    punpckhdq            m4, m5
1675    punpckldq            m3, m6, m7
1676    punpckhdq            m6, m7
1677    punpcklqdq           m5, m0, m2         ;  2  6
1678    punpckhqdq          m12, m0, m2         ;  3  7
1679    punpcklqdq           m0, m8, m9         ;  0  4
1680    punpckhqdq          m10, m8, m9         ;  1  5
1681    punpcklqdq           m2, m1, m3         ;  8 12
1682    punpckhqdq          m13, m1, m3         ;  9 13
1683    punpcklqdq           m9, m4, m6         ; 10 14
1684    punpckhqdq           m4, m6             ; 11 15
1685    vperm2i128           m1,  m5,  m9, 0x20 ;  2 10
1686    vperm2i128           m3,  m9,  m5, 0x31 ; 14  6
1687    vpermq              m11,  m4, q1302     ; 15 11
1688    ; interleave
1689    REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10
    ; clamp the intermediates to the clip_18b_min/clip_18b_max range
1690    vpbroadcastd         m8, [clip_18b_min]
1691    vpbroadcastd         m9, [clip_18b_max]
1692    REPX     {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13
1693    REPX     {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13
    ; reuse the 16x4 first-pass helpers for the transform itself
    ; NOTE(review): those helpers are outside this excerpt; their register
    ; contract is assumed from the operand setup here.
1694    call m(idct_16x4_internal_10bpc).pass1_main
1695    vpermq               m6, m12, q1302 ;  7  3
1696    vpermq               m5, m13, q3120 ;  9 13
1697    call m(idct_16x4_internal_10bpc).pass1_main2
1698    call m(idct_16x4_internal_10bpc).pass1_main3
    ; drop 3 bits, narrow to words and reorder for the shared write-out
1699    REPX       {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
1700    packssdw             m0, m1
1701    packssdw             m1, m2, m3
1702    packssdw             m2, m4, m5
1703    packssdw             m3, m6, m7
1704    mova                 m4, [idct16_12_shuf]
1705    REPX  {vpermd x, m4, x}, m0, m1, m2, m3
    ; constants handed to the 10bpc write-out code
1706    vpbroadcastd         m9, [pw_16384]
1707    vpbroadcastd         m8, [pixel_12bpc_max]
1708    call m(idct_4x16_internal_10bpc).pass2_end
1709    RET
1710
; Instantiate the 4x16 12bpc wrappers whose first transform type is adst.
; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt.
1711INV_TXFM_4X16_FN adst, dct,      12
1712INV_TXFM_4X16_FN adst, adst,     12
1713INV_TXFM_4X16_FN adst, flipadst, 12
1714INV_TXFM_4X16_FN adst, identity, 12
1715
; 12bpc ADST for 4x16 blocks.
; Pass 1 leaves eight dword registers in m0-m7 and jumps to the second-pass
; code whose address is held in tx2q.
1716cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
1717    call .main_pass1
    ; .main_pass1 has already added the rounding bias; shift by 12 while
    ; moving the results into m0-m7
1718    psrad                m0, m4, 12
1719    psrad                m1, m5, 12
1720    psrad                m2, 12
1721    psrad                m3, 12
1722    psrad                m4, m8, 12
1723    psrad                m5, m9, 12
1724    psrad                m6, 12
1725    psrad                m7, 12
1726    jmp                tx2q
; Second pass: clamp, transpose, run the 10bpc .main2 and reorder its outputs.
1727.pass2:
1728    vpbroadcastd        m12, [clip_18b_min]
1729    vpbroadcastd        m13, [clip_18b_max]
1730    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
1731    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
1732    call .transpose_16x4
1733    call m(iadst_4x16_internal_10bpc).main2
    ; shuffle the results into m0-m7; every register is shifted right by 3
    ; exactly once (m5/m7 here, the rest in the REPX below)
1734    pshufd               m4, m5, q1032
1735    psrad                m5, m6, 3
1736    pshufd               m6, m7, q1032
1737    psrad                m7, m8, 3
1738    REPX {pshufd x, x, q1032}, m0, m2
1739    REPX       {psrad x, 3}, m0, m1, m2, m3, m4, m6
; Shared write-out (also entered from iflipadst_4x16_internal_12bpc):
; narrows m0-m7 to words and adds four 4x4 groups to the destination.
1740.pass2_end:
1741    packssdw             m0, m1
1742    packssdw             m1, m2, m3
1743    packssdw             m2, m4, m5
1744    packssdw             m3, m6, m7
1745    mova                 m4, [iadst16_12_shuf]
1746    REPX  {vpermd x, m4, x}, m0, m1, m2, m3
1747    vpbroadcastd         m9, [pw_16384]
1748    vpbroadcastd         m8, [pixel_12bpc_max]
1749    lea                  r6, [strideq*3]
1750    pxor                 m7, m7
    ; NOTE(review): write_4x4 lives in the 10bpc function outside this
    ; excerpt; m7 (zero), m8 (pixel max) and r6 are presumably its inputs.
1751    pmulhrsw             m0, m9
1752    call m(iadst_4x16_internal_10bpc).write_4x4
1753    pmulhrsw             m0, m9, m1
1754    call m(iadst_4x16_internal_10bpc).write_4x4
1755    pmulhrsw             m0, m9, m2
1756    call m(iadst_4x16_internal_10bpc).write_4x4
1757    pmulhrsw             m0, m9, m3
1758    call m(iadst_4x16_internal_10bpc).write_4x4
1759    RET
; Transposes the dwords of m0-m7 into the row pairs given by the trailing
; comments below. Clobbers m8-m11.
1760ALIGN function_align
1761.transpose_16x4:
1762    ; transpose & interleave
1763    punpckldq            m8, m0, m1
1764    punpckhdq            m0, m1
1765    punpckldq            m9, m2, m3
1766    punpckhdq            m2, m3
1767    punpckldq            m1, m4, m5
1768    punpckhdq            m4, m5
1769    punpckldq            m3, m6, m7
1770    punpckhdq            m6, m7
1771    punpcklqdq          m10, m8, m0
1772    punpckhqdq           m0, m8
1773    punpcklqdq          m11, m9, m2
1774    punpckhqdq           m2, m9
1775    punpcklqdq           m8, m1, m4
1776    punpckhqdq           m4, m1
1777    punpcklqdq           m9, m3, m6
1778    punpckhqdq           m6, m3
1779    vperm2i128           m5,  m0,  m2, 0x31   ;  7  5
1780    vperm2i128           m7,  m0,  m2, 0x20   ;  3  1
1781    vperm2i128           m0, m10, m11, 0x20   ;  0  2
1782    vperm2i128           m2, m10, m11, 0x31   ;  4  6
1783    vperm2i128           m1,  m4,  m6, 0x31   ; 15 13
1784    vperm2i128           m3,  m4,  m6, 0x20   ; 11  9
1785    vperm2i128           m4,  m8,  m9, 0x20   ;  8 10
1786    vperm2i128           m6,  m8,  m9, 0x31   ; 12 14
1787    ret
; First-pass core shared with iflipadst_4x16_internal_12bpc. Runs the 16x4
; 10bpc ADST main routine, forms the final sums/differences, halves them
; (psrad 1) and adds the pd_3072 bias so callers only need to shift by 12.
; Results are left in m2, m3, m4, m5, m6, m7, m8, m9.
1788ALIGN function_align
1789.main_pass1:
1790    call m(iadst_16x4_internal_10bpc).main
1791    vpbroadcastd         m6, [pd_3072]
1792    paddd               m10, m4, m5
1793    psubd                m4, m3
1794    psubd                m5, m3
1795    paddd                m3, m10
1796    psubd                m8, m7, m1
1797    paddd                m7, m9
1798    psubd                m9, m1
1799    paddd                m7, m1
1800    REPX      {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7
1801    REPX      {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7
    ; m0 is biased last and the sum lands in m6, replacing the bias constant
1802    paddd                m6, m0
1803    ret
1804
; Instantiate the 4x16 12bpc wrappers whose first transform type is flipadst.
; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt.
1805INV_TXFM_4X16_FN flipadst, dct,      12
1806INV_TXFM_4X16_FN flipadst, adst,     12
1807INV_TXFM_4X16_FN flipadst, flipadst, 12
1808INV_TXFM_4X16_FN flipadst, identity, 12
1809
; 12bpc flipped ADST for 4x16 blocks. Uses the same computation as
; iadst_4x16_internal_12bpc but picks the results up in a different
; (flipped) register order.
1810cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
1811    call m(iadst_4x16_internal_12bpc).main_pass1
    ; shift by 12 while permuting the .main_pass1 outputs into m0-m7
1812    psrad                m0, m3, 12
1813    psrad                m1, m2, 12
1814    psrad                m2, m5, 12
1815    psrad                m3, m4, 12
1816    psrad                m4, m7, 12
1817    psrad                m5, m6, 12
1818    psrad                m6, m9, 12
1819    psrad                m7, m8, 12
1820    jmp                tx2q
; Second pass: clamp, transpose, run the 10bpc .main2, then reorder.
1821.pass2:
1822    vpbroadcastd        m12, [clip_18b_min]
1823    vpbroadcastd        m13, [clip_18b_max]
1824    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
1825    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
1826    call m(iadst_4x16_internal_12bpc).transpose_16x4
1827    call m(iadst_4x16_internal_10bpc).main2
    ; permute the outputs of .main2 into m0-m7; every register is shifted
    ; right by 3 exactly once (odd registers here, even ones in the REPX)
1828    pshufd               m4, m3, q1032
1829    psrad                m3, m5, 3
1830    psrad                m5, m2, 3
1831    pshufd               m2, m6, q1032
1832    pshufd               m6, m1, q1032
1833    psrad                m1, m7, 3
1834    psrad                m7, m0, 3
1835    pshufd               m0, m8, q1032
1836    REPX       {psrad x, 3}, m0, m2, m4, m6
    ; tail-jump into the shared pack/write-out code
1837    jmp m(iadst_4x16_internal_12bpc).pass2_end
1838
; Instantiate the 4x16 12bpc wrappers whose first transform type is identity.
; NOTE(review): INV_TXFM_4X16_FN is defined outside this excerpt.
1839INV_TXFM_4X16_FN identity, dct,      12
1840INV_TXFM_4X16_FN identity, adst,     12
1841INV_TXFM_4X16_FN identity, flipadst, 12
1842INV_TXFM_4X16_FN identity, identity, 12
1843
; 12bpc identity transform for 4x16 blocks.
; Pass 1 computes, for each of the 64 dword coefficients loaded from cq,
;     y = (x + ((x*1697 + 6144) >> 12)) >> 1
; in two interleaved batches of four registers, leaving the results in m0-m7.
1844cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
1845    vpbroadcastd         m8, [pd_1697]
1846    mova                 m0, [cq+32*0]
1847    mova                 m4, [cq+32*1]
1848    mova                 m1, [cq+32*2]
1849    mova                 m5, [cq+32*3]
1850    vpbroadcastd         m9, [pd_6144]
1851    pmulld               m2, m8, m0
1852    pmulld               m6, m8, m4
1853    pmulld               m3, m8, m1
1854    pmulld               m7, m8, m5
    ; second batch of inputs is kept in m10-m13 until the first is finished
1855    mova                m10, [cq+32*4]
1856    mova                m11, [cq+32*5]
1857    mova                m12, [cq+32*6]
1858    mova                m13, [cq+32*7]
1859    REPX     {paddd  x, m9}, m2, m6, m3, m7
1860    REPX     {psrad  x, 12}, m2, m6, m3, m7
    ; finish batch one (x + scaled x) while starting the multiplies of batch two
1861    paddd                m0, m2
1862    pmulld               m2, m8, m10
1863    paddd                m4, m6
1864    pmulld               m6, m8, m11
1865    paddd                m1, m3
1866    pmulld               m3, m8, m12
1867    paddd                m5, m7
1868    pmulld               m7, m8, m13
1869    REPX     {psrad  x, 1 }, m0, m4, m1, m5
1870    REPX     {paddd  x, m9}, m2, m6, m3, m7
1871    REPX     {psrad  x, 12}, m2, m6, m3, m7
1872    paddd                m2, m10
1873    paddd                m6, m11
1874    paddd                m3, m12
1875    paddd                m7, m13
1876    REPX     {psrad  x, 1 }, m2, m6, m3, m7
1877    jmp                tx2q
; Second pass: clamp to the clip_18b range, compute (x*5793 + 1024) >> 14,
; narrow to words and hand over to the 10bpc write-out code.
1878.pass2:
1879    vpbroadcastd        m12, [clip_18b_min]
1880    vpbroadcastd        m13, [clip_18b_max]
1881    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
1882    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
1883    vpbroadcastd         m8, [pd_5793]
1884    vpbroadcastd         m9, [pd_1024]
1885    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1886    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
1887    REPX     {psrad  x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
1888    packssdw             m0, m4
1889    packssdw             m1, m5
1890    packssdw             m2, m6
1891    packssdw             m3, m7
    ; inputs of the shared tail: m8 = word multiplier, m4 = pixel maximum
1892    vpbroadcastd         m8, [pw_16384]
1893    vpbroadcastd         m4, [pixel_12bpc_max]
1894    call m(iidentity_4x16_internal_10bpc).pass2_end
1895    RET
1896
; Emits the 8x4 wrapper for one (type1, type2) pair via INV_TXFM_FN; the
; bitdepth argument defaults to 10. For the dct_dct pair it additionally emits
; the DC-only code: the 10bpc instance defines the .dconly label, the 12bpc
; instance loads its own dconly constant into m2 and jumps to that label.
; NOTE(review): INV_TXFM_FN is defined outside this excerpt, so how control
; reaches the code following it is not visible here.
1897%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth
1898    INV_TXFM_FN          %1, %2, 0, 8x4, %3
1899%ifidn %1_%2, dct_dct
1900    vpbroadcastd         m2, [dconly_%3bpc]
1901%if %3 = 10
1902.dconly:
1903    imul                r6d, [cq], 181
1904    mov                [cq], eobd ; 0
    ; r3d becomes the row counter consumed by the 8x8 .dconly_loop
1905    or                  r3d, 4
    ; two rounds of (x*181 + 128) >> 8
1906    add                 r6d, 128
1907    sar                 r6d, 8
1908    imul                r6d, 181
1909    add                 r6d, 128
1910    sar                 r6d, 8
    ; the final scaling and the store loop are shared with the 8x8 DC-only code
1911    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
1912%else
1913    jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly
1914%endif
1915%endif
1916%endmacro
1917
; Instantiate the 8x4 10bpc wrappers whose first transform type is dct
; (bitdepth defaults to 10).
1918INV_TXFM_8X4_FN dct, dct
1919INV_TXFM_8X4_FN dct, identity
1920INV_TXFM_8X4_FN dct, adst
1921INV_TXFM_8X4_FN dct, flipadst
1922
; 10bpc DCT for 8x4 blocks.
; m8/m9 hold the clamp bounds used by .main; the 12bpc variant loads wider
; bounds and enters at .pass1.
1923cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
1924    vpbroadcastd         m8, [clip_18b_min]
1925    vpbroadcastd         m9, [clip_18b_max]
1926.pass1:
    ; load the eight 16-byte coefficient rows, pairing two rows per register
    ; (pairs listed in the trailing comments)
1927    vbroadcasti128       m1, [cq+16*1]
1928    vbroadcasti128       m0, [cq+16*5]
1929    vbroadcasti128       m2, [cq+16*3]
1930    vbroadcasti128       m3, [cq+16*7]
1931    vpbroadcastd         m6, [pd_2896]
1932    shufpd               m1, m0, 0x0c ; 1 5
1933    shufpd               m3, m2, 0x0c ; 7 3
1934    vbroadcasti128       m0, [cq+16*0]
1935    vbroadcasti128       m4, [cq+16*2]
1936    vbroadcasti128       m2, [cq+16*4]
1937    vbroadcasti128       m5, [cq+16*6]
1938    vpbroadcastd         m7, [pd_2048]
1939    shufpd               m0, m4, 0x0c ; 0 2
1940    shufpd               m2, m5, 0x0c ; 4 6
    ; pre-scale every input: (x*2896 + 2048) >> 12
1941    REPX {pmulld x, m6}, m1, m3, m0, m2
1942    REPX {paddd  x, m7}, m1, m3, m0, m2
1943    REPX {psrad  x, 12}, m1, m3, m0, m2
1944    call .main
    ; final butterflies between the even half (m0, m2) and odd half (m4, m5)
1945    psubd                m3, m0, m4  ; out7 out6 (interleaved)
1946    paddd                m0, m4      ; out0 out1 (interleaved)
1947    paddd                m1, m2, m5  ; out3 out2 (interleaved)
1948    psubd                m2, m5      ; out4 out5 (interleaved)
1949    pshufd               m1, m1, q1032
1950    pshufd               m3, m3, q1032
1951    jmp                tx2q
; Second pass: narrow to words, run the 4-point transform on packed words and
; finish through the shared store code in iadst_8x4_internal_10bpc.
1952.pass2:
1953    vbroadcasti128       m4, [deint_shuf]
1954    packssdw             m0, m1
1955    packssdw             m2, m3
1956    vperm2i128           m1, m0, m2, 0x31
1957    vinserti128          m0, xm2, 1
1958    pshufb               m0, m4
1959    pshufb               m1, m4
1960    IDCT4_1D_PACKED_WORD  0, 1, 2, 3, 4, 7
1961    vpermq               m0, m0, q3120 ; out0 out1
1962    vpermq               m2, m1, q2031 ; out2 out3
1963    jmp m(iadst_8x4_internal_10bpc).end
; 8-point core working on paired registers.
; In:  m0 = rows 0/2, m2 = rows 4/6, m1 = rows 1/5, m3 = rows 7/3,
;      m7 = pd_2048, m8/m9 = clamp min/max.
; Out: m0, m2 = clamped even-half results; m5 = t4 t5; m4 = t7 t6.
1964ALIGN function_align
1965.main:
1966    ITX_MULSUB_2D         1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
1967    IDCT4_1D_PACKED       0, 2, 4, 5, 6, 7
1968    vpbroadcastd         m6, [pd_2896]
1969    punpcklqdq           m4, m1, m3   ; t4a  t7a
1970    punpckhqdq           m1, m3       ; t5a  t6a
1971    psubd                m3, m4, m1   ; t5a  t6a
1972    paddd                m4, m1       ; t4   t7
1973    REPX     {pmaxsd x, m8}, m3, m4, m0, m2
1974    REPX     {pminsd x, m9}, m3, m4, m0, m2
    ; scale by 2896 and form the sum/difference of the two qword-swapped
    ; halves, rounding with pd_2048 before the shift by 12
1975    pmulld               m3, m6
1976    pshufd               m1, m3, q1032
1977    paddd                m3, m7
1978    psubd                m5, m3, m1
1979    paddd                m1, m3
1980    psrad                m5, 12
1981    psrad                m1, 12
1982    vpblendd             m5, m4, 0x33 ; t4   t5
1983    punpckhqdq           m4, m1       ; t7   t6
1984    ret
1985
; Instantiate the 8x4 10bpc wrappers whose first transform type is adst.
1986INV_TXFM_8X4_FN adst, dct
1987INV_TXFM_8X4_FN adst, adst
1988INV_TXFM_8X4_FN adst, flipadst
1989INV_TXFM_8X4_FN adst, identity
1990
; 10bpc ADST for 8x4 blocks. The first pass reuses the 4x8 ADST core and
; rearranges its outputs; .end/.end2 are the store code shared by several
; 8x4 functions.
1991cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
1992    call m(iadst_4x8_internal_10bpc).main
1993    vpblendd             m3, m0, m4, 0x33 ; out6 out7
1994    vpblendd             m0, m4, 0xcc     ; out0 out1
1995    pshufd               m1, m5, q1032
    ; NOTE(review): m6 is left by the 4x8 core (outside this excerpt) and is
    ; presumably a per-lane sign pattern for psignd.
1996    psignd               m2, m6           ; out4 out5
1997    psignd               m1, m6           ; out2 out3
1998    jmp                tx2q
1999.pass2:
2000    call .pass2_main
2001    vpermq               m0, m0, q3120 ; out0 out1
2002    vpermq               m2, m1, q3120 ; out2 out3
; In: m0 = rows 0-1, m2 = rows 2-3 as packed words. Scales by pw_2048 and
; loads the 10bpc pixel maximum.
2003.end:
2004    vpbroadcastd         m1, [pw_2048]
2005    pmulhrsw             m0, m1
2006    pmulhrsw             m1, m2
2007    vpbroadcastd         m5, [pixel_10bpc_max]
; In: m0, m1 = final residual words for rows 0-1 and 2-3, m5 = pixel maximum.
; Adds the residual to four 16-byte destination rows, clamps to [0, m5],
; stores, and clears the 128 bytes of coefficients at cq.
2008.end2:
2009    mova                xm2, [dstq+strideq*0]
2010    vinserti128          m2, [dstq+strideq*1], 1
2011    lea                  r6, [dstq+strideq*2]
2012    mova                xm3, [r6  +strideq*0]
2013    vinserti128          m3, [r6  +strideq*1], 1
2014    pxor                 m4, m4
2015    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
2016    paddw                m0, m2
2017    paddw                m1, m3
2018    pmaxsw               m0, m4
2019    pmaxsw               m1, m4
2020    pminsw               m0, m5
2021    pminsw               m1, m5
2022    mova         [dstq+strideq*0], xm0
2023    vextracti128 [dstq+strideq*1], m0, 1
2024    mova         [r6  +strideq*0], xm1
2025    vextracti128 [r6  +strideq*1], m1, 1
2026    RET
; Narrows the first-pass dwords to words, deinterleaves them with deint_shuf
; and tail-jumps into the 8bpc ADST core.
; NOTE(review): r6 = deint_shuf+128 is presumably the base pointer the 8bpc
; code expects for its constants - confirm against the 8bpc source.
2027ALIGN function_align
2028.pass2_main:
2029    vbroadcasti128       m4, [deint_shuf]
2030    packssdw             m0, m1
2031    packssdw             m2, m3
2032    lea                  r6, [deint_shuf+128]
2033    vperm2i128           m1, m0, m2, 0x31
2034    vinserti128          m0, xm2, 1
2035    pshufb               m0, m4
2036    pshufb               m1, m4
2037    jmp m(iadst_8x4_internal_8bpc).main
; 4-point ADST helper: loads four coefficient registers from cq, pre-scales
; them with (x*2896 + 2048) >> 12 and runs IADST4_1D. .main2 skips the load
; and pre-scale and expects the inputs already in m0-m3.
2038ALIGN function_align
2039.main:
2040    vpbroadcastd         m1, [pd_2896]
2041    pmulld               m0, m1, [cq+32*0]
2042    pmulld               m3, m1, [cq+32*3]
2043    pmulld               m2, m1, [cq+32*2]
2044    pmulld               m1,     [cq+32*1]
2045    vpbroadcastd         m4, [pd_2048]
2046    REPX      {paddd x, m4}, m0, m3, m2, m1
2047    REPX      {psrad x, 12}, m0, m3, m2, m1
2048.main2:
2049    IADST4_1D
2050    ret
2051
; Instantiate the 8x4 10bpc wrappers whose first transform type is flipadst.
2052INV_TXFM_8X4_FN flipadst, dct
2053INV_TXFM_8X4_FN flipadst, adst
2054INV_TXFM_8X4_FN flipadst, flipadst
2055INV_TXFM_8X4_FN flipadst, identity
2056
; 10bpc flipped ADST for 8x4 blocks: same 4x8 ADST core as
; iadst_8x4_internal_10bpc, with the outputs arranged in a different order.
2057cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2
2058    call m(iadst_4x8_internal_10bpc).main
2059    shufpd               m3, m4, m0, 0x05
2060    shufpd               m0, m4, 0x05
2061    psignd               m2, m6
    ; swap the dword pairs of the sign register before applying it to m5
2062    pshufd               m6, m6, q1032
2063    pshufd               m1, m2, q1032
2064    psignd               m2, m5, m6
2065    jmp                tx2q
; Second pass: run the shared ADST second pass, then place its two result
; registers in swapped positions (with a q2031 qword permute) before storing.
2066.pass2:
2067    call m(iadst_8x4_internal_10bpc).pass2_main
2068    vpermq               m2, m0, q2031
2069    vpermq               m0, m1, q2031
2070    jmp m(iadst_8x4_internal_10bpc).end
2071
; Instantiate the 8x4 10bpc wrappers whose first transform type is identity.
2072INV_TXFM_8X4_FN identity, dct
2073INV_TXFM_8X4_FN identity, adst
2074INV_TXFM_8X4_FN identity, flipadst
2075INV_TXFM_8X4_FN identity, identity
2076
; 10bpc identity transform for 8x4 blocks.
; Pass 1 computes 2 * ((x*2896 + 2048) >> 12) for every coefficient and
; leaves pd_2048 in m7 for the second pass.
2077cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2
2078.pass1:
2079    vpbroadcastd         m4, [pd_2896]
2080    vpermq               m0, [cq+32*0], q3120
2081    vpermq               m1, [cq+32*1], q3120
2082    vpermq               m2, [cq+32*2], q3120
2083    vpermq               m3, [cq+32*3], q3120
2084    vpbroadcastd         m7, [pd_2048]
2085    REPX     {pmulld x, m4}, m0, m1, m2, m3
2086    REPX     {paddd  x, m7}, m0, m1, m2, m3
2087    REPX     {psrad  x, 12}, m0, m1, m2, m3
2088    REPX     {paddd  x, x }, m0, m1, m2, m3
2089    jmp                tx2q
; Second pass: narrow to words and add the pw_1697x8-scaled copy of each
; value to itself (saturating).
2090.pass2:
2091    vpbroadcastd         m5, [pixel_10bpc_max]
2092    vpbroadcastd         m4, [pw_1697x8]
2093    packssdw             m0, m1
2094    packssdw             m2, m3
2095    pmulhrsw             m1, m4, m0
2096    pmulhrsw             m4, m2
2097    paddsw               m0, m1
2098    paddsw               m2, m4
    ; packing the pd_2048 dwords left in m7 yields the same value as words
2099    packssdw             m7, m7 ; pw_2048
; Shared tail (also used by the 12bpc variant).
; In: m0, m2 = packed words, m7 = word multiplier, m5 = pixel maximum.
; Transposes, scales with pmulhrsw, adds to four destination rows, clamps to
; [0, m5] and clears the 128 bytes of coefficients at cq.
2100.pass2_end:
2101    punpckhwd            m1, m0, m2
2102    punpcklwd            m0, m2
2103    lea                  r6, [dstq+strideq*2]
2104    punpckhwd            m2, m0, m1
2105    punpcklwd            m0, m1
2106    pmulhrsw             m2, m7
2107    pmulhrsw             m0, m7
2108    punpckhwd            m1, m0, m2
2109    punpcklwd            m0, m2
    ; here each register pairs row n (low lane) with row n+2 (high lane)
2110    mova                xm2, [dstq+strideq*0]
2111    vinserti128          m2, [r6  +strideq*0], 1
2112    mova                xm3, [dstq+strideq*1]
2113    vinserti128          m3, [r6  +strideq*1], 1
2114    pxor                 m4, m4
2115    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
2116    paddw                m0, m2
2117    paddw                m1, m3
2118    pmaxsw               m0, m4
2119    pmaxsw               m1, m4
2120    pminsw               m0, m5
2121    pminsw               m1, m5
2122    mova         [dstq+strideq*0], xm0
2123    mova         [dstq+strideq*1], xm1
2124    vextracti128 [r6  +strideq*0], m0, 1
2125    vextracti128 [r6  +strideq*1], m1, 1
2126    RET
2127
; Instantiate the 8x4 12bpc wrappers whose first transform type is dct.
2128INV_TXFM_8X4_FN dct, dct,      12
2129INV_TXFM_8X4_FN dct, identity, 12
2130INV_TXFM_8X4_FN dct, adst,     12
2131INV_TXFM_8X4_FN dct, flipadst, 12
2132
; 12bpc DCT for 8x4 blocks. The first pass is the 10bpc code run with the
; wider clip_20b clamp bounds in m8/m9.
2133cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
2134    vpbroadcastd         m8, [clip_20b_min]
2135    vpbroadcastd         m9, [clip_20b_max]
2136    jmp m(idct_8x4_internal_10bpc).pass1
; Second pass: stays in dword precision - clamp to the clip_18b range,
; transpose, run the 4-point DCT and finish through the shared 12bpc tail.
2137.pass2:
2138    vpbroadcastd         m8, [clip_18b_min]
2139    vpbroadcastd         m9, [clip_18b_max]
2140    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
2141    REPX     {pminsd x, m9}, m0, m1, m2, m3
2142    call m(iadst_8x4_internal_12bpc).transpose_4x8
    ; NOTE(review): m7 is not reloaded in this pass; it is presumably still
    ; the pd_2048 constant set by the first pass - confirm for all type1 paths.
2143    IDCT4_1D              0, 1, 2, 3, 4, 5, 6, 7
2144    jmp m(iadst_8x4_internal_12bpc).end
2145
; Instantiate the 8x4 12bpc wrappers whose first transform type is adst.
2146INV_TXFM_8X4_FN adst, dct,      12
2147INV_TXFM_8X4_FN adst, adst,     12
2148INV_TXFM_8X4_FN adst, flipadst, 12
2149INV_TXFM_8X4_FN adst, identity, 12
2150
; 12bpc ADST for 8x4 blocks. The first pass enters the 4x8 10bpc core at
; .main2 with the clip_20b bounds loaded in m8/m9; the output shuffle is the
; same as in the 10bpc function.
2151cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
2152    vpbroadcastd         m8, [clip_20b_min]
2153    vpbroadcastd         m9, [clip_20b_max]
2154    call m(iadst_4x8_internal_10bpc).main2
2155    vpblendd             m3, m0, m4, 0x33 ; out6 out7
2156    vpblendd             m0, m4, 0xcc     ; out0 out1
2157    pshufd               m1, m5, q1032
2158    psignd               m2, m6           ; out4 out5
2159    psignd               m1, m6           ; out2 out3
2160    jmp                tx2q
; Second pass in dword precision.
2161.pass2:
2162    vpbroadcastd         m8, [clip_18b_min]
2163    vpbroadcastd         m9, [clip_18b_max]
2164    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
2165    REPX     {pminsd x, m9}, m0, m1, m2, m3
2166    call .pass2_main
    ; add the pd_2048 rounding bias; two of the four results are read from
    ; m4 and m6
2167    vpbroadcastd         m5, [pd_2048]
2168    paddd                m0, m5, m4
2169    paddd                m1, m5, m6
2170    paddd                m2, m5
2171    paddd                m3, m5
; Entered with the bias already added (also from the flipadst variant).
2172.pass2_end:
2173    REPX      {psrad x, 12}, m0, m1, m2, m3
; Shared 12bpc tail (also used by idct_8x4_internal_12bpc): drop 3 bits,
; narrow to words, scale by pw_16384 and store through the 10bpc .end2 with
; the 12bpc pixel maximum in m5.
2174.end:
2175    vpbroadcastd         m4, [pw_16384]
2176    REPX       {psrad x, 3}, m0, m1, m2, m3
2177    packssdw             m0, m1
2178    packssdw             m2, m3
2179    pmulhrsw             m0, m4
2180    pmulhrsw             m1, m2, m4
2181    vpermq               m0, m0, q3120 ; out0 out1
2182    vpermq               m1, m1, q3120 ; out2 out3
2183    vpbroadcastd         m5, [pixel_12bpc_max]
2184    jmp m(iadst_8x4_internal_10bpc).end2
; Transposes the input and tail-jumps into the 4-point ADST (.main2 of the
; 10bpc function, i.e. IADST4_1D without the load and pre-scale).
2185ALIGN function_align
2186.pass2_main:
2187    call .transpose_4x8
2188    jmp m(iadst_8x4_internal_10bpc).main2
; Deinterleaves and transposes the dwords held in m0-m3 so that each of
; m0-m3 holds one row (out0-out3). Clobbers m4 and m5.
2189ALIGN function_align
2190.transpose_4x8:
2191    ; deinterleave
2192    pshufd               m0, m0, q3120
2193    pshufd               m1, m1, q3120
2194    pshufd               m2, m2, q3120
2195    pshufd               m3, m3, q3120
2196    ; transpose
2197    punpcklqdq           m4, m0, m1
2198    punpckhqdq           m0, m1
2199    punpcklqdq           m5, m2, m3
2200    punpckhqdq           m2, m3
2201    vperm2i128           m1, m0, m2, 0x20   ; out1
2202    vperm2i128           m3, m0, m2, 0x31   ; out3
2203    vperm2i128           m2, m4, m5, 0x31   ; out2
2204    vperm2i128           m0, m4, m5, 0x20   ; out0
2205    ret
2206
; Instantiate the 8x4 12bpc wrappers whose first transform type is flipadst.
2207INV_TXFM_8X4_FN flipadst, dct,      12
2208INV_TXFM_8X4_FN flipadst, adst,     12
2209INV_TXFM_8X4_FN flipadst, flipadst, 12
2210INV_TXFM_8X4_FN flipadst, identity, 12
2211
; 12bpc flipped ADST for 8x4 blocks. The first pass mirrors the 10bpc
; flipadst code, but enters the 4x8 core at .main2 with the clip_20b bounds
; loaded in m8/m9.
2212cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
2213    vpbroadcastd         m8, [clip_20b_min]
2214    vpbroadcastd         m9, [clip_20b_max]
2215    call m(iadst_4x8_internal_10bpc).main2
2216    shufpd               m3, m4, m0, 0x05
2217    shufpd               m0, m4, 0x05
2218    psignd               m2, m6
2219    pshufd               m6, m6, q1032
2220    pshufd               m1, m2, q1032
2221    psignd               m2, m5, m6
2222    jmp                tx2q
; Second pass: same computation as the 12bpc adst second pass, with the four
; results assigned to the output registers in reverse order.
2223.pass2:
2224    vpbroadcastd         m8, [clip_18b_min]
2225    vpbroadcastd         m9, [clip_18b_max]
2226    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
2227    REPX     {pminsd x, m9}, m0, m1, m2, m3
2228    call m(iadst_8x4_internal_12bpc).pass2_main
2229    vpbroadcastd         m5, [pd_2048]
    ; add the rounding bias while reversing the result order; m0 and m1 are
    ; written first, so m3 and m2 are read before they are overwritten
2230    paddd                m0, m5, m3
2231    paddd                m1, m5, m2
2232    paddd                m3, m5, m4
2233    paddd                m2, m5, m6
2234    jmp m(iadst_8x4_internal_12bpc).pass2_end
2235
; Instantiate the 8x4 12bpc wrappers whose first transform type is identity.
2236INV_TXFM_8X4_FN identity, dct,      12
2237INV_TXFM_8X4_FN identity, adst,     12
2238INV_TXFM_8X4_FN identity, flipadst, 12
2239INV_TXFM_8X4_FN identity, identity, 12
2240
; 12bpc identity transform for 8x4 blocks. The first pass is shared with the
; 10bpc implementation; the second pass stays in dword precision until the
; shared tail.
2241cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
2242    jmp m(iidentity_8x4_internal_10bpc).pass1
2243.pass2:
2244    ; m0 = in0 in1 (interleaved)
2245    ; m1 = in2 in3 (interleaved)
2246    ; m2 = in4 in5 (interleaved)
2247    ; m3 = in6 in7 (interleaved)
2248    vpbroadcastd         m8, [clip_18b_min]
2249    vpbroadcastd         m9, [clip_18b_max]
2250    REPX     {pmaxsd x, m8}, m0, m1, m2, m3
2251    REPX     {pminsd x, m9}, m0, m1, m2, m3
    ; (x*5793 + m7) >> 15
    ; NOTE(review): m7 is not loaded in this pass; it is presumably the
    ; pd_2048 value left by the first pass - confirm for all type1 paths.
2252    vpbroadcastd         m4, [pd_5793]
2253    REPX     {pmulld x, m4}, m0, m1, m2, m3
2254    REPX     {paddd  x, m7}, m0, m1, m2, m3
2255    REPX     {psrad  x, 15}, m0, m1, m2, m3
    ; inputs of the shared tail: m5 = pixel maximum, m7 = word multiplier
2256    vpbroadcastd         m5, [pixel_12bpc_max]
2257    vpbroadcastd         m7, [pw_16384]
2258    packssdw             m0, m1
2259    packssdw             m2, m3
2260    jmp m(iidentity_8x4_internal_10bpc).pass2_end
2261
; Emits the 8x8 wrapper for one (type1, type2) pair via INV_TXFM_FN; the
; bitdepth argument defaults to 10. For the dct_dct pair it additionally emits
; the DC-only code. The 10bpc instance defines the labels .dconly, .dconly2,
; .dconly3 and .dconly_loop (.dconly3 is also a jump target of the 8x4
; DC-only code); the 12bpc instance loads its own dconly constant into m2 and
; jumps to the 10bpc .dconly.
; NOTE(review): INV_TXFM_FN is defined outside this excerpt, so how control
; reaches the code following it is not visible here.
2262%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
2263    INV_TXFM_FN          %1, %2, 0, 8x8, %3
2264%ifidn %1_%2, dct_dct
2265    vpbroadcastd         m2, [dconly_%3bpc]
2266%if %3 = 10
2267.dconly:
2268    imul                r6d, [cq], 181
2269    mov                [cq], eobd ; 0
    ; r3d becomes the row counter of .dconly_loop
2270    or                  r3d, 8
2271.dconly2:
2272    add                 r6d, 384
2273    sar                 r6d, 9
2274.dconly3:
2275    imul                r6d, 181
2276    add                 r6d, 2176
2277    sar                 r6d, 12
    ; broadcast (dc + dconly constant) as words
2278    movd                xm0, r6d
2279    paddsw              xm0, xm2
2280    vpbroadcastw         m0, xm0
; Two 16-byte rows per iteration: signed-saturating add of the biased DC,
; then unsigned-saturating subtract of the dconly constant.
; NOTE(review): the constant is presumably chosen so that this add/subtract
; pair clamps the result to the valid pixel range.
2281.dconly_loop:
2282    mova                xm1, [dstq+strideq*0]
2283    vinserti128          m1, [dstq+strideq*1], 1
2284    paddsw               m1, m0
2285    psubusw              m1, m2
2286    mova         [dstq+strideq*0], xm1
2287    vextracti128 [dstq+strideq*1], m1, 1
2288    lea                dstq, [dstq+strideq*2]
2289    sub                 r3d, 2
2290    jg .dconly_loop
2291    RET
2292%else
2293    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
2294%endif
2295%endif
2296%endmacro
2297
; One-dimensional 8-point ADST on dword registers.
; Arguments are register numbers: eight sources, three temporaries, the
; register holding pd_2048, and the clamp minimum / maximum registers.
; On exit (per the trailing comments in the body):
;   src1 = out0, src2 = -out1, src7 = out6, src8 = -out7
;   src4 = (t2 + t3) * 1448, src5 = (t2 - t3) * 1448
;   src3 = (t6 + t7) * 1448, src6 = (t6 - t7) * 1448
; The four products are neither rounded nor shifted here; tmp1 is left
; holding the broadcast pd_1448 constant.
2298%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
2299    ITX_MULSUB_2D        %8, %1, %9, %10, %11, %12,  401, 4076 ; t1a, t0a
2300    ITX_MULSUB_2D        %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
2301    ITX_MULSUB_2D        %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
2302    ITX_MULSUB_2D        %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
2303    psubd               m%9, m%3, m%7 ; t6
2304    paddd               m%3, m%7      ; t2
2305    psubd               m%7, m%1, m%5 ; t4
2306    paddd               m%1, m%5      ; t0
2307    psubd               m%5, m%6, m%2 ; t7
2308    paddd               m%6, m%2      ; t3
2309    psubd               m%2, m%8, m%4 ; t5
2310    paddd               m%8, m%4      ; t1
2311    REPX   {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
2312    REPX   {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
    ; NOTE(review): the second call passes a register number in place of the
    ; last coefficient; ITX_MULSUB_2D (defined elsewhere) presumably reuses
    ; the constant still loaded in that register from the previous call.
2313    ITX_MULSUB_2D        %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
2314    ITX_MULSUB_2D        %5, %9, %4, %10, %11, %12, 3784, %11  ; t6a, t7a
2315    psubd              m%10, m%7, m%9 ;  t7
2316    paddd               m%7, m%9      ;  out6
2317    vpbroadcastd        m%9, [pd_1448]
2318    psubd               m%4, m%8, m%6 ;  t3
2319    paddd               m%8, m%6      ; -out7
2320    psubd               m%6, m%1, m%3 ;  t2
2321    paddd               m%1, m%3      ;  out0
2322    psubd               m%3, m%2, m%5 ;  t6
2323    paddd               m%2, m%5      ; -out1
2324    REPX   {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
2325    REPX   {pminsd x, m%14}, m%6, m%4, m%3, m%10
2326    REPX   {pmulld x, m%9 }, m%6, m%4, m%3, m%10
2327    psubd               m%5, m%6, m%4  ; (t2 - t3) * 1448
2328    paddd               m%4, m%6       ; (t2 + t3) * 1448
2329    psubd               m%6, m%3, m%10 ; (t6 - t7) * 1448
2330    paddd               m%3, m%10      ; (t6 + t7) * 1448
2331%endmacro
2332
; Instantiate the 8x8 10bpc wrappers whose first transform type is dct
; (bitdepth defaults to 10).
2333INV_TXFM_8X8_FN dct, dct
2334INV_TXFM_8X8_FN dct, identity
2335INV_TXFM_8X8_FN dct, adst
2336INV_TXFM_8X8_FN dct, flipadst
2337
; 10bpc DCT for 8x8 blocks.
; m12/m13 hold the clamp bounds used by .main; .pass1 is a separate label so
; that the function can be entered with those registers already loaded.
2338cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
2339    vpbroadcastd        m12, [clip_18b_min]
2340    vpbroadcastd        m13, [clip_18b_max]
2341.pass1:
2342    mova                 m0, [cq+32*0]
2343    mova                 m1, [cq+32*1]
2344    mova                 m2, [cq+32*2]
2345    mova                 m3, [cq+32*3]
2346    mova                 m4, [cq+32*4]
2347    mova                 m5, [cq+32*5]
2348    mova                 m6, [cq+32*6]
2349    mova                 m7, [cq+32*7]
2350    vpbroadcastd        m11, [pd_2048]
2351    call .main
2352    call .round_shift1
2353    jmp                tx2q
; Second pass: narrow to words and transpose, run the 8bpc DCT core, scale by
; pw_2048 and add the result to the destination in two 8x4 halves.
2354.pass2:
2355    call .transpose_8x8_packed
2356    call m(idct_8x8_internal_8bpc).main
2357    vpbroadcastd        m12, [pw_2048]
2358    vpermq               m0, m0, q3120
2359    vpermq               m1, m1, q2031
2360    vpermq               m2, m2, q3120
2361    vpermq               m3, m3, q2031
2362    pmulhrsw             m0, m12
2363    pmulhrsw             m1, m12
2364    call .write_8x4_start
2365    pmulhrsw             m0, m2, m12
2366    pmulhrsw             m1, m3, m12
2367    call .write_8x4
2368    RET
; .write_8x4_start loads the constants used by .write_8x4 (m11 = 10bpc pixel
; maximum, r6 = strideq*3, m10 = zero) and falls through into it.
2369ALIGN function_align
2370.write_8x4_start:
2371    vpbroadcastd        m11, [pixel_10bpc_max]
2372    lea                  r6, [strideq*3]
2373    pxor                m10, m10
; Adds the words in m0 (rows 0-1) and m1 (rows 2-3) to four 16-byte
; destination rows, clamps to [m10, m11] and stores.
; Side effects: clears 128 bytes at cq and advances cq by 128; advances dstq
; by four rows; clobbers m8 and m9.
2374.write_8x4:
2375    mova                xm8, [dstq+strideq*0]
2376    vinserti128          m8, [dstq+strideq*1], 1
2377    mova                xm9, [dstq+strideq*2]
2378    vinserti128          m9, [dstq+r6       ], 1
2379    mova          [cq+32*0], m10
2380    mova          [cq+32*1], m10
2381    mova          [cq+32*2], m10
2382    mova          [cq+32*3], m10
2383    add                  cq, 32*4
2384    paddw                m0, m8
2385    paddw                m1, m9
2386    pmaxsw               m0, m10
2387    pmaxsw               m1, m10
2388    pminsw               m0, m11
2389    pminsw               m1, m11
2390    mova         [dstq+strideq*0], xm0
2391    vextracti128 [dstq+strideq*1], m0, 1
2392    mova         [dstq+strideq*2], xm1
2393    vextracti128 [dstq+r6       ], m1, 1
2394    lea                dstq, [dstq+strideq*4]
2395    ret
; Narrows the eight dword registers m0-m7 to four registers of words (signed
; saturation) and transposes them into m0-m3.
; NOTE(review): r6 = deint_shuf+128 is presumably the constant base pointer
; expected by the 8bpc routines called afterwards - confirm in the 8bpc file.
2396ALIGN function_align
2397.transpose_8x8_packed:
2398    packssdw             m0, m4
2399    packssdw             m1, m5
2400    packssdw             m2, m6
2401    packssdw             m3, m7
2402    lea                  r6, [deint_shuf+128]
2403    punpckhwd            m4, m0, m1
2404    punpcklwd            m0, m1
2405    punpckhwd            m1, m2, m3
2406    punpcklwd            m2, m3
2407    punpckhdq            m3, m0, m2
2408    punpckldq            m0, m2
2409    punpckhdq            m2, m4, m1
2410    punpckldq            m4, m1
2411    vinserti128          m1, m3, xm2, 1
2412    vperm2i128           m3, m2, 0x31
2413    vperm2i128           m2, m0, m4, 0x31
2414    vinserti128          m0, xm4, 1
2415    ret
; .main_rect2: alternate entry that first applies (x + m11) >> 12 to all
; eight inputs and then falls through to .main.
; NOTE(review): callers presumably pre-multiply the inputs (rect2 scaling)
; before entering here; that code is outside this excerpt.
2416ALIGN function_align
2417.main_rect2:
2418    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
2419    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
; 8-point DCT core without the last butterfly stage.
; In:  m0-m7 = inputs, m11 = pd_2048, m12/m13 = clamp min/max.
; Out: m0, m6, m5, m3 = dct4 out0..out3 (clamped); m8 = t4, m9 = t7,
;      m4 and m7 = the rotated t5/t6 pair. .round_shift1 combines these.
2420.main:
2421    ITX_MULSUB_2D         5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
2422    ITX_MULSUB_2D         1, 7, 8, 9, 10, 11,  799, 4017 ; t4a t7a
2423    ITX_MULSUB_2D         2, 6, 8, 9, 10, 11, 1567, 3784 ; t2  t3
2424    paddd                m8, m1, m5 ; t4
2425    psubd                m1, m5     ; t5a
2426    paddd                m9, m7, m3 ; t7
2427    psubd                m7, m3     ; t6a
2428    vpbroadcastd         m3, [pd_2896]
2429    REPX    {pmaxsd x, m12}, m1, m8, m7, m9
2430    REPX    {pminsd x, m13}, m1, m8, m7, m9
    ; multiply by 2896, add the rounding bias once per pair, then form the
    ; sum and difference before the shift by 12
2431    REPX    {pmulld x, m3 }, m0, m4, m7, m1
2432    paddd                m0, m11
2433    paddd                m7, m11
2434    psubd                m5, m0, m4
2435    paddd                m0, m4
2436    psubd                m4, m7, m1
2437    paddd                m7, m1
2438    REPX    {psrad  x, 12 }, m5, m0, m4, m7
2439    psubd                m3, m0, m6 ; dct4 out3
2440    paddd                m0, m6     ; dct4 out0
2441    paddd                m6, m5, m2 ; dct4 out1
2442    psubd                m5, m2     ; dct4 out2
2443    REPX    {pmaxsd x, m12}, m0, m6, m5, m3
2444    REPX    {pminsd x, m13}, m0, m6, m5, m3
2445    ret
; Final butterfly stage with a rounding shift: every output is
; (value + 1) >> 1. Results out0..out7 are left in m0..m7.
2446ALIGN function_align
2447.round_shift1:
    ; all-ones is -1 per dword, so subtracting it adds 1 to the four even
    ; terms; each output below includes exactly one of them
2448    pcmpeqd              m1, m1
2449    REPX      {psubd x, m1}, m0, m6, m5, m3
2450    paddd                m1, m6, m7 ; out1
2451    psubd                m6, m7     ; out6
2452    psubd                m7, m0, m9 ; out7
2453    paddd                m0, m9     ; out0
2454    paddd                m2, m5, m4 ; out2
2455    psubd                m5, m4     ; out5
2456    psubd                m4, m3, m8 ; out4
2457    paddd                m3, m8     ; out3
2458    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
2459    ret
2460
; Entry points for 8x8 transforms whose first pass is adst, one per second-pass
; type. INV_TXFM_8X8_FN is defined outside this chunk.
2461INV_TXFM_8X8_FN adst, dct
2462INV_TXFM_8X8_FN adst, adst
2463INV_TXFM_8X8_FN adst, flipadst
2464INV_TXFM_8X8_FN adst, identity
2465
; iadst_8x8_internal_10bpc: 8x8 inverse adst, 10 bpc.
;   .pass1    - first pass on dword coefficients read from cq, then jumps to
;               the second-pass handler held in tx2q.
;   .pass2    - second pass on packed words followed by the add-to-dst write.
;   .main     - loads the coefficients and runs IADST8_1D (.main2 skips the load).
;   .main_end - rounding / sign handling of the pass-1 outputs.
; m12/m13 hold the 18-bit clip bounds for the first pass.
2466cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
2467    vpbroadcastd        m12, [clip_18b_min]
2468    vpbroadcastd        m13, [clip_18b_max]
2469.pass1:
2470    call .main
2471    call .main_end
2472    jmp                tx2q
2473.pass2:
2474    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
2475    pshufd               m4, m0, q1032
2476    pshufd               m5, m1, q1032
2477    call m(iadst_8x8_internal_8bpc).main_pass2
        ; Build the per-lane multiplier for pmulhrsw: the xm12 broadcast fills
        ; only the low 128-bit lane (the upper lane is zeroed), so after the
        ; subtraction the low lane holds pw_4096 - pw_2048 and the high lane
        ; holds -pw_2048, i.e. the two lanes get opposite signs.
2478    vpbroadcastd         m5, [pw_2048]
2479    vpbroadcastd       xm12, [pw_4096]
2480    psubw               m12, m5
2481    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
2482    pmulhrsw             m0, m12
2483    pmulhrsw             m1, m12
2484    call m(idct_8x8_internal_10bpc).write_8x4_start
2485    pmulhrsw             m0, m2, m12
2486    pmulhrsw             m1, m3, m12
2487    call m(idct_8x8_internal_10bpc).write_8x4
2488    RET
2489ALIGN function_align
; .main: load the eight 32-byte coefficient rows into m0-m7 and set the
; rounding constant m11 = pd_2048. .main2 is the entry for callers that have
; already prepared m0-m7 and m11.
; On return m8 = pd_1 (per the comment below; relies on the value IADST8_1D
; leaves in m8) and m9 = pd_3072, both consumed by .main_end.
2490.main:
2491    mova                 m0, [cq+32*0]
2492    mova                 m7, [cq+32*7]
2493    mova                 m1, [cq+32*1]
2494    mova                 m6, [cq+32*6]
2495    mova                 m2, [cq+32*2]
2496    mova                 m5, [cq+32*5]
2497    mova                 m3, [cq+32*3]
2498    mova                 m4, [cq+32*4]
2499    vpbroadcastd        m11, [pd_2048]
2500.main2:
2501    IADST8_1D             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
2502    psrld                m8, 10 ; pd_1
2503    vpbroadcastd         m9, [pd_3072]
2504    ret
2505ALIGN function_align
; .main_end: finish the pass-1 outputs.
;   m0, m6 -> (1 + x) >> 1          m1, m7 -> (1 - x) >> 1
;   m2, m4 -> (3072 + x) >> 12      m3, m5 -> (3071 - x) >> 12
; Expects m8 = pd_1 and m9 = pd_3072 as left by .main/.main2; m8 is
; overwritten with pd_3071.
2506.main_end:
2507    paddd                m0, m8
2508    psubd                m1, m8, m1
2509    paddd                m6, m8
2510    psubd                m7, m8, m7
2511    REPX      {psrad x, 1 }, m0, m1, m6, m7
2512    ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
2513    ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
2514    psubd                m8, m9, m8 ; pd_3071
2515    paddd                m2, m9
2516    psubd                m3, m8, m3
2517    paddd                m4, m9
2518    psubd                m5, m8, m5
2519    REPX      {psrad x, 12}, m2, m3, m4, m5
2520    ret
2521
; Entry points for 8x8 transforms whose first pass is flipadst, one per
; second-pass type. INV_TXFM_8X8_FN is defined outside this chunk.
2522INV_TXFM_8X8_FN flipadst, dct
2523INV_TXFM_8X8_FN flipadst, adst
2524INV_TXFM_8X8_FN flipadst, flipadst
2525INV_TXFM_8X8_FN flipadst, identity
2526
; iflipadst_8x8_internal_10bpc: 8x8 inverse flipadst, 10 bpc.
; Reuses the iadst .main core; only the output ordering / sign handling
; (.main_end here, and the register order in .pass2) differs from the adst
; version.
2527cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
2528    vpbroadcastd        m12, [clip_18b_min]
2529    vpbroadcastd        m13, [clip_18b_max]
2530.pass1:
2531    call m(iadst_8x8_internal_10bpc).main
2532    call .main_end
2533    jmp                tx2q
2534.pass2:
2535    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
2536    pshufd               m4, m0, q1032
2537    pshufd               m5, m1, q1032
2538    call m(iadst_8x8_internal_8bpc).main_pass2
        ; Multiplier with opposite signs in the two 128-bit lanes: xm5 is filled
        ; only in the low lane, so m12 = pw_2048 - pw_4096 in the low lane and
        ; +pw_2048 in the high lane (mirror image of the adst version).
2539    vpbroadcastd        m12, [pw_2048]
2540    vpbroadcastd        xm5, [pw_4096]
2541    psubw               m12, m5
        ; Take the result registers in reverse order (m3, m2, m1, m0).
2542    vpermq               m8, m3, q2031
2543    vpermq               m9, m2, q2031
2544    vpermq               m2, m1, q2031
2545    vpermq               m3, m0, q2031
2546    pmulhrsw             m0, m8, m12
2547    pmulhrsw             m1, m9, m12
2548    call m(idct_8x8_internal_10bpc).write_8x4_start
2549    pmulhrsw             m0, m2, m12
2550    pmulhrsw             m1, m3, m12
2551    call m(idct_8x8_internal_10bpc).write_8x4
2552    RET
2553ALIGN function_align
; .main_end: same rounding as iadst_8x8_internal_10bpc.main_end, but the
; results are written back in reversed register order:
;   m7 = (1 + m0) >> 1       m0 = (1 - m7) >> 1
;   m6 = (1 - m1) >> 1       m1 = (1 + m6) >> 1
;   m5 = (3072 + m2) >> 12   m2 = (3071 - m5) >> 12
;   m3 = (3072 + m4) >> 12   m4 = (3071 - m3) >> 12
; (right-hand sides refer to the register values on entry).
; Expects m8 = pd_1 and m9 = pd_3072 from the iadst .main/.main2 routine;
; m10 is used as a temporary.
2554.main_end:
2555    paddd               m10, m8, m0
2556    psubd                m0, m8, m7
2557    psubd                m7, m8, m1
2558    paddd                m1, m8, m6
2559    psrad                m0, 1
2560    psrad                m1, 1
2561    psrad                m6, m7, 1
2562    psrad                m7, m10, 1
2563    psubd                m8, m9, m8 ; pd_3071
2564    psubd               m10, m8, m5
2565    paddd                m5, m9, m2
2566    psubd                m2, m8, m3
2567    paddd                m3, m9, m4
2568    psrad                m4, m2, 12
2569    psrad                m2, m10, 12
2570    psrad                m3, 12
2571    psrad                m5, 12
2572    ret
2573
; Entry points for 8x8 transforms whose first pass is identity, one per
; second-pass type. INV_TXFM_8X8_FN is defined outside this chunk.
2574INV_TXFM_8X8_FN identity, dct
2575INV_TXFM_8X8_FN identity, adst
2576INV_TXFM_8X8_FN identity, flipadst
2577INV_TXFM_8X8_FN identity, identity
2578
; iidentity_8x8_internal_10bpc: 8x8 identity transform, 10 bpc.
;   .pass1      - loads the eight 32-byte coefficient rows into m0-m7 unchanged
;                 and jumps to the second-pass handler in tx2q.
;   .pass2      - packs to words, transposes, scales with pmulhrsw by pw_4096
;                 and adds the result to dst.
;   .pass2_main - shared tail; expects m3 already packed with m7 and
;                 m7 = pixel maximum (the 12bpc variant enters here).
;   .write_2x8x2_start/_zero/.write_2x8x2 - output helpers, also called from
;                 the 8x16 identity code.
2579cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
2580.pass1:
2581    mova                 m0, [cq+32*0]
2582    mova                 m1, [cq+32*1]
2583    mova                 m2, [cq+32*2]
2584    mova                 m3, [cq+32*3]
2585    mova                 m4, [cq+32*4]
2586    mova                 m5, [cq+32*5]
2587    mova                 m6, [cq+32*6]
2588    mova                 m7, [cq+32*7]
2589    jmp                tx2q
2590.pass2:
2591    packssdw             m3, m7 ; consume m7 first so it can hold the pixel max
2592    vpbroadcastd         m7, [pixel_10bpc_max]
2593.pass2_main:
2594    packssdw             m0, m4
2595    packssdw             m1, m5
2596    packssdw             m2, m6
2597    vpbroadcastd        m12, [pw_4096]
2598    punpckhwd            m4, m0, m1
2599    punpcklwd            m0, m1
2600    punpckhwd            m1, m2, m3
2601    punpcklwd            m2, m3
2602    punpckhdq            m3, m0, m2
2603    punpckldq            m0, m2
2604    punpckldq            m2, m4, m1
2605    punpckhdq            m4, m1
2606    punpckhqdq           m1, m0, m2 ; 1 5
2607    punpcklqdq           m0, m2     ; 0 4
2608    punpcklqdq           m2, m3, m4 ; 2 6
2609    punpckhqdq           m3, m4     ; 3 7
2610    pmulhrsw             m0, m12
2611    pmulhrsw             m1, m12
2612    call .write_2x8x2_start
2613    pmulhrsw             m0, m2, m12
2614    pmulhrsw             m1, m3, m12
2615    call .write_2x8x2_zero
2616    RET
; .write_2x8x2_start: one-time setup (r6 = stride*5, m6 = 0), then falls
; through to clear coefficients and write the first group of rows.
2617.write_2x8x2_start:
2618    lea                  r6, [strideq*5]
2619    pxor                 m6, m6
; .write_2x8x2_zero: store m6 over four 32-byte coefficient rows at cq and
; advance cq by 32*4. Relies on m6 still being zero from .write_2x8x2_start.
2620.write_2x8x2_zero:
2621    mova          [cq+32*0], m6
2622    mova          [cq+32*1], m6
2623    mova          [cq+32*2], m6
2624    mova          [cq+32*3], m6
2625    add                  cq, 32*4
; .write_2x8x2: add m0/m1 to four destination rows and store them back.
;   m0 = rows 0 (low lane) and 4 (high lane), m1 = rows 1 and 5, relative to
;   dstq; r6 must be stride*5. The sums are clamped to [m6, m7] and dstq is
;   advanced by two rows.
2626.write_2x8x2:
2627    mova                xm4, [dstq+strideq*0]
2628    vinserti128          m4, [dstq+strideq*4], 1
2629    mova                xm5, [dstq+strideq*1]
2630    vinserti128          m5, [dstq+r6       ], 1
2631    paddw                m0, m4
2632    paddw                m1, m5
2633    pmaxsw               m0, m6
2634    pmaxsw               m1, m6
2635    pminsw               m0, m7
2636    pminsw               m1, m7
2637    mova         [dstq+strideq*0], xm0
2638    mova         [dstq+strideq*1], xm1
2639    vextracti128 [dstq+strideq*4], m0, 1
2640    vextracti128 [dstq+r6       ], m1, 1
2641    lea                dstq, [dstq+strideq*2]
2642    ret
2643
; TRANSPOSE_8X8_DWORD: transpose an 8x8 matrix of dwords held in eight ymm
; registers.
;   %1-%8  = register numbers of the rows; they receive out0-out7 in order.
;   %9-%12 = register numbers of four temporaries (clobbered).
; Done in three stages: dword interleave, qword interleave (both within each
; 128-bit lane), then vperm2i128 to gather matching lanes of two registers.
; The letter comments track which source elements each register holds.
2644%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
2645    punpckldq            m%9,  m%1,  m%2 ; aibj emfn
2646    punpckhdq            m%1,  m%2       ; ckdl gohp
2647    punpckldq           m%10,  m%3,  m%4 ; qyrz uCvD
2648    punpckhdq            m%3,  m%4       ; sAtB wExF
2649    punpckldq           m%11,  m%5,  m%6 ; GOHP KSLT
2650    punpckhdq            m%5,  m%6       ; IQJR MUNV
2651    punpckldq           m%12,  m%7,  m%8 ; WeXf aibj
2652    punpckhdq            m%7,  m%8       ; YgZh ckdl
2653    punpcklqdq           m%2,  m%9, m%10 ; aiqy emuC
2654    punpckhqdq           m%9, m%10       ; bjrz fnvD
2655    punpcklqdq           m%4,  m%1,  m%3 ; cksA gowE
2656    punpckhqdq          m%10,  m%1,  m%3 ; dltB hpxF
2657    punpcklqdq           m%6, m%11, m%12 ; GOWe KSai
2658    punpckhqdq          m%11, m%12       ; HPXf LTbj
2659    punpcklqdq           m%8,  m%5,  m%7 ; IQYg MUck
2660    punpckhqdq          m%12,  m%5,  m%7 ; JRZh NVdl
2661    vperm2i128           m%1,  m%2,  m%6, 0x20   ; out0
2662    vperm2i128           m%5,  m%2,  m%6, 0x31   ; out4
2663    vperm2i128           m%2,  m%9, m%11, 0x20   ; out1
2664    vperm2i128           m%6,  m%9, m%11, 0x31   ; out5
2665    vperm2i128           m%3,  m%4,  m%8, 0x20   ; out2
2666    vperm2i128           m%7,  m%4,  m%8, 0x31   ; out6
2667    vperm2i128           m%4, m%10, m%12, 0x20   ; out3
2668    vperm2i128           m%8, m%10, m%12, 0x31   ; out7
2669%endmacro
2670
; 12 bpc entry points for 8x8 transforms whose first pass is dct.
; INV_TXFM_8X8_FN is defined outside this chunk.
2671INV_TXFM_8X8_FN dct, dct,      12
2672INV_TXFM_8X8_FN dct, identity, 12
2673INV_TXFM_8X8_FN dct, adst,     12
2674INV_TXFM_8X8_FN dct, flipadst, 12
2675
; idct_8x8_internal_12bpc: 8x8 inverse dct, 12 bpc.
; The first pass is the 10bpc code run with 20-bit clip bounds in m12/m13.
; The second pass stays in dword precision: clip to 18 bits, transpose, run
; the 10bpc .main core, apply a rounded >>4, then pack and write through the
; shared tail in iadst_8x8_internal_12bpc.pass2_end.
2676cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
2677    vpbroadcastd        m12, [clip_20b_min]
2678    vpbroadcastd        m13, [clip_20b_max]
2679    jmp m(idct_8x8_internal_10bpc).pass1
2680.pass2:
2681    vpbroadcastd        m12, [clip_18b_min]
2682    vpbroadcastd        m13, [clip_18b_max]
2683    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
2684    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
2685    call .transpose_8x8
2686    vpbroadcastd        m11, [pd_2048]
2687    call m(idct_8x8_internal_10bpc).main
2688    call .round_shift4
2689    jmp m(iadst_8x8_internal_12bpc).pass2_end
2690ALIGN function_align
; .write_8x4_start: set up the registers used by the write routine for
; 12 bpc output (m11 = pixel maximum, r6 = stride*3, m10 = 0) and return.
; Unlike a fall-through helper, this one only performs the setup; callers
; invoke idct_8x8_internal_10bpc.write_8x4 themselves afterwards.
2691.write_8x4_start:
2692    vpbroadcastd        m11, [pixel_12bpc_max]
2693    lea                  r6, [strideq*3]
2694    pxor                m10, m10
2695    ret
2696ALIGN function_align
; .transpose_8x8: in-place dword transpose of m0-m7; clobbers m8-m11.
2697.transpose_8x8:
2698    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
2699    ret
2700ALIGN function_align
; .round_shift4: final add/sub stage of the 8-point idct with a rounded >>4.
; Takes the register layout left by idct_8x8_internal_10bpc.main and returns
; out0-out7 in m0-m7. The pd_8 bias is added to the four terms that feed both
; a sum and a difference, so every output receives it once.
2701.round_shift4:
2702    vpbroadcastd         m1, [pd_8]
2703    REPX      {paddd x, m1}, m0, m6, m5, m3
2704    paddd                m1, m6, m7 ; out1
2705    psubd                m6, m7     ; out6
2706    psubd                m7, m0, m9 ; out7
2707    paddd                m0, m9     ; out0
2708    paddd                m2, m5, m4 ; out2
2709    psubd                m5, m4     ; out5
2710    psubd                m4, m3, m8 ; out4
2711    paddd                m3, m8     ; out3
2712    REPX       {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
2713    ret
2714
; 12 bpc entry points for 8x8 transforms whose first pass is adst.
; INV_TXFM_8X8_FN is defined outside this chunk.
2715INV_TXFM_8X8_FN adst, dct,      12
2716INV_TXFM_8X8_FN adst, adst,     12
2717INV_TXFM_8X8_FN adst, flipadst, 12
2718INV_TXFM_8X8_FN adst, identity, 12
2719
; iadst_8x8_internal_12bpc: 8x8 inverse adst, 12 bpc.
; The first pass is the 10bpc code with 20-bit clip bounds. The second pass
; (.pass2_main) runs the adst core in dword precision and applies the final
; rounding; .pass2_end packs the dword results to words and writes them, and
; is also the tail used by idct_8x8_internal_12bpc.
2720cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
2721    vpbroadcastd        m12, [clip_20b_min]
2722    vpbroadcastd        m13, [clip_20b_max]
2723    jmp m(iadst_8x8_internal_10bpc).pass1
2724.pass2:
2725    call .pass2_main
; .pass2_end: m0-m7 = eight rows of dword results. Rows are packed pairwise
; to words and written four rows at a time.
2726.pass2_end:
2727    packssdw             m0, m1
2728    packssdw             m1, m2, m3
2729    REPX {vpermq x, x, q3120}, m0, m1
        ; The 12bpc write_8x4_start only sets up constants and returns, so
        ; write_8x4 is called explicitly for the first group as well.
2730    call m(idct_8x8_internal_12bpc).write_8x4_start
2731    call m(idct_8x8_internal_10bpc).write_8x4
2732    packssdw             m0, m4, m5
2733    packssdw             m1, m6, m7
2734    REPX {vpermq x, x, q3120}, m0, m1
2735    call m(idct_8x8_internal_10bpc).write_8x4
2736    RET
2737ALIGN function_align
; .pass2_main: clip the pass-1 results to 18 bits, transpose, and run the
; adst core. .pass2_main2 is the entry for callers that have already prepared
; m0-m7 and m11-m13.
; Output rounding (m8 = pd_1 on return from the 10bpc .main2):
;   m0, m6 -> (8 + x) >> 4            m1, m7 -> (8 - x) >> 4
;   m2, m4 -> (17408 + x) >> 15       m3, m5 -> (17407 - x) >> 15
; The second pair folds the core's pending (x + 1024) >> 11 into the final
; shift: (8 + ((x + 1024) >> 11)) >> 4 = (17408 + x) >> 15, and
; (8 - ((x + 1024) >> 11)) >> 4 = (17407 - x) >> 15.
2738.pass2_main:
2739    vpbroadcastd        m12, [clip_18b_min]
2740    vpbroadcastd        m13, [clip_18b_max]
2741    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
2742    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
2743    call m(idct_8x8_internal_12bpc).transpose_8x8
2744    vpbroadcastd        m11, [pd_2048]
2745.pass2_main2:
2746    call m(iadst_8x8_internal_10bpc).main2
2747    pslld                m9, m8, 3  ; pd_8
2748    paddd                m0, m9
2749    psubd                m1, m9, m1 ; 8-x
2750    paddd                m6, m9
2751    psubd                m7, m9, m7
2752    REPX       {psrad x, 4}, m0, m1, m6, m7
2753    vpbroadcastd         m9, [pd_17408]
2754    psubd                m8, m9, m8 ; 17407
2755    paddd                m2, m9
2756    psubd                m3, m8, m3
2757    paddd                m4, m9
2758    psubd                m5, m8, m5
2759    REPX      {psrad x, 15}, m2, m3, m4, m5
2760    ret
2761
; 12 bpc entry points for 8x8 transforms whose first pass is flipadst.
; INV_TXFM_8X8_FN is defined outside this chunk.
2762INV_TXFM_8X8_FN flipadst, dct,      12
2763INV_TXFM_8X8_FN flipadst, adst,     12
2764INV_TXFM_8X8_FN flipadst, flipadst, 12
2765INV_TXFM_8X8_FN flipadst, identity, 12
2766
; iflipadst_8x8_internal_12bpc: 8x8 inverse flipadst, 12 bpc.
; The first pass is the 10bpc flipadst code with 20-bit clip bounds. The
; second pass reuses iadst_8x8_internal_12bpc.pass2_main and then packs the
; eight result rows in reverse order (7,6 / 5,4 / 3,2 / 1,0) before writing.
2767cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
2768    vpbroadcastd        m12, [clip_20b_min]
2769    vpbroadcastd        m13, [clip_20b_max]
2770    jmp m(iflipadst_8x8_internal_10bpc).pass1
2771.pass2:
2772    call m(iadst_8x8_internal_12bpc).pass2_main
2773    packssdw             m7, m7, m6
2774    packssdw             m6, m1, m0 ; kept in m6 until the second write below
2775    packssdw             m1, m5, m4
2776    vpermq               m0, m7, q3120
2777    vpermq               m1, m1, q3120
        ; The 12bpc write_8x4_start only sets up constants and returns, so
        ; write_8x4 is called explicitly for the first group as well.
2778    call m(idct_8x8_internal_12bpc).write_8x4_start
2779    call m(idct_8x8_internal_10bpc).write_8x4
2780    packssdw             m0, m3, m2
2781    vpermq               m0, m0, q3120
2782    vpermq               m1, m6, q3120
2783    call m(idct_8x8_internal_10bpc).write_8x4
2784    RET
2785
; 12 bpc entry points for 8x8 transforms whose first pass is identity.
; INV_TXFM_8X8_FN is defined outside this chunk.
2786INV_TXFM_8X8_FN identity, dct,      12
2787INV_TXFM_8X8_FN identity, adst,     12
2788INV_TXFM_8X8_FN identity, flipadst, 12
2789INV_TXFM_8X8_FN identity, identity, 12
2790
; iidentity_8x8_internal_12bpc: 8x8 identity transform, 12 bpc.
; Both passes reuse the 10bpc code; the only difference is that m7 is loaded
; with the 12-bit pixel maximum before joining the shared .pass2_main tail.
2791cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
2792    jmp m(iidentity_8x8_internal_10bpc).pass1
2793.pass2:
2794    packssdw             m3, m7 ; consume m7 first so it can hold the pixel max
2795    vpbroadcastd         m7, [pixel_12bpc_max]
2796    jmp m(iidentity_8x8_internal_10bpc).pass2_main
2797
; INV_TXFM_8X16_FN: instantiate an 8x16 inverse-transform entry point.
;   %1 = first-pass type, %2 = second-pass type
;   %3 = eob_offset (default 0), %4 = bitdepth (default 10)
; All arguments are forwarded to INV_TXFM_FN (defined outside this chunk).
; For the dct_dct combination the macro additionally emits a DC-only path:
; the DC coefficient at [cq] is scaled as ((dc * 181 + 128) >> 8) * 181, the
; coefficient is overwritten with eobd (zero on this path, per the comment
; below), and control continues in the 8x8 .dconly2 code.
; NOTE(review): `or r3d, 16` presumably passes the block height to .dconly2
; (r3 is the eob argument register, which is 0 here) - confirm against
; .dconly2, which is outside this chunk.
2798%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
2799    INV_TXFM_FN          %1, %2, %3, 8x16, %4
2800%ifidn %1_%2, dct_dct
2801    imul                r6d, [cq], 181
2802    vpbroadcastd         m2, [dconly_%4bpc]
2803    mov                [cq], eobd ; 0
2804    or                  r3d, 16
2805    add                 r6d, 128
2806    sar                 r6d, 8
2807    imul                r6d, 181
2808    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
2809%endif
2810%endmacro
2811
; 10 bpc entry points for 8x16 transforms whose first pass is dct (the
; bitdepth argument defaults to 10; dct/identity passes an eob_offset of 35).
2812INV_TXFM_8X16_FN dct, dct
2813INV_TXFM_8X16_FN dct, identity, 35
2814INV_TXFM_8X16_FN dct, adst
2815INV_TXFM_8X16_FN dct, flipadst
2816
; idct_8x16_internal_10bpc: 8x16 inverse dct, 10 bpc.
;   .pass1 - first pass. Each .pass1_main call transforms eight 32-byte
;            coefficient rows (every second row starting at cq). With
;            eob >= 43 it is run twice, once at cq+32 and once at cq, giving
;            sixteen result registers m0-m15; otherwise (.fast) it is run once
;            and m8-m15 are zeroed.
;   .pass2 - packs/transposes, runs the 8bpc word-domain 8x16 idct and writes.
;   .end   - shared output tail: scale m0-m7 by m12 with pmulhrsw and write
;            four groups of rows. Also jumped to by the adst/flipadst code.
2817cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
2818%undef cmp
2819    vpbroadcastd        m12, [clip_18b_min]
2820    vpbroadcastd        m13, [clip_18b_max]
2821.pass1:
2822    vpbroadcastd        m14, [pd_2896]
2823    vpbroadcastd        m11, [pd_2048]
2824    cmp                eobd, 43
2825    jl .fast
2826    add                  cq, 32
2827    call .pass1_main
2828    sub                  cq, 32
        ; Park the first set of results in the coefficient rows that were just
        ; consumed (the odd-numbered ones); m7 goes straight to m15. m8-m14
        ; cannot be used yet because m11-m14 still hold constants needed by
        ; the second .pass1_main call.
2829    mova         [cq+32* 1], m0
2830    mova         [cq+32* 3], m1
2831    mova         [cq+32* 5], m2
2832    mova         [cq+32* 7], m3
2833    mova         [cq+32* 9], m4
2834    mova         [cq+32*11], m5
2835    mova         [cq+32*13], m6
2836    mova                m15, m7
2837    call .pass1_main
2838    mova                 m8, [cq+32* 1]
2839    mova                 m9, [cq+32* 3]
2840    mova                m10, [cq+32* 5]
2841    mova                m11, [cq+32* 7]
2842    mova                m12, [cq+32* 9]
2843    mova                m13, [cq+32*11]
2844    mova                m14, [cq+32*13]
2845    jmp                tx2q
2846.fast:
2847    call .pass1_main
2848    pxor                 m8, m8
2849    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
2850    jmp                tx2q
2851.pass2:
2852    call .transpose
2853    call m(idct_8x16_internal_8bpc).main
2854    vpbroadcastd        m12, [pw_2048]
2855    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
2856    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
2857.end:
2858    pmulhrsw             m0, m12
2859    pmulhrsw             m1, m12
2860    call m(idct_8x8_internal_10bpc).write_8x4_start
2861    pmulhrsw             m0, m2, m12
2862    pmulhrsw             m1, m3, m12
2863    call m(idct_8x8_internal_10bpc).write_8x4
2864    pmulhrsw             m0, m4, m12
2865    pmulhrsw             m1, m5, m12
2866    call m(idct_8x8_internal_10bpc).write_8x4
2867    pmulhrsw             m0, m6, m12
2868    pmulhrsw             m1, m7, m12
2869    call m(idct_8x8_internal_10bpc).write_8x4
2870    RET
2871ALIGN function_align
; .transpose: convert the sixteen dword result registers of pass 1 into the
; packed-word layout used by the second pass.
;   In:  m0-m15 = dword results (m0-m7 paired with m8-m15).
;   Out: m0-m7  = saturated words, rearranged by word/dword interleaves and
;                 128-bit lane permutes; m8 is used as a temporary.
;        r6     = deint_shuf+128. It is not used inside this routine;
;                 presumably a base pointer expected by the 8bpc routines
;                 called next - confirm against those routines.
2872.transpose:
2873    packssdw             m0, m8
2874    packssdw             m1, m9
2875    packssdw             m2, m10
2876    packssdw             m3, m11
2877    packssdw             m4, m12
2878    packssdw             m5, m13
2879    packssdw             m6, m14
2880    packssdw             m7, m15
2881    lea                  r6, [deint_shuf+128]
2882    punpckhwd            m8, m0, m1
2883    punpcklwd            m0, m1
2884    punpckhwd            m1, m2, m3
2885    punpcklwd            m2, m3
2886    punpcklwd            m3, m4, m5
2887    punpckhwd            m4, m5
2888    punpckhwd            m5, m6, m7
2889    punpcklwd            m6, m7
2890    punpckhdq            m7, m3, m6
2891    punpckldq            m3, m6
2892    punpckhdq            m6, m4, m5
2893    punpckldq            m4, m5
2894    punpckhdq            m5, m8, m1
2895    punpckldq            m8, m1
2896    punpckhdq            m1, m0, m2
2897    punpckldq            m0, m2
2898    vperm2i128           m2, m0, m3, 0x31
2899    vinserti128          m0, xm3, 1
2900    vperm2i128           m3, m1, m7, 0x31
2901    vinserti128          m1, xm7, 1
2902    vperm2i128           m7, m5, m6, 0x31
2903    vinserti128          m5, xm6, 1
2904    vperm2i128           m6, m8, m4, 0x31
2905    vinserti128          m4, m8, xm4, 1
2906    ret
2907ALIGN function_align
; .pass1_main: one 8-point first-pass idct over every second 32-byte
; coefficient row starting at cq.
;   In:  m14 = multiplier (pd_2896 when reached via .pass1), m11 = rounding
;        constant, m12/m13 = clip bounds.
;   Out: m0-m7 = out0-out7 after the rounded >>1 of .round_shift1.
; The pre-multiply by m14 is completed by the (x + m11) >> 12 at the top of
; .main_rect2. The final jmp is a tail call: .round_shift1's ret returns to
; this routine's caller.
2908.pass1_main:
2909    pmulld               m0, m14, [cq+32* 0]
2910    pmulld               m1, m14, [cq+32* 2]
2911    pmulld               m2, m14, [cq+32* 4]
2912    pmulld               m3, m14, [cq+32* 6]
2913    pmulld               m4, m14, [cq+32* 8]
2914    pmulld               m5, m14, [cq+32*10]
2915    pmulld               m6, m14, [cq+32*12]
2916    pmulld               m7, m14, [cq+32*14]
2917    call m(idct_8x8_internal_10bpc).main_rect2
2918    jmp  m(idct_8x8_internal_10bpc).round_shift1
2919ALIGN function_align
; .main_evenhalf: final add/sub stage of the 8-point idct without any shift.
; Takes the register layout left by idct_8x8_internal_10bpc.main and returns
; idct8 out0-out7 in m0-m7, clipped to [m12, m13].
2920.main_evenhalf:
2921    paddd                m1, m6, m7  ; idct8 out1
2922    psubd                m6, m7      ; idct8 out6
2923    psubd                m7, m0, m9  ; idct8 out7
2924    paddd                m0, m9      ; idct8 out0
2925    paddd                m2, m5, m4  ; idct8 out2
2926    psubd                m5, m4      ; idct8 out5
2927    psubd                m4, m3, m8  ; idct8 out4
2928    paddd                m3, m8      ; idct8 out3
2929    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
2930    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
2931    ret
; .main_oddhalf family: computes the t8..t15 terms (per the comments below) of
; a 16-point idct and stores them to memory around r6.
;   Entry points:
;     .main_oddhalf            - eight inputs in m0-m7.
;     .main_oddhalf_rect2      - same, after applying (x + m11) >> 12 to them.
;     .main_oddhalf_fast       - only m0-m3 are inputs ("lower half zero"), so
;                                each first-stage rotation reduces to two plain
;                                multiplies by broadcast constants.
;     .main_oddhalf_fast_rect2 - fast variant with the (x + m11) >> 12 prescale.
;     .main_oddhalf_fast2      - common continuation after the first stage.
;   In:  m11 = rounding constant for >>12, m12/m13 = clip bounds,
;        m14 = multiplier for the last stage (callers visible in this file
;        load pd_2896), r6 = pointer into a buffer with at least 32*4 bytes
;        writable on either side.
;   Out: [r6-32*4] = t15a, [r6-32*3] = t14, [r6-32*2] = t13a, [r6-32*1] = t12,
;        [r6+32*0] = t11,  [r6+32*1] = t10a, [r6+32*2] = t9,  [r6+32*3] = t8a
;   Clobbers m0-m10 and m15.
2932.main_oddhalf_fast_rect2:
2933    REPX     {paddd x, m11}, m0, m1, m2, m3
2934    REPX     {psrad x, 12 }, m0, m1, m2, m3
2935.main_oddhalf_fast: ; lower half zero
2936    vpbroadcastd         m7, [pd_4076]
2937    vpbroadcastd         m8, [pd_401]
2938    vpbroadcastd         m6, [pd_m1189]
2939    vpbroadcastd         m9, [pd_3920]
2940    vpbroadcastd         m5, [pd_3612]
2941    vpbroadcastd        m10, [pd_1931]
2942    vpbroadcastd         m4, [pd_m2598]
2943    vpbroadcastd        m15, [pd_3166]
2944    pmulld               m7, m0
2945    pmulld               m0, m8
2946    pmulld               m6, m1
2947    pmulld               m1, m9
2948    pmulld               m5, m2
2949    pmulld               m2, m10
2950    pmulld               m4, m3
2951    pmulld               m3, m15
2952    jmp .main_oddhalf_fast2
2953.main_oddhalf_rect2:
2954    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
2955    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
2956.main_oddhalf:
2957    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  401, 4076 ; t8a,  t15a
2958    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
2959    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
2960    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3166, 2598 ; t9a,  t14a
2961.main_oddhalf_fast2:
        ; Both entry paths arrive with unrounded products; round and shift all
        ; eight here (the two register lists name the same set of registers).
2962    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
2963    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
2964    psubd                m8, m0, m4 ; t9
2965    paddd                m0, m4     ; t8
2966    psubd                m4, m6, m2 ; t10
2967    paddd                m2, m6     ; t11
2968    psubd                m6, m1, m5 ; t13
2969    paddd                m5, m1     ; t12
2970    psubd                m1, m7, m3 ; t14
2971    paddd                m7, m3     ; t15
2972    REPX    {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
2973    REPX    {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
2974    vpbroadcastd        m15, [pd_3784]
2975    vpbroadcastd        m10, [pd_1567]
2976    ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
2977    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 2
2978    psubd                m3, m1, m4 ; t10
2979    paddd                m1, m4     ; t9
2980    psubd                m4, m0, m2 ; t11a
2981    paddd                m0, m2     ; t8a
2982    psubd                m2, m8, m6 ; t13
2983    paddd                m6, m8     ; t14
2984    psubd                m8, m7, m5 ; t12a
2985    paddd                m7, m5     ; t15a
2986    REPX    {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
2987    REPX    {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
2988    REPX    {pmulld x, m14}, m2, m8, m3, m4
2989    paddd                m2, m11 ; bias goes into one operand of each pair so
2990    paddd                m8, m11 ; sum and difference both carry it once
2991    paddd                m5, m2, m3 ; t13a
2992    psubd                m2, m3     ; t10a
2993    psubd                m3, m8, m4 ; t11
2994    paddd                m4, m8     ; t12
2995    REPX      {psrad x, 12}, m5, m2, m3, m4
2996    mova          [r6-32*4], m7
2997    mova          [r6-32*3], m6
2998    mova          [r6-32*2], m5
2999    mova          [r6-32*1], m4
3000    mova          [r6+32*0], m3
3001    mova          [r6+32*1], m2
3002    mova          [r6+32*2], m1
3003    mova          [r6+32*3], m0
3004    ret
3005
; 10 bpc entry points for 8x16 transforms whose first pass is adst (the
; bitdepth argument defaults to 10; adst/identity passes an eob_offset of 35).
3006INV_TXFM_8X16_FN adst, dct
3007INV_TXFM_8X16_FN adst, adst
3008INV_TXFM_8X16_FN adst, flipadst
3009INV_TXFM_8X16_FN adst, identity, 35
3010
; iadst_8x16_internal_10bpc: 8x16 inverse adst, 10 bpc.
; The first pass has the same structure as idct_8x16_internal_10bpc.pass1:
; each .pass1_main + iadst_8x8 .main_end pair transforms every second 32-byte
; coefficient row; it runs twice when eob >= 43, otherwise once (.fast) with
; m8-m15 zeroed.
; The second pass uses the 8bpc word-domain adst and shares the output tail
; idct_8x16_internal_10bpc.end.
3011cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
3012%undef cmp
3013    vpbroadcastd        m12, [clip_18b_min]
3014    vpbroadcastd        m13, [clip_18b_max]
3015.pass1:
3016    vpbroadcastd        m14, [pd_2896]
3017    vpbroadcastd        m11, [pd_2048]
3018    cmp                eobd, 43
3019    jl .fast
3020    add                  cq, 32
3021    call .pass1_main
3022    call m(iadst_8x8_internal_10bpc).main_end
3023    sub                  cq, 32
        ; Park the first set of results in the already-consumed odd coefficient
        ; rows; m11-m14 still hold constants needed by the second call.
3024    mova         [cq+32* 1], m0
3025    mova         [cq+32* 3], m1
3026    mova         [cq+32* 5], m2
3027    mova         [cq+32* 7], m3
3028    mova         [cq+32* 9], m4
3029    mova         [cq+32*11], m5
3030    mova         [cq+32*13], m6
3031    mova                m15, m7
3032    call .pass1_main
3033    call m(iadst_8x8_internal_10bpc).main_end
3034    mova                 m8, [cq+32* 1]
3035    mova                 m9, [cq+32* 3]
3036    mova                m10, [cq+32* 5]
3037    mova                m11, [cq+32* 7]
3038    mova                m12, [cq+32* 9]
3039    mova                m13, [cq+32*11]
3040    mova                m14, [cq+32*13]
3041    jmp                tx2q
3042.fast:
3043    call .pass1_main
3044    call m(iadst_8x8_internal_10bpc).main_end
3045    pxor                 m8, m8
3046    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
3047    jmp                tx2q
3048.pass2:
3049    call m(idct_8x16_internal_10bpc).transpose
3050    call m(iadst_8x16_internal_8bpc).main
3051    call m(iadst_8x16_internal_8bpc).main_pass2_end
        ; Multiplier with opposite signs in the two 128-bit lanes: the xm12
        ; broadcast fills only the low lane, so after the psubw below the low
        ; lane holds pw_4096 - pw_2048 and the high lane holds -pw_2048.
3052    vpbroadcastd         m8, [pw_2048]
3053    vpbroadcastd       xm12, [pw_4096]
3054    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
3055    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
3056    psubw               m12, m8
3057    jmp m(idct_8x16_internal_10bpc).end
3058ALIGN function_align
; .pass1_main: load every second 32-byte coefficient row starting at cq in the
; interleaved order used by the adst core, apply the rounded multiply
; (x * m14 + m11) >> 12, and tail-jump into iadst_8x8_internal_10bpc.main2.
3059.pass1_main:
3060    pmulld               m0, m14, [cq+32* 0]
3061    pmulld               m7, m14, [cq+32*14]
3062    pmulld               m1, m14, [cq+32* 2]
3063    pmulld               m6, m14, [cq+32*12]
3064    pmulld               m2, m14, [cq+32* 4]
3065    pmulld               m5, m14, [cq+32*10]
3066    pmulld               m3, m14, [cq+32* 6]
3067    pmulld               m4, m14, [cq+32* 8]
3068    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
3069    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
3070    jmp m(iadst_8x8_internal_10bpc).main2
3071
; 10 bpc entry points for 8x16 transforms whose first pass is flipadst (the
; bitdepth argument defaults to 10; flipadst/identity passes an eob_offset
; of 35).
3072INV_TXFM_8X16_FN flipadst, dct
3073INV_TXFM_8X16_FN flipadst, adst
3074INV_TXFM_8X16_FN flipadst, flipadst
3075INV_TXFM_8X16_FN flipadst, identity, 35
3076
; iflipadst_8x16_internal_10bpc: 8x16 inverse flipadst, 10 bpc.
; Identical in structure to iadst_8x16_internal_10bpc; the first pass swaps in
; the flipadst .main_end, and the second pass reverses the register order and
; mirrors the sign pattern of the output multiplier.
3077cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
3078%undef cmp
3079    vpbroadcastd        m12, [clip_18b_min]
3080    vpbroadcastd        m13, [clip_18b_max]
3081.pass1:
3082    vpbroadcastd        m14, [pd_2896]
3083    vpbroadcastd        m11, [pd_2048]
3084    cmp                eobd, 43
3085    jl .fast
3086    add                  cq, 32
3087    call m(iadst_8x16_internal_10bpc).pass1_main
3088    call m(iflipadst_8x8_internal_10bpc).main_end
3089    sub                  cq, 32
        ; Park the first set of results in the already-consumed odd coefficient
        ; rows; m11-m14 still hold constants needed by the second call.
3090    mova         [cq+32* 1], m0
3091    mova         [cq+32* 3], m1
3092    mova         [cq+32* 5], m2
3093    mova         [cq+32* 7], m3
3094    mova         [cq+32* 9], m4
3095    mova         [cq+32*11], m5
3096    mova         [cq+32*13], m6
3097    mova                m15, m7
3098    call m(iadst_8x16_internal_10bpc).pass1_main
3099    call m(iflipadst_8x8_internal_10bpc).main_end
3100    mova                 m8, [cq+32* 1]
3101    mova                 m9, [cq+32* 3]
3102    mova                m10, [cq+32* 5]
3103    mova                m11, [cq+32* 7]
3104    mova                m12, [cq+32* 9]
3105    mova                m13, [cq+32*11]
3106    mova                m14, [cq+32*13]
3107    jmp                tx2q
3108.fast:
3109    call m(iadst_8x16_internal_10bpc).pass1_main
3110    call m(iflipadst_8x8_internal_10bpc).main_end
3111    pxor                 m8, m8
3112    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
3113    jmp                tx2q
3114.pass2:
3115    call m(idct_8x16_internal_10bpc).transpose
3116    call m(iadst_8x16_internal_8bpc).main
3117    call m(iadst_8x16_internal_8bpc).main_pass2_end
        ; m12 becomes pw_2048 - pw_4096 in the low 128-bit lane and +pw_2048 in
        ; the high lane (xm13 is filled only in the low lane); the subtraction
        ; itself is done after the shuffles below.
3118    vpbroadcastd        m12, [pw_2048]
3119    vpbroadcastd       xm13, [pw_4096]
        ; Reverse the register order: new m0..m7 come from old m7..m0. The old
        ; m0-m3 are saved in m11-m8 because they are overwritten before use.
3120    mova                m11, m0
3121    vpermq               m0, m7, q2031
3122    mova                m10, m1
3123    vpermq               m1, m6, q2031
3124    mova                 m9, m2
3125    vpermq               m2, m5, q2031
3126    mova                 m8, m3
3127    vpermq               m3, m4, q2031
3128    vpermq               m4, m8, q3120
3129    vpermq               m5, m9, q3120
3130    vpermq               m6, m10, q3120
3131    vpermq               m7, m11, q3120
3132    psubw               m12, m13
3133    jmp m(idct_8x16_internal_10bpc).end
3134
; 10 bpc entry points for 8x16 transforms whose first pass is identity, one
; per second-pass type (eob_offset and bitdepth take the macro defaults 0
; and 10).
3135INV_TXFM_8X16_FN identity, dct
3136INV_TXFM_8X16_FN identity, adst
3137INV_TXFM_8X16_FN identity, flipadst
3138INV_TXFM_8X16_FN identity, identity
3139
; IDTX16: scale the packed words in m%1 for the 16-point identity transform.
;   %1 = src/dst register number
;   %2 = temporary register number (clobbered)
;   %3 = register number holding the pw_1697x16 multiplier
;   %4 = optional; selects the variant that downshifts by 1:
;          numeric  -> register number of a pmulhrsw multiplier applied to the
;                      temporary (rounded halving, per the pw_16384 name)
;          otherwise -> the temporary is halved with psraw (no rounding)
; With three arguments: m%1 = 2*m%1 + pmulhrsw(m%1, m%3), saturating.
; With four arguments:  m%1 =   m%1 + halved pmulhrsw(m%1, m%3), saturating.
3140%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
3141    pmulhrsw            m%2, m%3, m%1
3142%if %0 == 4 ; if downshifting by 1
3143%ifnum %4
3144    pmulhrsw            m%2, m%4
3145%else ; without rounding
3146    psraw               m%2, 1
3147%endif
3148%else
3149    paddsw              m%1, m%1
3150%endif
3151    paddsw              m%1, m%2
3152%endmacro
3153
; iidentity_8x16_internal_10bpc: 8x16 identity transform, 10 bpc.
;   .pass1     - loads all sixteen 32-byte coefficient rows and applies the
;                rounded multiply (x * 2896 + 2048) >> 12. Even rows end up in
;                m0-m7 and odd rows in m8-m15.
;   .pass2     - packs to words, applies IDTX16, then calls .pass2_end.
;   .pass2_end - transposes and writes the 16 output rows. Expects
;                m7 = pixel maximum and m12 = pmulhrsw output multiplier;
;                the last input row pair is taken from m13 (not m7).
3154cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
3155.pass1:
3156    vpbroadcastd        m15, [pd_2896]
3157    pmulld               m0, m15, [cq+32* 0]
3158    pmulld               m8, m15, [cq+32* 1]
3159    pmulld               m1, m15, [cq+32* 2]
3160    pmulld               m9, m15, [cq+32* 3]
3161    pmulld               m2, m15, [cq+32* 4]
3162    pmulld              m10, m15, [cq+32* 5]
3163    pmulld               m3, m15, [cq+32* 6]
3164    pmulld              m11, m15, [cq+32* 7]
3165    pmulld               m4, m15, [cq+32* 8]
3166    pmulld              m12, m15, [cq+32* 9]
3167    pmulld               m5, m15, [cq+32*10]
3168    pmulld              m13, m15, [cq+32*11]
3169    pmulld               m6, m15, [cq+32*12]
3170    pmulld              m14, m15, [cq+32*13]
3171    pmulld               m7, m15, [cq+32*14]
3172    pmulld              m15,      [cq+32*15]
        ; All sixteen registers are in use, so m7 is spilled to [cq] to make
        ; room for the rounding constant; the final paddd from memory folds
        ; the spilled value back in together with the bias.
3173    mova               [cq], m7
3174    vpbroadcastd         m7, [pd_2048]
3175    REPX     {paddd  x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
3176                             m8,  m9,  m10, m11, m12, m13, m14, m15
3177    paddd                m7, [cq]
3178    REPX     {psrad  x, 12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3179                             m8,  m9,  m10, m11, m12, m13, m14, m15
3180    jmp                tx2q
3181.pass2:
3182    packssdw             m0, m8
3183    packssdw             m1, m9
3184    packssdw             m2, m10
3185    packssdw             m3, m11
3186    packssdw             m4, m12
3187    packssdw             m5, m13
3188    packssdw             m6, m14
3189    packssdw            m13, m7, m15 ; result goes to m13 so m7 is free for the pixel max
3190    vpbroadcastd         m8, [pw_1697x16]
3191    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
3192    vpbroadcastd         m7, [pixel_10bpc_max]
3193    vpbroadcastd        m12, [pw_2048]
3194    call .pass2_end
3195    RET
3196ALIGN function_align
; .pass2_end: transpose the packed words in m0-m6 and m13, scale each row pair
; by m12 with pmulhrsw, and add to dst through the 8x8 identity write helpers.
; Each helper call writes rows 0, 1, 4 and 5 relative to dstq and advances
; dstq by two rows, so two calls cover eight rows; the lea in between skips
; the four rows already written to reach the second group of eight.
; The helpers also clear the coefficient buffer, 32*4 bytes per call.
3197.pass2_end:
3198    punpckhwd            m9, m0, m1
3199    punpcklwd            m0, m1
3200    punpckhwd            m1, m6, m13
3201    punpcklwd            m6, m13
3202    punpckhwd           m13, m4, m5
3203    punpcklwd            m4, m5
3204    punpcklwd            m5, m2, m3
3205    punpckhwd            m2, m3
3206    punpckhdq            m3, m0, m5
3207    punpckldq            m0, m5
3208    punpckhdq           m11, m9, m2
3209    punpckldq            m9, m2
3210    punpckldq            m2, m4, m6
3211    punpckhdq            m4, m6
3212    punpckldq            m6, m13, m1
3213    punpckhdq           m13, m1
3214    punpckhqdq           m1, m0, m2
3215    punpcklqdq           m0, m2
3216    punpcklqdq           m2, m3, m4
3217    punpckhqdq           m3, m4
3218    punpcklqdq           m8, m9, m6
3219    punpckhqdq           m9, m6
3220    punpcklqdq          m10, m11, m13
3221    punpckhqdq          m11, m13
3222    pmulhrsw             m0, m12
3223    pmulhrsw             m1, m12
3224    call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
3225    pmulhrsw             m0, m12, m2
3226    pmulhrsw             m1, m12, m3
3227    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
3228    pmulhrsw             m0, m12, m8
3229    pmulhrsw             m1, m12, m9
3230    lea                dstq, [dstq+strideq*4]
3231    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
3232    pmulhrsw             m0, m12, m10
3233    pmulhrsw             m1, m12, m11
3234    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
3235    ret
3236
; 12 bpc 8x16 instantiations with dct as the first transform type.
; NOTE(review): INV_TXFM_8X16_FN is defined outside this chunk; presumably it
; emits the public entry points. The meaning of its third argument (0 / 35)
; is not visible here - confirm against the macro definition.
3237INV_TXFM_8X16_FN dct, dct,       0, 12
3238INV_TXFM_8X16_FN dct, identity, 35, 12
3239INV_TXFM_8X16_FN dct, adst,      0, 12
3240INV_TXFM_8X16_FN dct, flipadst,  0, 12
3241
; idct_8x16_internal_12bpc: 12 bpc variant of the 8x16 inverse DCT.
;   entry      - first pass: loads clip_20b_min/max into m12/m13 and reuses
;                the 10 bpc first-pass code.
;   .pass2     - second pass: transposes the 16 registers, clamps the inputs
;                with clip_18b_min/max, runs the odd half and then the even
;                half (8-point main + evenhalf), rounds with pd_8 and shifts
;                right by 4.
;   .end       - shared write-out tail; the 12 bpc adst/flipadst 8x16
;                functions jump here from their .pass2_end.
;   .transpose - helper that transposes m0-m15 in two groups of eight.
; Uses [cq+32*0..14] as scratch storage during the second pass.
3242cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
3243    vpbroadcastd        m12, [clip_20b_min]
3244    vpbroadcastd        m13, [clip_20b_max]
3245    jmp m(idct_8x16_internal_10bpc).pass1
3246.pass2:
    ; r6 points 32*4 bytes into the stack area; pass1_rotations addresses its
    ; saved rows as [r6-32*4] .. [r6+32*3].
3247    lea                  r6, [rsp+32*4]
    ; After .transpose: [cq+32*0..7] hold inputs 0-7, m0-m7 hold inputs 8-15.
3248    call .transpose
3249    vpbroadcastd        m12, [clip_18b_min]
3250    vpbroadcastd        m13, [clip_18b_max]
    ; Park the even-numbered inputs 8/10/12/14 in memory for the even half.
3251    mova         [cq+32* 8], m0
3252    mova         [cq+32*10], m2
3253    mova         [cq+32*12], m4
3254    mova         [cq+32*14], m6
    ; Gather the odd inputs 1,3,5,7,9,11,13,15 into m0-m7 while applying the
    ; lower clip bound. The order matters: m1, m5 and m3 are read as sources
    ; before they are overwritten as destinations.
3255    pmaxsd               m0, m12, [cq+32* 1]
3256    pmaxsd               m4, m12, m1
3257    pmaxsd               m1, m12, [cq+32* 3]
3258    pmaxsd               m2, m12, [cq+32* 5]
3259    pmaxsd               m6, m12, m5
3260    pmaxsd               m5, m12, m3
3261    pmaxsd               m3, m12, [cq+32* 7]
3262    pmaxsd               m7, m12
    ; Upper clip bound.
3263    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
3264    vpbroadcastd        m11, [pd_2048]
3265    vpbroadcastd        m14, [pd_2896]
3266    call m(idct_8x16_internal_10bpc).main_oddhalf
    ; Even inputs 0,2,...,14, clamped the same way.
3267    pmaxsd               m0, m12, [cq+32* 0]
3268    pmaxsd               m1, m12, [cq+32* 2]
3269    pmaxsd               m2, m12, [cq+32* 4]
3270    pmaxsd               m3, m12, [cq+32* 6]
3271    pmaxsd               m4, m12, [cq+32* 8]
3272    pmaxsd               m5, m12, [cq+32*10]
3273    pmaxsd               m6, m12, [cq+32*12]
3274    pmaxsd               m7, m12, [cq+32*14]
3275    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
3276    call m(idct_8x8_internal_10bpc).main
3277    call m(idct_8x16_internal_10bpc).main_evenhalf
    ; The pd_8 rounding term is added to m0-m7 only; pass1_rotations forms
    ; out[k] = m[k] + saved and out[15-k] = m[k] - saved, so every output
    ; receives it before the shift by 4.
3278    vpbroadcastd        m11, [pd_8]
3279    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
3280    call m(idct_16x8_internal_10bpc).pass1_rotations
3281    REPX       {psrad x, 4}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3282                             m8,  m9,  m10, m11, m12, m13, m14, m15
3283.end:
    ; Narrow the 16 dword registers to 8 word registers (signed saturation).
3284    packssdw             m0, m1
3285    packssdw             m1, m2, m3
3286    packssdw             m2, m4, m5
3287    packssdw             m3, m6, m7
3288    packssdw             m4, m8, m9
3289    packssdw             m5, m10, m11
3290    packssdw             m6, m12, m13
3291    packssdw             m7, m14, m15
    ; vpermq q3120 swaps the two middle qwords, undoing the per-lane
    ; interleave produced by packssdw. The write helpers take their data in
    ; m0/m1, so each register pair is moved there before the call.
    ; NOTE(review): write_8x4_start / write_8x4 are defined outside this
    ; chunk; presumably each call adds to and stores four destination rows.
3292    vpermq               m0, m0, q3120
3293    vpermq               m1, m1, q3120
3294    call m(idct_8x8_internal_12bpc).write_8x4_start
3295    call m(idct_8x8_internal_10bpc).write_8x4
3296    vpermq               m0, m2, q3120
3297    vpermq               m1, m3, q3120
3298    call m(idct_8x8_internal_10bpc).write_8x4
3299    vpermq               m0, m4, q3120
3300    vpermq               m1, m5, q3120
3301    call m(idct_8x8_internal_10bpc).write_8x4
3302    vpermq               m0, m6, q3120
3303    vpermq               m1, m7, q3120
3304    call m(idct_8x8_internal_10bpc).write_8x4
3305    RET
; .transpose: two-step transpose of m0-m15 built on the 8x8 helper.
;   in:  m0-m15
;   out: [cq+32*0..7] = transposed first group (from m0-m7),
;        m0-m7        = transposed second group (from m8-m15)
; [cq+32*8..11] are used as temporary spill slots for m8-m11.
; NOTE(review): assumes transpose_8x8 (outside this chunk) works in place on
; m0-m7 and leaves m12-m15 untouched - confirm.
3306ALIGN function_align
3307.transpose:
3308    mova         [cq+32* 8], m8
3309    mova         [cq+32* 9], m9
3310    mova         [cq+32*10], m10
3311    mova         [cq+32*11], m11
3312    call m(idct_8x8_internal_12bpc).transpose_8x8
3313    mova         [cq+32* 0], m0
3314    mova         [cq+32* 1], m1
3315    mova         [cq+32* 2], m2
3316    mova         [cq+32* 3], m3
3317    mova         [cq+32* 4], m4
3318    mova         [cq+32* 5], m5
3319    mova         [cq+32* 6], m6
3320    mova         [cq+32* 7], m7
    ; Move the second group (spilled m8-m11 plus live m12-m15) into m0-m7.
3321    mova                 m0, [cq+32* 8]
3322    mova                 m1, [cq+32* 9]
3323    mova                 m2, [cq+32*10]
3324    mova                 m3, [cq+32*11]
3325    mova                 m4, m12
3326    mova                 m5, m13
3327    mova                 m6, m14
3328    mova                 m7, m15
    ; Tail call: the ret inside transpose_8x8 returns to our caller.
3329    jmp m(idct_8x8_internal_12bpc).transpose_8x8
3330
; 12 bpc 8x16 instantiations with adst as the first transform type.
; NOTE(review): INV_TXFM_8X16_FN is defined outside this chunk; the meaning
; of its third argument (0 / 35) is not visible here.
3331INV_TXFM_8X16_FN adst, dct,       0, 12
3332INV_TXFM_8X16_FN adst, adst,      0, 12
3333INV_TXFM_8X16_FN adst, flipadst,  0, 12
3334INV_TXFM_8X16_FN adst, identity, 35, 12
3335
; iadst_8x16_internal_12bpc: 12 bpc variant of the 8x16 inverse ADST.
;   entry       - first pass: loads clip_20b_min/max into m12/m13 and reuses
;                 the 10 bpc first-pass code.
;   .pass2      - second pass: .pass2_main, then the 10 bpc 16x8 ADST
;                 rotations, then the final shifts and the shared idct
;                 write-out tail.
;   .pass2_end  - final shifts; also the jump target of the flipadst variant.
;   .pass2_main - transpose + clamp + ADST core; also called by the flipadst
;                 variant.
3336cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
3337    vpbroadcastd        m12, [clip_20b_min]
3338    vpbroadcastd        m13, [clip_20b_max]
3339    jmp m(iadst_8x16_internal_10bpc).pass1
3340.pass2:
3341    lea                  r6, [rsp+32*4]
3342    call .pass2_main
3343    call m(iadst_16x8_internal_10bpc).pass1_rotations
3344.pass2_end:
    ; Two output groups use different shift amounts (4 vs 15).
    ; NOTE(review): presumably m4-m11 still carry an extra fixed-point scale
    ; from the rotation code, which lives outside this chunk - confirm.
3345    REPX      {psrad x, 4 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
3346    REPX      {psrad x, 15}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
3347    jmp m(idct_8x16_internal_12bpc).end
; .pass2_main: shared second-pass core for adst and flipadst.
;   in:  m0-m15 = first-pass output
;   out: ADST intermediate state for the pass1_rotations call that follows
;        in the caller, plus the constants m14 = pd_17408, m13 = pd_17407 and
;        m15 = pd_8 (presumably rounding terms used by those rotations -
;        confirm).
; Uses [cq+32*0..15] as scratch. The numbers in the trailing comments are the
; input indices being loaded.
3348ALIGN function_align
3349.pass2_main:
    ; After the transpose: [cq+32*0..7] = inputs 0-7, m0-m7 = inputs 8-15.
3350    call m(idct_8x16_internal_12bpc).transpose
3351    vpbroadcastd        m13, [clip_18b_min]
3352    vpbroadcastd        m14, [clip_18b_max]
    ; Park inputs 8, 11, 12 and 15 for the second half below.
3353    mova         [cq+32* 8], m0
3354    mova         [cq+32*11], m3
3355    mova         [cq+32*12], m4
3356    mova         [cq+32*15], m7
    ; Lower clip bound. Order matters: m1, m5 and m2 are consumed as sources
    ; before being reused as destinations.
3357    pmaxsd               m0, m13, [cq+32* 2] ;  2
3358    pmaxsd               m3, m13, m1         ;  9
3359    pmaxsd               m1, m13, m5         ; 13
3360    pmaxsd               m4, m13, m2         ; 10
3361    pmaxsd               m2, m13, [cq+32* 6] ;  6
3362    pmaxsd               m5, m13, [cq+32* 5] ;  5
3363    pmaxsd               m6, m13, m6         ; 14
3364    pmaxsd               m7, m13, [cq+32* 1] ;  1
    ; Upper clip bound.
3365    REPX    {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
3366    vpbroadcastd        m12, [pd_2048]
3367    vpbroadcastd        m15, [pd_2896]
3368    call m(iadst_16x8_internal_10bpc).main_part1
    ; Remaining eight inputs, clamped the same way.
3369    pmaxsd               m0, m13, [cq+32* 0] ;  0
3370    pmaxsd               m1, m13, [cq+32*15] ; 15
3371    pmaxsd               m2, m13, [cq+32* 4] ;  4
3372    pmaxsd               m3, m13, [cq+32*11] ; 11
3373    pmaxsd               m4, m13, [cq+32* 8] ;  8
3374    pmaxsd               m5, m13, [cq+32* 7] ;  7
3375    pmaxsd               m6, m13, [cq+32*12] ; 12
3376    pmaxsd               m7, m13, [cq+32* 3] ;  3
3377    REPX    {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
3378    call m(iadst_16x8_internal_10bpc).main_part2
    ; Derive the small constants from m15 instead of loading them
    ; (the existing comments assume m15 still holds pd_2896 here).
3379    vpbroadcastd        m14, [pd_17408]
3380    psrld               m15, 11              ; pd_1
3381    psubd               m13, m14, m15        ; pd_17407
3382    pslld               m15, 3               ; pd_8
3383    ret
3384
; 12 bpc 8x16 instantiations with flipadst as the first transform type.
; NOTE(review): INV_TXFM_8X16_FN is defined outside this chunk; the meaning
; of its third argument (0 / 35) is not visible here.
3385INV_TXFM_8X16_FN flipadst, dct,       0, 12
3386INV_TXFM_8X16_FN flipadst, adst,      0, 12
3387INV_TXFM_8X16_FN flipadst, flipadst,  0, 12
3388INV_TXFM_8X16_FN flipadst, identity, 35, 12
3389
; iflipadst_8x16_internal_12bpc: 12 bpc variant of the 8x16 inverse flipped
; ADST.
;   entry  - first pass: loads clip_20b_min/max into m12/m13 and reuses the
;            10 bpc flipadst first-pass code.
;   .pass2 - second pass: shares .pass2_main and .pass2_end with the 12 bpc
;            adst function; the only difference is that the flipadst
;            rotation routine is called in between.
3390cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
3391    vpbroadcastd        m12, [clip_20b_min]
3392    vpbroadcastd        m13, [clip_20b_max]
3393    jmp m(iflipadst_8x16_internal_10bpc).pass1
3394.pass2:
    ; r6 = rsp+32*4, same setup as the adst .pass2.
3395    lea                  r6, [rsp+32*4]
3396    call m(iadst_8x16_internal_12bpc).pass2_main
3397    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
    ; Final shifts and write-out are shared with the adst variant.
3398    jmp m(iadst_8x16_internal_12bpc).pass2_end
3399
; 12 bpc 8x16 instantiations with identity as the first transform type.
; NOTE(review): INV_TXFM_8X16_FN is defined outside this chunk.
3400INV_TXFM_8X16_FN identity, dct,      0, 12
3401INV_TXFM_8X16_FN identity, adst,     0, 12
3402INV_TXFM_8X16_FN identity, flipadst, 0, 12
3403INV_TXFM_8X16_FN identity, identity, 0, 12
3404
; iidentity_8x16_internal_12bpc: 12 bpc variant of the 8x16 identity
; transform.
;   entry       - first pass: jumps straight into the 10 bpc first-pass code.
;   .pass2      - second pass: scales all 16 registers in .pass2_main, packs
;                 them to words and hands over to the 10 bpc .pass2_end with
;                 m7 = pixel_12bpc_max and m12 = pw_16384.
;   .pass2_main - clamp + scale helper, see below.
3405cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
3406    jmp m(iidentity_8x16_internal_10bpc).pass1
3407.pass2:
3408    call .pass2_main
    ; Pack m[k] with m[k+8] (dwords -> words, signed saturation). m12 and m13
    ; are consumed as sources here before being repurposed below, and the
    ; last pair lands in m13 because m7 is needed for the pixel maximum.
3409    packssdw             m0, m8
3410    packssdw             m1, m9
3411    packssdw             m2, m10
3412    packssdw             m3, m11
3413    packssdw             m4, m12
3414    packssdw             m5, m13
3415    packssdw             m6, m14
3416    packssdw            m13, m7, m15
3417    vpbroadcastd         m7, [pixel_12bpc_max]
3418    vpbroadcastd        m12, [pw_16384]
3419    call m(iidentity_8x16_internal_10bpc).pass2_end
3420    RET
; .pass2_main: for each of m0-m15 computes
;     x = clamp(x, clip_18b_min, clip_18b_max)
;     x = (x * pd_5793 + pd_1024) >> 14      (arithmetic shift)
; All 16 vector registers hold data, so each broadcast constant temporarily
; displaces one data register (m7 and m15 alternately); the displaced value
; is kept in [cq] and processed with a memory operand.
3421ALIGN function_align
3422.pass2_main:
    ; Lower clip bound; m7 is spilled to make room for the constant.
3423    mova               [cq], m7
3424    vpbroadcastd         m7, [clip_18b_min]
3425    REPX     {pmaxsd x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
3426                             m8,  m9,  m10, m11, m12, m13, m14, m15
3427    pmaxsd               m7, [cq]
    ; Upper clip bound; now m15 is the spilled register.
3428    mova               [cq], m15
3429    vpbroadcastd        m15, [clip_18b_max]
3430    REPX    {pminsd x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6, m7, \
3431                             m8,  m9,  m10, m11, m12, m13, m14
3432    pminsd              m15, [cq]
    ; Multiply by pd_5793.
3433    mova               [cq], m7
3434    vpbroadcastd         m7, [pd_5793]
3435    REPX     {pmulld x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
3436                             m8,  m9,  m10, m11, m12, m13, m14, m15
3437    pmulld               m7, [cq]
    ; Add pd_1024 and shift right by 14.
3438    mova               [cq], m15
3439    vpbroadcastd        m15, [pd_1024]
3440    REPX    {paddd  x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6, m7, \
3441                             m8,  m9,  m10, m11, m12, m13, m14
3442    paddd               m15, [cq]
3443    REPX     {psrad  x, 14}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3444                             m8,  m9,  m10, m11, m12, m13, m14, m15
3445    ret
3446
; INV_TXFM_16X4_FN type1, type2 [, bitdepth = 10]
; Instantiates a 16x4 transform through INV_TXFM_FN (defined outside this
; chunk). For the dct_dct combination it additionally emits the DC-only code
; that follows the INV_TXFM_FN expansion:
;   - 10 bpc: the full DC-only path with the labels .dconly, .dconly2,
;     .dconly3 and .dconly_loop. The 16x8 macro jumps to .dconly2 of the
;     10 bpc instance.
;   - other bitdepths: loads the matching dconly constant into m3 and jumps
;     to the 10 bpc .dconly label.
; DC-only arithmetic as written below:
;     r6d = dc * 181
;     r6d = (r6d + 384) >> 9                 (.dconly2)
;     r6d = (r6d * 181 + 2176) >> 12         (.dconly3)
; The result plus the m3 constant is broadcast as words and applied to two
; destination rows per loop iteration; r3d is the remaining row count.
; NOTE(review): the existing "; 0" comment indicates eobd is 0 on this path,
; and r3d presumably aliases eobd so that "or r3d, 4" yields 4 rows -
; confirm against INV_TXFM_FN.
3447%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
3448    INV_TXFM_FN          %1, %2, 0, 16x4, %3
3449%ifidn %1_%2, dct_dct
3450    vpbroadcastd         m3, [dconly_%3bpc]
3451%if %3 = 10
3452.dconly:
3453    imul                r6d, [cq], 181
    ; Clear the DC coefficient by storing eobd over it.
3454    mov                [cq], eobd ; 0
3455    or                  r3d, 4
3456.dconly2:
3457    add                 r6d, 384
3458    sar                 r6d, 9
3459.dconly3:
3460    imul                r6d, 181
3461    add                 r6d, 2176
3462    sar                 r6d, 12
3463    movd                xm0, r6d
    ; Fold the m3 constant into the DC value with signed saturation, then
    ; replicate the word across the whole register.
3464    paddsw              xm0, xm3
3465    vpbroadcastw         m0, xm0
3466.dconly_loop:
    ; Per row: signed-saturating add of the biased DC, then unsigned-
    ; saturating subtract of the same m3 constant.
    ; NOTE(review): presumably the dconly constant is chosen so that this
    ; pair clamps pixels to the valid range - its value is not visible here.
3467    paddsw               m1, m0, [dstq+strideq*0]
3468    paddsw               m2, m0, [dstq+strideq*1]
3469    psubusw              m1, m3
3470    psubusw              m2, m3
3471    mova   [dstq+strideq*0], m1
3472    mova   [dstq+strideq*1], m2
3473    lea                dstq, [dstq+strideq*2]
3474    sub                 r3d, 2
3475    jg .dconly_loop
3476    RET
3477%else
3478    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
3479%endif
3480%endif
3481%endmacro
3482
; 10 bpc 16x4 instantiations with dct as the first transform type
; (bitdepth defaults to 10 in the macro).
3483INV_TXFM_16X4_FN dct, dct
3484INV_TXFM_16X4_FN dct, identity
3485INV_TXFM_16X4_FN dct, adst
3486INV_TXFM_16X4_FN dct, flipadst
3487
; idct_16x4_internal_10bpc: 16x4 inverse DCT, 10 bpc.
;   entry        - loads clip_18b_min/max into m8/m9 and falls into .pass1.
;   .pass1       - first pass on 32-bit coefficients read from cq; also
;                  entered by the 12 bpc variant with its own clip limits in
;                  m8/m9. Ends with "jmp tx2q".
;   .pass2       - second pass: pack/transpose to 16-bit and run the 8 bpc
;                  16x4 DCT core.
;   .end         - scale by pw_2048 and load pixel_10bpc_max into m5.
;   .end2        - add destination rows 0/1 (entered by 12 bpc variants with
;                  m5 = pixel_12bpc_max).
;   .end3        - add rows 2/3, clear the coefficient buffer, clamp to
;                  [0, m5] and store four rows (entered by the 10 bpc
;                  flipadst variant, which adds rows 0/1 itself).
;   .pass1_main, .pass1_main2, .pass1_main3 - first-pass helper stages.
;   .transpose_4x16_packed - pack + transpose helper shared by the other
;                  16x4 functions.
3488cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
3489    vpbroadcastd         m8, [clip_18b_min]
3490    vpbroadcastd         m9, [clip_18b_max]
3491.pass1:
    ; Each input is a 16-byte chunk, broadcast to both lanes; shufpd then
    ; merges two inputs into one register (pairs named in the comments).
3492    vbroadcasti128       m0, [cq+16* 0]
3493    vbroadcasti128       m4, [cq+16* 4]
3494    vbroadcasti128       m1, [cq+16* 2]
3495    vbroadcasti128       m7, [cq+16* 6]
3496    vbroadcasti128       m5, [cq+16*10]
3497    vbroadcasti128       m2, [cq+16* 8]
3498    vbroadcasti128       m6, [cq+16*12]
3499    vbroadcasti128       m3, [cq+16*14]
3500    shufpd               m0, m4, 0x0c ;  0  4
3501    shufpd               m1, m5, 0x0c ;  2 10
3502    shufpd               m2, m6, 0x0c ;  8 12
3503    shufpd               m3, m7, 0x0c ; 14  6
3504    call .pass1_main
    ; Odd inputs, paired the same way.
3505    vbroadcasti128      m10, [cq+16* 1]
3506    vbroadcasti128       m4, [cq+16* 5]
3507    vbroadcasti128      m11, [cq+16*15]
3508    vbroadcasti128       m5, [cq+16*11]
3509    shufpd              m10, m4, 0x0c ;  1  5
3510    shufpd              m11, m5, 0x0c ; 15 11
3511    vbroadcasti128       m5, [cq+16* 9]
3512    vbroadcasti128       m4, [cq+16*13]
3513    shufpd               m5, m4, 0x0c ;  9 13
3514    vbroadcasti128       m6, [cq+16* 7]
3515    vbroadcasti128       m4, [cq+16* 3]
3516    shufpd               m6, m4, 0x0c ;  7  3
3517    call .pass1_main2
    ; m4 = all ones (-1), so the psubd adds 1: rounding for the final >>1.
    ; It is applied to m0-m3 only because .pass1_main3 derives every output
    ; as a sum or difference with one of them.
3518    pcmpeqd              m4, m4
3519    REPX      {psubd x, m4}, m0, m1, m2, m3
3520    call .pass1_main3
3521    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
3522    jmp                tx2q
3523.pass2:
3524    call .transpose_4x16_packed
    ; NOTE(review): presumably the 8 bpc code expects r6 to point at this
    ; constant base - confirm against the 8 bpc source.
3525    lea                  r6, [deint_shuf+128]
3526    call m(idct_16x4_internal_8bpc).main
3527.end:
3528    vpbroadcastd         m4, [pw_2048]
3529    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
3530    vpbroadcastd         m5, [pixel_10bpc_max]
3531.end2:
3532    paddw                m0, [dstq+strideq*0]
3533    paddw                m1, [dstq+strideq*1]
3534.end3:
3535    lea                  r6, [dstq+strideq*2]
3536    paddw                m2, [r6  +strideq*0]
3537    paddw                m3, [r6  +strideq*1]
    ; Zero 8*32 bytes of the coefficient buffer; m4 = 0 also serves as the
    ; lower clamp bound.
3538    pxor                 m4, m4
3539    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
3540    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
3541    REPX     {pminsw x, m5}, m0, m1, m2, m3
3542    mova   [dstq+strideq*0], m0
3543    mova   [dstq+strideq*1], m1
3544    mova   [r6  +strideq*0], m2
3545    mova   [r6  +strideq*1], m3
3546    RET
; .pass1_main: even half. Sets m7 = pd_2048, runs the 10 bpc 8x4 idct core
; on m0-m3 and forms the idct8 outputs named in the comments (m0-m3).
; NOTE(review): the later stages rely on m7 still holding pd_2048, i.e. on
; idct_8x4 main preserving it - confirm.
3547ALIGN function_align
3548.pass1_main:
3549    vpbroadcastd         m7, [pd_2048]
3550    call m(idct_8x4_internal_10bpc).main
3551    psubd                m3, m0, m4   ; idct8 out7 out6
3552    paddd                m0, m4       ; idct8 out0 out1
3553    paddd                m1, m2, m5   ; idct8 out3 out2
3554    psubd                m2, m5       ; idct8 out4 out5
3555    ret
; .pass1_main2: odd half, first stages.
;   in:  m10 = (1 5), m11 = (15 11), m5 = (9 13), m6 = (7 3),
;        m7 = rounding constant, m8/m9 = clip min/max,
;        m0-m3 = even-half outputs
;   out: m10 = t8a/t9, m11 = t15a/t14, m5/m6 = products with pd_2896 that are
;        not yet rounded or shifted (finished in .pass1_main3)
; Also clamps m0-m3 with m8/m9. Clobbers m4, m12, m13.
3556ALIGN function_align
3557.pass1_main2:
3558    ITX_MULSUB_2D        10, 11, 4, 12, 13, 7,  401_1931, 4076_3612, 1
3559    ITX_MULSUB_2D         5,  6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
3560    vbroadcasti128      m12, [pd_3784_m3784]
3561    psubd                m4, m10, m5
3562    paddd               m10, m5       ;  t8  t11
3563    psignd               m4, m12      ;  t9  t10
3564    psubd                m5, m11, m6
3565    paddd               m11, m6       ; t15  t12
3566    psignd               m5, m12      ; t14  t13
3567    vpbroadcastd         m6, [pd_1567]
3568    vpbroadcastd        m13, [pd_3784]
3569    REPX     {pmaxsd x, m8}, m5, m4
3570    REPX     {pminsd x, m9}, m5, m4
3571    pmulld              m12, m5
3572    pmulld               m5, m6
3573    vbroadcasti128       m6, [pd_1567_m1567]
3574    pmulld              m13, m4
3575    pmulld               m4, m6
3576    REPX     {pmaxsd x, m8}, m10, m11, m0, m1
3577    REPX     {pminsd x, m9}, m10, m11, m0, m1
    ; Add the m7 rounding term once per output, combine and shift by 12.
3578    paddd               m12, m7
3579    paddd                m5, m7
3580    paddd                m4, m12
3581    psubd                m5, m13
3582    psrad                m4, 12       ; t14a t10a
3583    psrad                m5, 12       ; t9a  t13a
3584    vpbroadcastd        m12, [pd_2896]
3585    punpckhqdq           m6, m11, m5
3586    punpcklqdq          m11, m4
3587    punpckhqdq           m4, m10, m4
3588    punpcklqdq          m10, m5
3589    psubd                m5, m11, m6  ; t12a t13
3590    paddd               m11, m6       ; t15a t14
3591    psubd                m6, m10, m4  ; t11a t10
3592    paddd               m10, m4       ; t8a  t9
3593    REPX     {pmaxsd x, m8}, m5, m6
3594    REPX     {pminsd x, m9}, m5, m6
3595    pmulld               m5, m12
3596    pmulld               m6, m12
3597    REPX     {pmaxsd x, m8}, m2, m3, m11, m10
3598    REPX     {pminsd x, m9}, m2, m3, m11, m10
3599    ret
; .pass1_main3: final butterflies.
;   in:  m0-m3 = even half, m10/m11 = odd-half terms, m5/m6 = unshifted
;        products from .pass1_main2, m7 = rounding constant
;   out: m0-m7 = output pairs as named in the comments; the 64-bit halves of
;        m1, m3, m5 and m7 are swapped at the end (pshufd q1032).
3600ALIGN function_align
3601.pass1_main3:
3602    paddd                m5, m7
3603    psubd                m4, m5, m6
3604    paddd                m5, m6
3605    psrad                m4, 12      ; t11 t10a
3606    psrad                m5, 12      ; t12 t13a
3607    psubd                m7, m0, m11 ; out15 out14
3608    paddd                m0, m11     ; out0  out1
3609    psubd                m6, m1, m5  ; out12 out13
3610    paddd                m1, m5      ; out3  out2
3611    psubd                m5, m2, m4  ; out11 out10
3612    paddd                m2, m4      ; out4  out5
3613    psubd                m4, m3, m10 ; out8  out9
3614    paddd                m3, m10     ; out7  out6
3615    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
3616    ret
; .transpose_4x16_packed: narrows m0-m7 (dwords) to four word registers and
; rearranges them for the second pass.
;   in:  m0-m7
;   out: m0-m3
; Clobbers m4, m6 and m8 (m8 holds the deint_shuf byte-shuffle mask).
3617ALIGN function_align
3618.transpose_4x16_packed:
3619    vbroadcasti128       m8, [deint_shuf]
3620    packssdw             m0, m1
3621    packssdw             m2, m3
3622    packssdw             m4, m5
3623    packssdw             m6, m7
3624    REPX     {pshufb x, m8}, m0, m2, m4, m6
3625    punpckhqdq           m1, m0, m2
3626    punpcklqdq           m0, m2
3627    punpckhqdq           m2, m4, m6
3628    punpcklqdq           m4, m6
    ; Regroup the 128-bit lanes: high lanes go to m3/m2, low lanes to m1/m0.
3629    vperm2i128           m3, m1, m2, 0x31
3630    vinserti128          m1, xm2, 1
3631    vperm2i128           m2, m0, m4, 0x31
3632    vinserti128          m0, xm4, 1
3633    ret
3634
; 10 bpc 16x4 instantiations with adst as the first transform type.
3635INV_TXFM_16X4_FN adst, dct
3636INV_TXFM_16X4_FN adst, adst
3637INV_TXFM_16X4_FN adst, flipadst
3638INV_TXFM_16X4_FN adst, identity
3639
; iadst_16x4_internal_10bpc: 16x4 inverse ADST, 10 bpc.
;   entry      - loads clip_18b_min/max into m12/m13 and falls into .pass1.
;   .pass1     - first pass via the 10 bpc 4x16 ADST core; also entered by
;                the 12 bpc variant with its own clip limits.
;   .pass1_end - shared with the flipadst variant: swaps 64-bit halves of
;                m0/m2/m4/m6, shifts everything right by 1, jumps to tx2q.
;   .pass2     - second pass: pack/transpose, 8 bpc 16x4 ADST core, then the
;                shared idct 16x4 .end tail.
;   .main      - 4-point ADST multiply stage on 32-bit data in cq (used by
;                the 12 bpc second pass).
;   .main_end  - combines the .main terms into eight unshifted outputs.
3640cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
3641    vpbroadcastd        m12, [clip_18b_min]
3642    vpbroadcastd        m13, [clip_18b_max]
3643.pass1:
3644    call m(iadst_4x16_internal_10bpc).main
    ; m11 is shifted down to serve as the +1 rounding term for the >>1 in
    ; .pass1_end. The core leaves its outputs in m0-m3 and m5-m8; the upper
    ; four are moved down by one register while the rounding is added.
3645    psrad               m11, 11 ; pd_1
3646    REPX     {paddd x, m11}, m0, m1, m2, m3
3647    paddd                m4, m5, m11
3648    paddd                m5, m6, m11
3649    paddd                m6, m7, m11
3650    paddd                m7, m8, m11
3651.pass1_end:
3652    REPX {pshufd x, x, q1032}, m0, m2, m4, m6
3653    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
3654    jmp                tx2q
3655.pass2:
3656    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
3657    lea                  r6, [deint_shuf+128]
3658    call m(iadst_16x4_internal_8bpc).main
3659    jmp m(idct_16x4_internal_10bpc).end
; .main: multiply stage of the 4-point ADST on 32-bit coefficients.
;   in:  [cq+32*0..7]; rows 0/1 are in0, rows 6/7 are in3 and rows 4/5 are
;        in2 (per the existing comments); rows 2/3 are multiplied by
;        pd_m3344 to give -t3.
;   out: t0 in m4/m7, t1 in m5/m9, t2 in m2/m0, -t3 in m3/m1 (first/second
;        row of each pair); nothing is rounded or shifted yet.
; Clobbers m6, m8 and m10.
3660ALIGN function_align
3661.main:
3662    vpbroadcastd         m6, [pd_1321]
3663    mova                 m0, [cq+32*0]
3664    mova                 m1, [cq+32*1]
3665    vpbroadcastd         m7, [pd_2482]
3666    mova                 m2, [cq+32*6]
3667    mova                 m3, [cq+32*7]
3668    pmulld               m4, m0, m6
3669    pmulld               m5, m1, m6    ; 1321*in0
3670    pmulld               m9, m2, m7
3671    pmulld               m8, m3, m7    ; 2482*in3
3672    paddd                m4, m9
3673    paddd                m8, m5        ; 1321*in0 + 2482*in3
3674    pmulld               m5, m0, m7
3675    pmulld               m9, m1, m7    ; 2482*in0
3676    paddd                m0, m2
3677    paddd                m1, m3        ; in0 + in3
3678    paddd                m7, m6        ; pd_3803
3679    pmulld               m2, m7
3680    pmulld               m3, m7        ; 3803*in3
3681    psubd                m5, m2
3682    psubd                m9, m3        ; 2482*in0 - 3803*in3
3683    mova                 m2, [cq+32*4]
3684    pmulld              m10, m7, m2
3685    pmulld               m3, m6, m2
3686    psubd                m2, m0
3687    mova                 m0, [cq+32*5]
3688    pmulld               m7, m0        ; 3803*in2
3689    pmulld               m6, m0        ; 1321*in2
3690    psubd                m0, m1        ; in2 - in0 - in3
3691    vpbroadcastd         m1, [pd_m3344]
3692    paddd                m4, m10
3693    paddd                m7, m8        ; t0
3694    psubd                m5, m3
3695    psubd                m9, m6        ; t1
3696    pmulld               m2, m1
3697    pmulld               m0, m1        ; t2
3698    pmulld               m3, m1, [cq+32*2]
3699    pmulld               m1, [cq+32*3] ; -t3
3700    ret
; .main_end: adds the rounding term and combines the .main results.
;   in:  .main outputs, m6 = rounding constant
;   out: out0..out7 in m4, m5, m2, m3, m8, m9, m6, m7 (still unshifted).
; m3 and m1 hold -t3, so the psubd lines below effectively add t3.
3701ALIGN function_align
3702.main_end:
3703    ; expects: m6 = rnd
3704    paddd                m5, m6
3705    paddd                m9, m6
3706    paddd               m10, m4, m5
3707    paddd                m4, m6
3708    paddd                m8, m7, m6
3709    paddd                m7, m9
3710    psubd                m4, m3        ; out0 (unshifted)
3711    psubd                m5, m3        ; out1 (unshifted)
3712    paddd                m2, m6        ; out2 (unshifted)
3713    paddd                m3, m10       ; out3 (unshifted)
3714    psubd                m8, m1        ; out4 (unshifted)
3715    psubd                m9, m1        ; out5 (unshifted)
3716    paddd                m6, m0        ; out6 (unshifted)
3717    paddd                m7, m1        ; out7 (unshifted)
3718    ret
3719
; 10 bpc 16x4 instantiations with flipadst as the first transform type.
3720INV_TXFM_16X4_FN flipadst, dct
3721INV_TXFM_16X4_FN flipadst, adst
3722INV_TXFM_16X4_FN flipadst, flipadst
3723INV_TXFM_16X4_FN flipadst, identity
3724
; iflipadst_16x4_internal_10bpc: 16x4 inverse flipped ADST, 10 bpc.
;   entry/.pass1 - same core as the adst variant; the outputs are written to
;                  m0-m7 in reversed register order while the +1 rounding
;                  term is added, then the shared adst .pass1_end runs.
;   .pass2       - same core as the adst second pass; the four result rows
;                  are scaled in reversed order so the block is written
;                  flipped, then the shared idct 16x4 .end3 tail runs.
3725cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
3726    vpbroadcastd        m12, [clip_18b_min]
3727    vpbroadcastd        m13, [clip_18b_max]
3728.pass1:
3729    call m(iadst_4x16_internal_10bpc).main
3730    psrad               m11, 11 ; pd_1
    ; Reverse the register order (core outputs are in m0-m3 and m5-m8) while
    ; adding the rounding term. Each source is read before the instruction
    ; that overwrites it, so the sequence must stay in this order.
3731    paddd                m4, m3, m11
3732    paddd                m3, m5, m11
3733    paddd                m5, m2, m11
3734    paddd                m2, m6, m11
3735    paddd                m6, m1, m11
3736    paddd                m1, m7, m11
3737    paddd                m7, m0, m11
3738    paddd                m0, m8, m11
3739    jmp m(iadst_16x4_internal_10bpc).pass1_end
3740.pass2:
3741    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
3742    lea                  r6, [deint_shuf+128]
3743    call m(iadst_16x4_internal_8bpc).main
3744    vpbroadcastd         m4, [pw_2048]
    ; Scale and reverse: row 0 <- m3, row 1 <- m2, row 2 <- m1, row 3 <- m0.
3745    pmulhrsw             m5, m3, m4
3746    pmulhrsw             m6, m2, m4
3747    pmulhrsw             m2, m1, m4
3748    pmulhrsw             m3, m0, m4
    ; Rows 0/1 are added here because m0/m1 were just repurposed; .end3 adds
    ; rows 2/3, clamps against m5 and stores.
3749    paddw                m0, m5, [dstq+strideq*0]
3750    paddw                m1, m6, [dstq+strideq*1]
3751    vpbroadcastd         m5, [pixel_10bpc_max]
3752    jmp m(idct_16x4_internal_10bpc).end3
3753
; 10 bpc 16x4 instantiations with identity as the first transform type.
3754INV_TXFM_16X4_FN identity, dct
3755INV_TXFM_16X4_FN identity, adst
3756INV_TXFM_16X4_FN identity, flipadst
3757INV_TXFM_16X4_FN identity, identity
3758
; iidentity_16x4_internal_10bpc: 16x4 identity transform, 10 bpc.
;   entry  - first pass: for each 32-bit coefficient computes
;            (x * pd_5793 + pd_3072) >> 12, then jumps to tx2q.
;   .pass2 - second pass on packed 16-bit data: x += pmulhrsw(x, pw_1697x8)
;            with a saturating add, then the shared idct 16x4 .end tail.
3759cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
3760    vpbroadcastd         m8, [pd_5793]
    ; Load the eight 32-byte rows with their two middle qwords swapped
    ; (q3120); the trailing comments name the input pair in each register.
3761    vpermq               m0, [cq+32*0], q3120 ; 0 1
3762    vpermq               m1, [cq+32*1], q3120 ; 2 3
3763    vpermq               m2, [cq+32*2], q3120 ; 4 5
3764    vpermq               m3, [cq+32*3], q3120 ; 6 7
3765    vpermq               m4, [cq+32*4], q3120 ; 8 9
3766    vpermq               m5, [cq+32*5], q3120 ; a b
3767    vpermq               m6, [cq+32*6], q3120 ; c d
3768    vpermq               m7, [cq+32*7], q3120 ; e f
3769    vpbroadcastd         m9, [pd_3072]
3770    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
3771    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
3772    REPX     {psrad  x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
3773    jmp                tx2q
3774.pass2:
3775    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
3776    vpbroadcastd         m7, [pw_1697x8]
    ; m4-m6 and m7 receive the scaled copies; m7 doubles as the multiplier
    ; and is overwritten by the last product.
3777    pmulhrsw             m4, m7, m0
3778    pmulhrsw             m5, m7, m1
3779    pmulhrsw             m6, m7, m2
3780    pmulhrsw             m7, m3
3781    paddsw               m0, m4
3782    paddsw               m1, m5
3783    paddsw               m2, m6
3784    paddsw               m3, m7
3785    jmp m(idct_16x4_internal_10bpc).end
3786
; 12 bpc 16x4 instantiations with dct as the first transform type.
3787INV_TXFM_16X4_FN dct, dct,      12
3788INV_TXFM_16X4_FN dct, identity, 12
3789INV_TXFM_16X4_FN dct, adst,     12
3790INV_TXFM_16X4_FN dct, flipadst, 12
3791
; idct_16x4_internal_12bpc: 12 bpc variant of the 16x4 inverse DCT.
;   entry  - first pass: loads clip_20b_min/max into m8/m9 and reuses the
;            10 bpc .pass1.
;   .pass2 - second pass kept in 32-bit precision: clamp with
;            clip_18b_min/max, transpose in registers, run the 10 bpc 4x16
;            first-pass helpers as the 4-point DCT, shift right by 4, pack
;            to words and continue in the shared .end2 tail with
;            m5 = pixel_12bpc_max.
3792cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
3793    vpbroadcastd         m8, [clip_20b_min]
3794    vpbroadcastd         m9, [clip_20b_max]
3795    jmp m(idct_16x4_internal_10bpc).pass1
3796.pass2:
3797    vpbroadcastd        m12, [clip_18b_min]
3798    vpbroadcastd        m13, [clip_18b_max]
3799    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
3800    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
3801    ; deinterleave
3802    REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
3803    ; transpose
3804    punpcklqdq           m8, m0, m1
3805    punpckhqdq           m0, m1
3806    punpcklqdq           m9, m2, m3
3807    punpckhqdq           m2, m3
3808    punpcklqdq          m10, m4, m5
3809    punpckhqdq           m4, m5
3810    punpcklqdq          m11, m6, m7
3811    punpckhqdq           m6, m7
    ; Lane regrouping; m12/m13 are reused as data registers from here on
    ; (the clip constants are no longer needed).
3812    vperm2i128           m3,  m0,  m2, 0x31   ; out6
3813    vperm2i128           m1,  m0,  m2, 0x20   ; out2
3814    vperm2i128           m7,  m4,  m6, 0x31   ; out7
3815    vperm2i128           m5,  m4,  m6, 0x20   ; out3
3816    vperm2i128          m13, m10, m11, 0x31   ; out5
3817    vperm2i128          m12, m10, m11, 0x20   ; out1
3818    vperm2i128          m11,  m8,  m9, 0x31   ; out4
3819    vperm2i128          m10,  m8,  m9, 0x20   ; out0
3820    call m(idct_4x16_internal_10bpc).pass1_main
    ; NOTE(review): m6 is used as a multiplier here, so pass1_main presumably
    ; returns a broadcast constant in it - confirm against the 4x16 code.
3821    pmulld               m0, m6, m10
3822    pmulld               m2, m6, m11
3823    pmulld               m4, m6, m12
3824    pmulld               m6, m13
3825    vpbroadcastd        m10, [pd_17408]
3826    call m(idct_4x16_internal_10bpc).pass1_main2
3827    REPX       {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Pack m[k] with m[k+4] to words, then undo the lane interleave.
3828    packssdw             m0, m4
3829    packssdw             m1, m5
3830    packssdw             m2, m6
3831    packssdw             m3, m7
3832    vpbroadcastd         m5, [pixel_12bpc_max]
3833    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
3834    jmp m(idct_16x4_internal_10bpc).end2
3835
; 12 bpc 16x4 instantiations with adst as the first transform type.
3836INV_TXFM_16X4_FN adst, dct,      12
3837INV_TXFM_16X4_FN adst, adst,     12
3838INV_TXFM_16X4_FN adst, flipadst, 12
3839INV_TXFM_16X4_FN adst, identity, 12
3840
; iadst_16x4_internal_12bpc: 12 bpc variant of the 16x4 inverse ADST.
;   entry       - first pass: loads clip_20b_min/max into m12/m13 and reuses
;                 the 10 bpc .pass1.
;   .pass2      - second pass: .pass2_main, lane fix-up, scale by the
;                 pw_16384 constant left in m4, then the shared idct 16x4
;                 .end2 tail.
;   .pass2_main - clamp, transpose through cq, 4-point ADST in 32-bit
;                 precision, shift and pack; also called by the flipadst
;                 variant.
3841cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
3842    vpbroadcastd        m12, [clip_20b_min]
3843    vpbroadcastd        m13, [clip_20b_max]
3844    jmp m(iadst_16x4_internal_10bpc).pass1
3845.pass2:
3846    call .pass2_main
3847    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
3848    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
3849    jmp m(idct_16x4_internal_10bpc).end2
; .pass2_main:
;   in:  m0-m7 = first-pass output (dwords)
;   out: m0-m3 = four packed word rows (lanes not yet fixed up),
;        m4 = pw_16384, m5 = pixel_12bpc_max
; Uses [cq+32*0..7] as the input buffer for the shared ADST .main routine.
3850ALIGN function_align
3851.pass2_main:
3852    vpbroadcastd        m12, [clip_18b_min]
3853    vpbroadcastd        m13, [clip_18b_max]
    ; Lower clip bound. m4/m5 are clamped into m8/m9 rather than in place.
    ; NOTE(review): presumably because transpose_4x8 uses m4/m5 as scratch -
    ; confirm against its definition.
3854    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
3855    pmaxsd               m8, m4, m12
3856    pmaxsd               m9, m5, m12
3857    REPX    {pminsd x, m13}, m0, m1, m2, m3
3858    call m(iadst_8x4_internal_12bpc).transpose_4x8
    ; First half goes to the even 32-byte rows of cq.
3859    mova          [cq+32*0], m0
3860    mova          [cq+32*2], m1
3861    mova          [cq+32*4], m2
3862    mova          [cq+32*6], m3
    ; Second half: apply the upper clip bound while moving into m0-m3, then
    ; transpose and store to the odd rows.
3863    pminsd               m0, m8, m13
3864    pminsd               m1, m9, m13
3865    pminsd               m2, m6, m13
3866    pminsd               m3, m7, m13
3867    call m(iadst_8x4_internal_12bpc).transpose_4x8
3868    mova          [cq+32*1], m0
3869    mova          [cq+32*3], m1
3870    mova          [cq+32*5], m2
3871    mova          [cq+32*7], m3
3872    call m(iadst_16x4_internal_10bpc).main
    ; .main_end expects the rounding constant in m6.
3873    vpbroadcastd         m6, [pd_2048]
3874    call m(iadst_16x4_internal_10bpc).main_end
    ; .main_end leaves out0..out7 in m4, m5, m2, m3, m8, m9, m6, m7; shift
    ; each by 15 and renumber them into m0..m7.
3875    psrad                m0, m4, 15
3876    psrad                m1, m5, 15
3877    psrad                m2, 15
3878    psrad                m3, 15
3879    psrad                m4, m8, 15
3880    psrad                m5, m9, 15
3881    psrad                m6, 15
3882    psrad                m7, 15
3883    packssdw             m0, m4
3884    packssdw             m1, m5
3885    packssdw             m2, m6
3886    packssdw             m3, m7
3887    vpbroadcastd         m4, [pw_16384]
3888    vpbroadcastd         m5, [pixel_12bpc_max]
3889    ret
3890
; 12 bpc 16x4 instantiations with flipadst as the first transform type.
3891INV_TXFM_16X4_FN flipadst, dct,      12
3892INV_TXFM_16X4_FN flipadst, adst,     12
3893INV_TXFM_16X4_FN flipadst, flipadst, 12
3894INV_TXFM_16X4_FN flipadst, identity, 12
3895
; iflipadst_16x4_internal_12bpc: 12 bpc variant of the 16x4 inverse flipped
; ADST.
;   entry  - first pass: loads clip_20b_min/max into m12/m13 and reuses the
;            10 bpc flipadst .pass1.
;   .pass2 - second pass: shares .pass2_main with the 12 bpc adst function,
;            then reverses the order of the four result rows while fixing up
;            the lanes and scaling, and continues in the shared .end2 tail.
3896cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
3897    vpbroadcastd        m12, [clip_20b_min]
3898    vpbroadcastd        m13, [clip_20b_max]
3899    jmp m(iflipadst_16x4_internal_10bpc).pass1
3900.pass2:
    ; Returns rows in m0-m3, m4 = pw_16384 and m5 = pixel_12bpc_max.
3901    call m(iadst_16x4_internal_12bpc).pass2_main
    ; Row reversal: new m0 <- m3, m1 <- m2, m2 <- m1 (via m6), m3 <- m0 (via
    ; m7). m0 and m1 are saved to m7/m6 first because they are overwritten.
3902    vpermq               m7, m0, q3120
3903    vpermq               m6, m1, q3120
3904    vpermq               m1, m2, q3120
3905    vpermq               m0, m3, q3120
3906    pmulhrsw             m0, m4
3907    pmulhrsw             m1, m4
3908    pmulhrsw             m2, m6, m4
3909    pmulhrsw             m3, m7, m4
3910    jmp m(idct_16x4_internal_10bpc).end2
3911
; 12 bpc 16x4 instantiations with identity as the first transform type.
3912INV_TXFM_16X4_FN identity, dct,      12
3913INV_TXFM_16X4_FN identity, adst,     12
3914INV_TXFM_16X4_FN identity, flipadst, 12
3915INV_TXFM_16X4_FN identity, identity, 12
3916
; iidentity_16x4_internal_12bpc: 12 bpc variant of the 16x4 identity
; transform.
;   entry  - first pass: for each 32-bit coefficient computes
;            x + ((x * pd_1697 + pd_3072) >> 12), then jumps to tx2q.
;            The work is done in two groups of four registers, with the
;            second group loaded while the first is being processed.
;   .pass2 - second pass: clamp with clip_18b_min/max, compute
;            (x * pd_5793 + pd_2048) >> 15, pack/transpose to words, scale
;            with pw_16384 and continue in the shared idct 16x4 .end2 tail
;            with m5 = pixel_12bpc_max.
3917cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
3918    vpbroadcastd         m8, [pd_1697]
3919    vpermq               m0, [cq+32*0], q3120 ; 0 1
3920    vpermq               m1, [cq+32*1], q3120 ; 2 3
3921    vpermq               m2, [cq+32*2], q3120 ; 4 5
3922    vpermq               m3, [cq+32*3], q3120 ; 6 7
3923    vpbroadcastd         m9, [pd_3072]
    ; First group: m4-m7 = scaled copies of m0-m3.
3924    pmulld               m4, m8, m0
3925    pmulld               m5, m8, m1
3926    pmulld               m6, m8, m2
3927    pmulld               m7, m8, m3
3928    vpermq              m10, [cq+32*4], q3120 ; 8 9
3929    vpermq              m11, [cq+32*5], q3120 ; a b
3930    vpermq              m12, [cq+32*6], q3120 ; c d
3931    vpermq              m13, [cq+32*7], q3120 ; e f
3932    REPX     {paddd  x, m9}, m4, m5, m6, m7
3933    REPX     {psrad  x, 12}, m4, m5, m6, m7
    ; Finish the first group (results in m0-m3) interleaved with the
    ; multiplies of the second group, which reuse m4-m7 right after each
    ; has been consumed.
3934    paddd                m0, m4
3935    pmulld               m4, m8, m10
3936    paddd                m1, m5
3937    pmulld               m5, m8, m11
3938    paddd                m2, m6
3939    pmulld               m6, m8, m12
3940    paddd                m3, m7
3941    pmulld               m7, m8, m13
3942    REPX     {paddd  x, m9}, m4, m5, m6, m7
3943    REPX     {psrad  x, 12}, m4, m5, m6, m7
    ; Second group results end up in m4-m7.
3944    paddd                m4, m10
3945    paddd                m5, m11
3946    paddd                m6, m12
3947    paddd                m7, m13
3948    jmp                tx2q
3949.pass2:
3950    vpbroadcastd        m12, [clip_18b_min]
3951    vpbroadcastd        m13, [clip_18b_max]
3952    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
3953    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
3954    vpbroadcastd         m8, [pd_5793]
3955    vpbroadcastd         m9, [pd_2048]
3956    REPX     {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
3957    REPX     {paddd  x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
3958    REPX     {psrad  x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
    ; transpose_4x16_packed narrows to words and returns four rows in m0-m3
    ; (it clobbers m8).
3959    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
3960    vpbroadcastd         m4, [pw_16384]
3961    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
3962    vpbroadcastd         m5, [pixel_12bpc_max]
3963    jmp m(idct_16x4_internal_10bpc).end2
3964
; INV_TXFM_16X8_FN type1, type2 [, bitdepth = 10]
; Instantiates a 16x8 transform through INV_TXFM_FN (defined outside this
; chunk). For the dct_dct combination it also emits the DC-only prologue:
;     r6d = dc * 181
;     r6d = (r6d + 128) >> 8
;     r6d = r6d * 181
; and then jumps to .dconly2 of the 10 bpc 16x4 dct_dct function, which
; applies its own (+384) >> 9 step and the shared store loop. m3 is loaded
; with the dconly constant of the requested bitdepth and r3d gets bit 3 set
; (8 rows for the shared loop when r3d was 0).
; NOTE(review): r3d presumably aliases eobd, which the existing "; 0" comment
; indicates is 0 on this path - confirm against INV_TXFM_FN.
3965%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
3966    INV_TXFM_FN          %1, %2, 0, 16x8, %3
3967%ifidn %1_%2, dct_dct
3968    imul                r6d, [cq], 181
3969    vpbroadcastd         m3, [dconly_%3bpc]
    ; Clear the DC coefficient by storing eobd over it.
3970    mov                [cq], eobd ; 0
3971    or                  r3d, 8
3972    add                 r6d, 128
3973    sar                 r6d, 8
3974    imul                r6d, 181
3975    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
3976%endif
3977%endmacro
3978
; 10 bpc 16x8 instantiations with dct as the first transform type (bitdepth
; defaults to 10 in the macro). Only the dct, dct line expands the DC-only
; prologue of INV_TXFM_16X8_FN.
3979INV_TXFM_16X8_FN dct, dct
3980INV_TXFM_16X8_FN dct, identity
3981INV_TXFM_16X8_FN dct, adst
3982INV_TXFM_16X8_FN dct, flipadst
3983
; 16x8 inverse DCT, 10 bpc variant. Pass 1 starts at the top and finishes with
; an indirect jump through tx2q; .pass2 is a separate entry label. The .pass1,
; .end/.end2, .transpose/.transpose2 and .write_16x4_start/.write_16x4_zero
; labels are also targeted from other functions in this file.
3984cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    ; m12/m13 = clip min/max (presumably consumed by the .main_* helpers
    ; called below - confirm against their definitions)
3985    vpbroadcastd        m12, [clip_18b_min]
3986    vpbroadcastd        m13, [clip_18b_max]
    ; The 12 bpc variant jumps here after loading its own m12/m13.
3987.pass1:
3988    vpbroadcastd        m14, [pd_2896]
    ; Odd-numbered 32-byte coefficient vectors, each multiplied by m14.
3989    pmulld               m0, m14, [cq+32* 1]
3990    pmulld               m1, m14, [cq+32* 3]
3991    pmulld               m2, m14, [cq+32* 5]
3992    pmulld               m3, m14, [cq+32* 7]
3993    pmulld               m4, m14, [cq+32* 9]
3994    pmulld               m5, m14, [cq+32*11]
3995    pmulld               m6, m14, [cq+32*13]
3996    pmulld               m7, m14, [cq+32*15]
3997    vpbroadcastd        m11, [pd_2048]
    ; r6 = middle of the stack scratch area; .pass1_rotations reads it back
    ; as [r6-32*4] .. [r6+32*3].
3998    lea                  r6, [rsp+32*4]
3999    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    ; Even-numbered coefficient vectors, multiplied by m14 the same way.
4000    pmulld               m0, m14, [cq+32* 0]
4001    pmulld               m1, m14, [cq+32* 2]
4002    pmulld               m2, m14, [cq+32* 4]
4003    pmulld               m3, m14, [cq+32* 6]
4004    pmulld               m4, m14, [cq+32* 8]
4005    pmulld               m5, m14, [cq+32*10]
4006    pmulld               m6, m14, [cq+32*12]
4007    pmulld               m7, m14, [cq+32*14]
4008    call m(idct_8x8_internal_10bpc).main_rect2
4009    call m(idct_8x16_internal_10bpc).main_evenhalf
4010    psrld               m11, 11 ; pd_1
    ; The rounding term is added to m0-m7 only: .pass1_rotations forms both
    ; the sum and the difference from these registers, so all 16 outputs
    ; inherit it.
4011    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
4012    call .pass1_rotations
4013    REPX       {psrad x, 1}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4014                             m8,  m9,  m10, m11, m12, m13, m14, m15
    ; Continue at whatever address the caller placed in tx2q.
4015    jmp                tx2q
4016.pass2:
4017    call .transpose
4018    call m(idct_16x8_internal_8bpc).main
    ; pmulhrsw by 2048 is a rounded >>4 (assuming pw_2048 holds 2048 per word).
4019    vpbroadcastd        m10, [pw_2048]
    ; .end: scale m0-m3 by m10 and add them to four dst rows. The identity
    ; variant jumps here with a different m10.
4020.end:
4021    pmulhrsw             m0, m10
4022    pmulhrsw             m1, m10
4023    pmulhrsw             m2, m10
4024    pmulhrsw             m3, m10
4025    call .write_16x4_start
    ; .end2: same for m4-m7 and the next four rows. Requires m8, m9 and r3
    ; as set up by a previous .write_16x4_start.
4026.end2:
4027    pmulhrsw             m0, m4, m10
4028    pmulhrsw             m1, m5, m10
4029    pmulhrsw             m2, m6, m10
4030    pmulhrsw             m3, m7, m10
4031    call .write_16x4_zero
4032    RET
4033ALIGN function_align
    ; Final add/sub stage of pass 1. In: m0-m7 plus eight vectors in the
    ; scratch area at [r6-32*4] .. [r6+32*3]. Out: m0-m15 = out0-out15.
4034.pass1_rotations:
4035    mova                m14, [r6-32*4]
4036    mova                m13, [r6-32*3]
4037    mova                m12, [r6-32*2]
4038    mova                m11, [r6-32*1]
4039    mova                m10, [r6+32*0]
4040    mova                 m9, [r6+32*1]
4041    mova                 m8, [r6+32*2]
4042    psubd               m15, m0, m14       ; out15
4043    paddd                m0, m14           ; out0
4044    psubd               m14, m1, m13       ; out14
4045    paddd                m1, m13           ; out1
4046    psubd               m13, m2, m12       ; out13
4047    paddd                m2, m12           ; out2
4048    psubd               m12, m3, m11       ; out12
4049    paddd                m3, m11           ; out3
4050    psubd               m11, m4, m10       ; out11
4051    paddd                m4, m10           ; out4
4052    psubd               m10, m5, m9        ; out10
4053    paddd                m5, m9            ; out5
4054    psubd                m9, m6, m8        ; out9
4055    paddd                m6, m8            ; out6
    ; The eighth scratch vector is used directly as a memory operand.
4056    psubd                m8, m7, [r6+32*3] ; out8
4057    paddd                m7, [r6+32*3]     ; out7
4058    ret
4059ALIGN function_align
    ; .transpose:  sets r6, then falls through to the pack + transpose.
    ; .transpose2: packs the dwords in m0-m7 / m8-m15 to saturated words.
    ; .transpose3: interleaves and lane-permutes the packed words.
    ; Result is left in m0-m7; m8 is clobbered.
4060.transpose:
    ; r6 is not read by the transpose itself; presumably it is the constant
    ; base pointer expected by the 8bpc code called afterwards - confirm.
4061    lea                  r6, [deint_shuf+128]
4062.transpose2:
4063    packssdw             m0, m8
4064    packssdw             m1, m9
4065    packssdw             m2, m10
4066    packssdw             m3, m11
4067    packssdw             m4, m12
4068    packssdw             m5, m13
4069    packssdw             m6, m14
4070    packssdw             m7, m15
4071.transpose3:
    ; 16-bit interleave
4072    punpckhwd            m8, m0, m1
4073    punpcklwd            m0, m1
4074    punpcklwd            m1, m2, m3
4075    punpckhwd            m2, m3
4076    punpckhwd            m3, m4, m5
4077    punpcklwd            m4, m5
4078    punpckhwd            m5, m6, m7
4079    punpcklwd            m6, m7
    ; 32-bit interleave
4080    punpckhdq            m7, m4, m6
4081    punpckldq            m4, m6
4082    punpckldq            m6, m8, m2
4083    punpckhdq            m8, m2
4084    punpckhdq            m2, m0, m1
4085    punpckldq            m0, m1
4086    punpckhdq            m1, m3, m5
4087    punpckldq            m3, m5
    ; 64-bit interleave
4088    punpcklqdq           m5, m6, m3
4089    punpckhqdq           m6, m3
4090    punpckhqdq           m3, m2, m7
4091    punpcklqdq           m2, m7
4092    punpcklqdq           m7, m8, m1
4093    punpckhqdq           m8, m1
4094    punpckhqdq           m1, m0, m4
4095    punpcklqdq           m0, m4
    ; 128-bit lane recombination: m0-m3 take the low lanes, m4-m7 the high.
4096    vperm2i128           m4, m0, m5, 0x31
4097    vinserti128          m0, xm5, 1
4098    vperm2i128           m5, m1, m6, 0x31
4099    vinserti128          m1, xm6, 1
4100    vperm2i128           m6, m2, m7, 0x31
4101    vinserti128          m2, xm7, 1
4102    vperm2i128           m7, m3, m8, 0x31
4103    vinserti128          m3, xm8, 1
4104    ret
4105ALIGN function_align
    ; Output helpers with three nested entry points:
    ;   .write_16x4_start - set m9 = pixel max, r3 = stride*3, m8 = 0
    ;   .write_16x4_zero  - store m8 over eight 32-byte coefficient vectors
    ;                       at cq and advance cq past them (m8 must be 0)
    ;   .write_16x4       - add m0-m3 to four dst rows, clamp to [m8, m9],
    ;                       store, and advance dstq by four rows
4106.write_16x4_start:
4107    vpbroadcastd         m9, [pixel_10bpc_max]
4108    lea                  r3, [strideq*3]
4109    pxor                 m8, m8
4110.write_16x4_zero:
4111    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
4112    add                  cq, 32*8
4113.write_16x4:
4114    paddw                m0, [dstq+strideq*0]
4115    paddw                m1, [dstq+strideq*1]
4116    paddw                m2, [dstq+strideq*2]
4117    paddw                m3, [dstq+r3       ]
4118    REPX     {pmaxsw x, m8}, m0, m1, m2, m3
4119    REPX     {pminsw x, m9}, m0, m1, m2, m3
4120    mova   [dstq+strideq*0], m0
4121    mova   [dstq+strideq*1], m1
4122    mova   [dstq+strideq*2], m2
4123    mova   [dstq+r3       ], m3
4124    lea                dstq, [dstq+strideq*4]
4125    ret
4126
; Expand INV_TXFM_16X8_FN once per (adst, type2) pair. No third argument is
; passed, so the macro's default for it applies.
4127INV_TXFM_16X8_FN adst, dct
4128INV_TXFM_16X8_FN adst, adst
4129INV_TXFM_16X8_FN adst, flipadst
4130INV_TXFM_16X8_FN adst, identity
4131
; 16x8 inverse ADST, 10 bpc variant. Pass 1 starts at the top and finishes
; (via .pass1_end) with an indirect jump through tx2q; .pass2 is a separate
; entry label. .pass1, .pass1_end and .main are also targeted from other
; functions in this file.
4132cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
    ; m13/m14 = clip min/max, as expected by .main
4133    vpbroadcastd        m13, [clip_18b_min]
4134    vpbroadcastd        m14, [clip_18b_max]
    ; The 12 bpc variant jumps here after loading its own m13/m14.
4135.pass1:
    ; r6 = middle of the stack scratch area used by .main
4136    lea                  r6, [rsp+32*4]
4137    call .main
    ; Rounding constants for .pass1_rotations. m15 still holds the value
    ; .main broadcast from pd_2896, hence the shift by 11.
4138    vpbroadcastd        m14, [pd_3072]
4139    psrld               m15, 11       ; pd_1
4140    psubd               m13, m14, m15 ; pd_3071
4141    call .pass1_rotations
    ; The flipadst variant jumps here after its own rotations.
4142.pass1_end:
    ; Outputs that received the pd_1 rounding are shifted by 1; the ones
    ; marked "(unshifted)" in .main are shifted by 12.
4143    REPX      {psrad x, 1 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
4144    REPX      {psrad x, 12}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
4145    jmp                tx2q
4146.pass2:
4147    call m(idct_16x8_internal_10bpc).transpose
4148    call m(iadst_16x8_internal_8bpc).main
4149    call m(iadst_16x8_internal_8bpc).main_pass2_end
    ; m10 = pw_2048, m11 = 0 - m10. Rows are scaled alternately by m10 and
    ; m11, i.e. every second row is also negated.
4150    vpbroadcastd        m10, [pw_2048]
4151    pxor                m11, m11
4152    psubw               m11, m10
4153    pmulhrsw             m0, m10
4154    pmulhrsw             m1, m11
4155    pmulhrsw             m2, m10
4156    pmulhrsw             m3, m11
4157    call m(idct_16x8_internal_10bpc).write_16x4_start
4158    pmulhrsw             m0, m4, m10
4159    pmulhrsw             m1, m5, m11
4160    pmulhrsw             m2, m6, m10
4161    pmulhrsw             m3, m7, m11
4162    call m(idct_16x8_internal_10bpc).write_16x4_zero
4163    RET
4164ALIGN function_align
    ; Adds the rounding constants and undoes the negation of the values that
    ; .main returns as -outN (computed as constant - x), leaving out0-out15
    ; in m0-m15 in order.
    ; In: register/scratch layout produced by .main, m15 = pd_1,
    ;     m14 = pd_3072, m13 = pd_3071.
4165.pass1_rotations:
4166    paddd                m0, m15
4167    psubd                m1, m15, m1
4168    paddd                m2, m15
4169    psubd                m3, m15, m3
4170    paddd                m4, m14
4171    psubd                m5, m13, m5
4172    paddd                m6, m14
4173    psubd                m7, m13, m7
4174    paddd                m8, m14, m9
4175    psubd                m9, m13, m10
4176    paddd               m10, m14, m11
4177    psubd               m11, m13, m12
    ; The constants in m13/m14/m15 are overwritten last, once no longer needed.
4178    paddd               m12, m15, [r6-32*1]
4179    psubd               m13, m15, [r6-32*2]
4180    paddd               m14, m15, [r6-32*3]
4181    psubd               m15,      [r6-32*4]
4182    ret
4183ALIGN function_align
    ; Shared 16-point ADST core. Two groups of eight coefficient vectors are
    ; each multiplied by m15, rounded with m12 and shifted right by 12. The
    ; first group goes through .main_part1, which parks its results in the
    ; scratch area at r6; the second falls through into .main_part2, which
    ; combines both halves.
    ; On return: m0 = out0, m1 = -out1, m2 = out2, m3 = -out3,
    ;   m4-m7 and m9-m12 hold the values commented "(unshifted)" below,
    ;   [r6-32*1] = out12, [r6-32*2] = -out13, [r6-32*3] = out14,
    ;   [r6-32*4] = -out15.
4184.main:
4185    ; expects: m13 = clip_min   m14 = clip_max
4186    vpbroadcastd        m15, [pd_2896]
4187    pmulld               m0, m15, [cq+32* 2]
4188    pmulld               m1, m15, [cq+32*13]
4189    pmulld               m2, m15, [cq+32* 6]
4190    pmulld               m3, m15, [cq+32* 9]
4191    pmulld               m4, m15, [cq+32*10]
4192    pmulld               m5, m15, [cq+32* 5]
4193    pmulld               m6, m15, [cq+32*14]
4194    pmulld               m7, m15, [cq+32* 1]
4195    vpbroadcastd        m12, [pd_2048]
4196    REPX     {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4197    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
4198    call .main_part1
4199    pmulld               m0, m15, [cq+32* 0]
4200    pmulld               m1, m15, [cq+32*15]
4201    pmulld               m2, m15, [cq+32* 4]
4202    pmulld               m3, m15, [cq+32*11]
4203    pmulld               m4, m15, [cq+32* 8]
4204    pmulld               m5, m15, [cq+32* 7]
4205    pmulld               m6, m15, [cq+32*12]
4206    pmulld               m7, m15, [cq+32* 3]
4207    REPX     {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4208    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
4209.main_part2:
4210    ITX_MULSUB_2D         1, 0, 8, 9, 10, 12,  201, 4091
4211    ITX_MULSUB_2D         3, 2, 8, 9, 10, 12, 1751, 3703
4212    ITX_MULSUB_2D         5, 4, 8, 9, 10, 12, 3035, 2751
4213    ITX_MULSUB_2D         7, 6, 8, 9, 10, 12, 3857, 1380
4214    psubd                m8, m0, m4 ; t8a
4215    paddd                m0, m4     ; t0a
4216    psubd                m4, m1, m5 ; t9a
4217    paddd                m1, m5     ; t1a
4218    psubd                m5, m2, m6 ; t12a
4219    paddd                m2, m6     ; t4a
4220    psubd                m6, m3, m7 ; t13a
4221    paddd                m7, m3     ; t5a
    ; Clamp every intermediate to [m13, m14].
4222    REPX    {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
4223    REPX    {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
4224    vpbroadcastd        m11, [pd_4017]
4225    vpbroadcastd        m10, [pd_799]
4226    ITX_MULSUB_2D         8, 4, 3, 9, _, 12, 10, 11
4227    ITX_MULSUB_2D         6, 5, 3, 9, _, 12, 11, 10
4228    psubd                m3, m0, m2 ; t4
4229    paddd                m0, m2     ; t0
4230    psubd                m2, m1, m7 ; t5
4231    paddd                m1, m7     ; t1
4232    psubd                m7, m4, m6 ; t12a
4233    paddd                m4, m6     ; t8a
4234    psubd                m6, m8, m5 ; t13a
4235    paddd                m5, m8     ; t9a
4236    REPX    {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
4237    REPX    {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
4238    vpbroadcastd        m11, [pd_3784]
4239    vpbroadcastd        m10, [pd_1567]
4240    ITX_MULSUB_2D         3, 2, 8, 9, _, 12, 10, 11
4241    ITX_MULSUB_2D         7, 6, 8, 9, _, 12, 10, 11
    ; From here on the .main_part1 results are read back from scratch. Values
    ; that part1 only clamped against the minimum get their pminsd on load.
4242    pminsd              m10, m14, [r6-32*4] ;  t2
4243    pminsd               m8, m14, [r6-32*3] ;  t3
4244    psubd                m9, m0, m10        ;  t2a
4245    paddd                m0, m10            ;  out0
4246    psubd               m10, m1, m8         ;  t3a
4247    paddd                m1, m8             ; -out15
4248    pmaxsd               m9, m13
4249    pmaxsd              m10, m13
4250    pminsd               m9, m14
4251    pminsd              m10, m14
    ; Scratch slots are reused for finished outputs as they free up.
4252    mova          [r6-32*4], m1
4253    mova                m11, [r6-32*1]      ;  t7a
4254    mova                 m1, [r6-32*2]      ;  t6a
4255    psubd                m8, m3, m11        ;  t7
4256    paddd               m11, m3             ;  out12
4257    paddd                m3, m2, m1         ; -out3
4258    psubd                m2, m1             ;  t6
4259    pmaxsd               m8, m13
4260    pmaxsd               m2, m13
4261    pminsd               m8, m14
4262    pminsd               m2, m14
4263    mova          [r6-32*1], m11
    ; t6 is parked in [r6-32*3] until the pd_1448 multiply below.
4264    mova          [r6-32*3], m2
4265    mova                 m1, [r6+32*3]      ;  t15
4266    mova                 m2, [r6+32*2]      ;  t14
4267    paddd               m12, m7, m1         ; -out13
4268    psubd                m7, m1             ;  t15a
4269    psubd               m11, m6, m2         ;  t14a
4270    paddd                m2, m6             ;  out2
4271    pmaxsd               m7, m13
4272    pmaxsd              m11, m13
4273    pminsd               m7, m14
4274    pminsd              m11, m14
4275    mova          [r6-32*2], m12
4276    pminsd               m1, m14, [r6+32*0] ;  t10a
4277    pminsd              m12, m14, [r6+32*1] ;  t11a
4278    psubd                m6, m4, m1         ;  t10
4279    paddd                m1, m4             ; -out1
4280    psubd                m4, m5, m12        ;  t11
4281    paddd                m5, m12            ;  out14
4282    vpbroadcastd        m12, [pd_1448]
4283    pmaxsd               m6, m13
4284    pmaxsd               m4, m13
4285    pminsd               m6, m14
4286    pminsd               m4, m14
    ; Multiply the eight remaining intermediates by m12 (pd_1448). The
    ; rounding and shift for them are applied later by the caller.
4287    REPX    {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
4288    pmulld              m12, [r6-32*3]      ;  t6
4289    mova          [r6-32*3], m5
4290    paddd                m5, m11, m7        ; -out5  (unshifted)
4291    psubd               m11, m7             ;  out10 (unshifted)
4292    paddd                m7, m9, m10        ; -out7  (unshifted)
4293    psubd                m9, m10            ;  out8  (unshifted)
4294    psubd               m10, m6, m4         ; -out9  (unshifted)
4295    paddd                m6, m4             ;  out6  (unshifted)
4296    paddd                m4, m12, m8        ;  out4  (unshifted)
4297    psubd               m12, m8             ; -out11 (unshifted)
4298    ret
    ; First half of the ADST core: processes m0-m7 and stores the eight
    ; results to the scratch area [r6-32*4] .. [r6+32*3] for .main_part2.
4299.main_part1:
4300    ITX_MULSUB_2D         1, 0, 8, 9, 10, 12,  995, 3973
4301    ITX_MULSUB_2D         3, 2, 8, 9, 10, 12, 2440, 3290
4302    ITX_MULSUB_2D         5, 4, 8, 9, 10, 12, 3513, 2106
4303    ITX_MULSUB_2D         7, 6, 8, 9, 10, 12, 4052,  601
4304    psubd                m8, m0, m4 ; t10a
4305    paddd                m0, m4     ; t2a
4306    psubd                m4, m1, m5 ; t11a
4307    paddd                m1, m5     ; t3a
4308    psubd                m5, m2, m6 ; t14a
4309    paddd                m2, m6     ; t6a
4310    psubd                m6, m3, m7 ; t15a
4311    paddd                m7, m3     ; t7a
4312    REPX    {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
4313    REPX    {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
4314    vpbroadcastd        m11, [pd_2276]
4315    vpbroadcastd        m10, [pd_3406]
4316    ITX_MULSUB_2D         8, 4, 3, 9, _, 12, 10, 11
4317    ITX_MULSUB_2D         6, 5, 3, 9, _, 12, 11, 10
4318    psubd                m3, m0, m2 ; t6
4319    paddd                m0, m2     ; t2
4320    psubd                m2, m1, m7 ; t7
4321    paddd                m1, m7     ; t3
4322    psubd                m7, m4, m6 ; t14a
4323    paddd                m4, m6     ; t10a
4324    psubd                m6, m8, m5 ; t15a
4325    paddd                m5, m8     ; t11a
4326    REPX    {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
4327    REPX    {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
4328    vpbroadcastd        m11, [pd_1567]
4329    vpbroadcastd        m10, [pd_3784]
4330    ITX_MULSUB_2D         2, 3, 8, 9, _, 12, 10, 11
4331    ITX_MULSUB_2D         6, 7, 8, 9, _, 12, 10, 11
    ; m0, m1, m4, m5 are stored with only the lower clamp applied;
    ; .main_part2 applies pminsd when it reloads them.
4332    mova          [r6-32*4], m0
4333    mova          [r6-32*3], m1
4334    mova          [r6+32*0], m4
4335    mova          [r6+32*1], m5
4336    mova          [r6-32*2], m2
4337    mova          [r6-32*1], m3
4338    mova          [r6+32*2], m6
4339    mova          [r6+32*3], m7
4340    ret
4341
; Expand INV_TXFM_16X8_FN once per (flipadst, type2) pair. No third argument
; is passed, so the macro's default for it applies.
4342INV_TXFM_16X8_FN flipadst, dct
4343INV_TXFM_16X8_FN flipadst, adst
4344INV_TXFM_16X8_FN flipadst, flipadst
4345INV_TXFM_16X8_FN flipadst, identity
4346
; 16x8 inverse flipped ADST, 10 bpc variant. It reuses the ADST .main and
; .pass1_end; only the rotation step (pass 1) and the row order written in
; pass 2 differ from iadst_16x8_internal_10bpc.
4347cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4348    vpbroadcastd        m13, [clip_18b_min]
4349    vpbroadcastd        m14, [clip_18b_max]
    ; The 12 bpc variant jumps here after loading its own m13/m14.
4350.pass1:
4351    lea                  r6, [rsp+32*4]
4352    call m(iadst_16x8_internal_10bpc).main
    ; Same constant setup as the ADST pass 1:
    ; m14 = pd_3072, m15 = pd_2896 >> 11, m13 = m14 - m15.
4353    vpbroadcastd        m14, [pd_3072]
4354    psrld               m15, 11
4355    psubd               m13, m14, m15
4356    call .pass1_rotations
4357    jmp m(iadst_16x8_internal_10bpc).pass1_end
4358.pass2:
4359    call m(idct_16x8_internal_10bpc).transpose
4360    call m(iadst_16x8_internal_8bpc).main
4361    call m(iadst_16x8_internal_8bpc).main_pass2_end
    ; m10 = pw_2048, m11 = 0 - m10
4362    vpbroadcastd        m10, [pw_2048]
4363    pxor                m11, m11
4364    psubw               m11, m10
    ; Write the rows in reverse order: m7..m4 are scaled into m0..m3 for the
    ; first four rows while the original m0..m3 are saved in m12, m7, m6, m5
    ; for the last four.
4365    mova                m12, m0
4366    pmulhrsw             m0, m7, m11
4367    mova                 m7, m1
4368    pmulhrsw             m1, m6, m10
4369    mova                 m6, m2
4370    pmulhrsw             m2, m5, m11
4371    mova                 m5, m3
4372    pmulhrsw             m3, m4, m10
4373    call m(idct_16x8_internal_10bpc).write_16x4_start
4374    pmulhrsw             m0, m5, m11
4375    pmulhrsw             m1, m6, m10
4376    pmulhrsw             m2, m7, m11
4377    pmulhrsw             m3, m12, m10
4378    call m(idct_16x8_internal_10bpc).write_16x4_zero
4379    RET
4380ALIGN function_align
    ; Same arithmetic as the ADST .pass1_rotations, but each result lands in
    ; the mirrored register (out15 in m0 ... out0 in m15). The statement
    ; order matters: every register is overwritten only after its last read.
    ; In: layout produced by iadst .main, m15 = pd_1, m14 = pd_3072,
    ;     m13 = pd_3071.
4381.pass1_rotations:
4382    psubd                m8, m13, m7
4383    paddd                m7, m14, m9
4384    paddd                m9, m14, m6
4385    psubd                m6, m13, m10
4386    psubd               m10, m13, m5
4387    paddd                m5, m14, m11
4388    paddd               m11, m14, m4
4389    psubd                m4, m13, m12
4390    psubd               m12, m15, m3
4391    paddd                m3, m15, [r6-32*1]
4392    paddd               m13, m15, m2
4393    psubd                m2, m15, [r6-32*2]
4394    psubd               m14, m15, m1
    ; Keep a copy of the pd_1 constant in m1 because m15 is overwritten next.
4395    mova                 m1, m15
4396    paddd               m15, m0
4397    psubd                m0, m1, [r6-32*4]
4398    paddd                m1,     [r6-32*3]
4399    ret
4400
; Expand INV_TXFM_16X8_FN once per (identity, type2) pair. No third argument
; is passed, so the macro's default for it applies.
4401INV_TXFM_16X8_FN identity, dct
4402INV_TXFM_16X8_FN identity, adst
4403INV_TXFM_16X8_FN identity, flipadst
4404INV_TXFM_16X8_FN identity, identity
4405
; 16x8 identity transform, 10 bpc variant. Pass 1 applies two multiply /
; round / shift steps to all 16 coefficient vectors and then jumps through
; tx2q; .pass2 transposes and reuses the DCT output code. .pass1 is also the
; jump target of the 12 bpc variant.
4406cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4407.pass1:
    ; Step 1: x * pd_2896. m15 holds the constant and is multiplied last.
4408    vpbroadcastd        m15, [pd_2896]
4409    pmulld               m0, m15, [cq+32* 0]
4410    pmulld               m1, m15, [cq+32* 1]
4411    pmulld               m2, m15, [cq+32* 2]
4412    pmulld               m3, m15, [cq+32* 3]
4413    pmulld               m4, m15, [cq+32* 4]
4414    pmulld               m5, m15, [cq+32* 5]
4415    pmulld               m6, m15, [cq+32* 6]
4416    pmulld               m7, m15, [cq+32* 7]
4417    pmulld               m8, m15, [cq+32* 8]
4418    pmulld               m9, m15, [cq+32* 9]
4419    pmulld              m10, m15, [cq+32*10]
4420    pmulld              m11, m15, [cq+32*11]
4421    pmulld              m12, m15, [cq+32*12]
4422    pmulld              m13, m15, [cq+32*13]
4423    pmulld              m14, m15, [cq+32*14]
4424    pmulld              m15,      [cq+32*15]
    ; All 16 registers hold data, so one value is spilled to [rsp] whenever
    ; a register is needed for a constant. Here m7 makes room for pd_2048.
4425    mova              [rsp], m7
4426    vpbroadcastd         m7, [pd_2048]
4427    REPX    {paddd  x, m7 }, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
4428                             m8,  m9,  m10, m11, m12, m13, m14, m15
    ; m7 = pd_2048 + spilled value, i.e. the rounded m7
4429    paddd                m7, [rsp]
4430    REPX    {psrad  x, 12 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4431                             m8,  m9,  m10, m11, m12, m13, m14, m15
    ; Step 2: x * pd_5793, with m15 spilled while it carries the constant.
4432    mova              [rsp], m15
4433    vpbroadcastd        m15, [pd_5793]
4434    REPX    {pmulld x, m15}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4435                             m8,  m9,  m10, m11, m12, m13, m14
4436    pmulld              m15, [rsp]
    ; Round with pd_3072 and shift by 12, spilling m7 as above.
4437    mova              [rsp], m7
4438    vpbroadcastd         m7, [pd_3072]
4439    REPX    {paddd  x, m7 }, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
4440                             m8,  m9,  m10, m11, m12, m13, m14, m15
4441    paddd                m7, [rsp]
4442    REPX    {psrad  x, 12 }, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4443                             m8,  m9,  m10, m11, m12, m13, m14, m15
4444    jmp                tx2q
4445.pass2:
4446    call m(idct_16x8_internal_10bpc).transpose
    ; Same output path as the DCT, with m10 = pw_4096 instead of pw_2048.
4447    vpbroadcastd        m10, [pw_4096]
4448    jmp m(idct_16x8_internal_10bpc).end
4449
; Expand INV_TXFM_16X8_FN once per (dct, type2) pair, passing 12 as the third
; macro argument.
4450INV_TXFM_16X8_FN dct, dct,      12
4451INV_TXFM_16X8_FN dct, identity, 12
4452INV_TXFM_16X8_FN dct, adst,     12
4453INV_TXFM_16X8_FN dct, flipadst, 12
4454
; 16x8 inverse DCT, 12 bpc variant. Pass 1 is the 10 bpc code, entered with
; different values in m12/m13. Pass 2 runs the 8x8 DCT core twice on dword
; data and writes the result with the 12 bpc pixel maximum. .end and
; .write_16x4_start are also called from other 12 bpc functions in this file.
4455cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4456    vpbroadcastd        m12, [clip_20b_min]
4457    vpbroadcastd        m13, [clip_20b_max]
4458    jmp m(idct_16x8_internal_10bpc).pass1
4459.pass2:
4460    call .pass2_main
4461    RET
4462ALIGN function_align
4463.pass2_main:
4464    call m(idct_8x16_internal_12bpc).transpose
    ; Pass 2 uses the 18b clip constants, not the 20b ones loaded for pass 1.
4465    vpbroadcastd        m12, [clip_18b_min]
4466    vpbroadcastd        m13, [clip_18b_max]
4467    vpbroadcastd        m11, [pd_2048]
    ; First batch: clamp m0-m7, transform, and park the results in
    ; cq+32*8 .. cq+32*15.
4468    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4469    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4470    call m(idct_8x8_internal_10bpc).main
4471    call m(idct_8x8_internal_12bpc).round_shift4
4472    mova         [cq+32* 8], m0
4473    mova         [cq+32* 9], m1
4474    mova         [cq+32*10], m2
4475    mova         [cq+32*11], m3
4476    mova         [cq+32*12], m4
4477    mova         [cq+32*13], m5
4478    mova         [cq+32*14], m6
4479    mova         [cq+32*15], m7
    ; Second batch: loaded from cq+32*0 .. cq+32*7 (presumably placed there
    ; by the transpose helper - confirm), with the lower clamp folded into
    ; the load.
4480    pmaxsd               m0, m12, [cq+32*0]
4481    pmaxsd               m1, m12, [cq+32*1]
4482    pmaxsd               m2, m12, [cq+32*2]
4483    pmaxsd               m3, m12, [cq+32*3]
4484    pmaxsd               m4, m12, [cq+32*4]
4485    pmaxsd               m5, m12, [cq+32*5]
4486    pmaxsd               m6, m12, [cq+32*6]
4487    pmaxsd               m7, m12, [cq+32*7]
4488    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4489    call m(idct_8x8_internal_10bpc).main
4490    call m(idct_8x8_internal_12bpc).round_shift4
    ; .end: pack the second batch (in registers) with the first batch (in
    ; cq) to saturated words, reorder the qwords, and write eight rows.
    ; Also called by the 12 bpc ADST variant.
4491.end:
4492    packssdw             m0, [cq+32* 8]
4493    packssdw             m1, [cq+32* 9]
4494    packssdw             m2, [cq+32*10]
4495    packssdw             m3, [cq+32*11]
4496    packssdw             m4, [cq+32*12]
4497    packssdw             m5, [cq+32*13]
4498    packssdw             m6, [cq+32*14]
4499    packssdw             m7, [cq+32*15]
4500    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
    ; The local .write_16x4_start only sets up m8/m9/r3 and returns, so the
    ; shared zero+write routine is called explicitly afterwards.
4501    call .write_16x4_start
4502    call m(idct_16x8_internal_10bpc).write_16x4_zero
4503    vpermq               m0, m4, q3120
4504    vpermq               m1, m5, q3120
4505    vpermq               m2, m6, q3120
4506    vpermq               m3, m7, q3120
    ; Tail jump: the shared routine's ret returns to our caller.
4507    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
4508ALIGN function_align
    ; 12 bpc counterpart of the 10 bpc .write_16x4_start: m9 = pixel max,
    ; r3 = stride*3, m8 = 0. Unlike the 10 bpc one it returns immediately
    ; instead of falling through into the write code.
4509.write_16x4_start:
4510    vpbroadcastd         m9, [pixel_12bpc_max]
4511    lea                  r3, [strideq*3]
4512    pxor                 m8, m8
4513    ret
4514
; Expand INV_TXFM_16X8_FN once per (adst, type2) pair, passing 12 as the
; third macro argument.
4515INV_TXFM_16X8_FN adst, dct,      12
4516INV_TXFM_16X8_FN adst, adst,     12
4517INV_TXFM_16X8_FN adst, flipadst, 12
4518INV_TXFM_16X8_FN adst, identity, 12
4519
; 16x8 inverse ADST, 12 bpc variant. Pass 1 is the 10 bpc code, entered with
; different values in m13/m14. Pass 2 runs the 8x8 ADST helper twice and
; reuses the 12 bpc DCT output code. .pass2_main is also called by the
; flipadst variant.
4520cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4521    vpbroadcastd        m13, [clip_20b_min]
4522    vpbroadcastd        m14, [clip_20b_max]
4523    jmp m(iadst_16x8_internal_10bpc).pass1
4524.pass2:
4525    call .pass2_main
4526    call m(idct_16x8_internal_12bpc).end
4527    RET
4528ALIGN function_align
    ; Out: second batch in m0-m7, first batch in cq+32*8 .. cq+32*15, which
    ; is the layout idct_16x8_internal_12bpc.end consumes.
4529.pass2_main:
4530    call m(idct_8x16_internal_12bpc).transpose
    ; Pass 2 uses the 18b clip constants, not the 20b ones loaded for pass 1.
4531    vpbroadcastd        m12, [clip_18b_min]
4532    vpbroadcastd        m13, [clip_18b_max]
4533    vpbroadcastd        m11, [pd_2048]
    ; First batch: clamp m0-m7, transform, and park in cq+32*8 .. cq+32*15.
4534    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4535    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4536    call m(iadst_8x8_internal_12bpc).pass2_main2
4537    mova         [cq+32* 8], m0
4538    mova         [cq+32* 9], m1
4539    mova         [cq+32*10], m2
4540    mova         [cq+32*11], m3
4541    mova         [cq+32*12], m4
4542    mova         [cq+32*13], m5
4543    mova         [cq+32*14], m6
4544    mova         [cq+32*15], m7
    ; Second batch: loaded from cq+32*0 .. cq+32*7 with the lower clamp
    ; folded into the load.
4545    pmaxsd               m0, m12, [cq+32*0]
4546    pmaxsd               m1, m12, [cq+32*1]
4547    pmaxsd               m2, m12, [cq+32*2]
4548    pmaxsd               m3, m12, [cq+32*3]
4549    pmaxsd               m4, m12, [cq+32*4]
4550    pmaxsd               m5, m12, [cq+32*5]
4551    pmaxsd               m6, m12, [cq+32*6]
4552    pmaxsd               m7, m12, [cq+32*7]
4553    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4554    call m(iadst_8x8_internal_12bpc).pass2_main2
4555    ret
4556
; Expand INV_TXFM_16X8_FN once per (flipadst, type2) pair, passing 12 as the
; third macro argument.
4557INV_TXFM_16X8_FN flipadst, dct,      12
4558INV_TXFM_16X8_FN flipadst, adst,     12
4559INV_TXFM_16X8_FN flipadst, flipadst, 12
4560INV_TXFM_16X8_FN flipadst, identity, 12
4561
; 16x8 inverse flipped ADST, 12 bpc variant. Pass 1 is the 10 bpc flipadst
; code, entered with different values in m13/m14. Pass 2 shares the ADST
; .pass2_main and then writes the eight rows in reverse order.
4562cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4563    vpbroadcastd        m13, [clip_20b_min]
4564    vpbroadcastd        m14, [clip_20b_max]
4565    jmp m(iflipadst_16x8_internal_10bpc).pass1
4566.pass2:
4567    call m(iadst_16x8_internal_12bpc).pass2_main
    ; Pack to saturated words while reversing the order: source rows 7..4 go
    ; to m0..m3 for the first write, source rows 3..0 to m10..m13 for the
    ; second.
4568    packssdw            m13, m0, [cq+32* 8]
4569    packssdw            m12, m1, [cq+32* 9]
4570    packssdw            m11, m2, [cq+32*10]
4571    packssdw            m10, m3, [cq+32*11]
4572    packssdw             m3, m4, [cq+32*12]
4573    packssdw             m2, m5, [cq+32*13]
4574    packssdw             m1, m6, [cq+32*14]
4575    packssdw             m0, m7, [cq+32*15]
4576    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
    ; The 12 bpc .write_16x4_start only sets m8/m9/r3; the shared 10 bpc
    ; routine then clears coefficients and writes four rows.
4577    call m(idct_16x8_internal_12bpc).write_16x4_start
4578    call m(idct_16x8_internal_10bpc).write_16x4_zero
4579    vpermq               m0, m10, q3120
4580    vpermq               m1, m11, q3120
4581    vpermq               m2, m12, q3120
4582    vpermq               m3, m13, q3120
4583    call m(idct_16x8_internal_10bpc).write_16x4_zero
4584    RET
4585
; Expand INV_TXFM_16X8_FN once per (identity, type2) pair, passing 12 as the
; third macro argument.
4586INV_TXFM_16X8_FN identity, dct,      12
4587INV_TXFM_16X8_FN identity, adst,     12
4588INV_TXFM_16X8_FN identity, flipadst, 12
4589INV_TXFM_16X8_FN identity, identity, 12
4590
; 16x8 identity transform, 12 bpc variant. Pass 1 is the 10 bpc code
; unchanged. Pass 2 differs from the 10 bpc one only in entering the
; transpose at .transpose2 and in using the 12 bpc pixel maximum.
4591cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
4592    jmp m(iidentity_16x8_internal_10bpc).pass1
4593.pass2:
    ; Entering at .transpose2 skips the "lea r6" done by .transpose.
4594    call m(idct_16x8_internal_10bpc).transpose2
4595    vpbroadcastd        m10, [pw_4096]
4596    pmulhrsw             m0, m10
4597    pmulhrsw             m1, m10
4598    pmulhrsw             m2, m10
4599    pmulhrsw             m3, m10
    ; Set m8/m9/r3 with the 12 bpc maximum, then write the first four rows.
4600    call m(idct_16x8_internal_12bpc).write_16x4_start
4601    call m(idct_16x8_internal_10bpc).write_16x4_zero
    ; .end2 scales m4-m7 by m10, writes the remaining rows and executes RET.
4602    jmp m(idct_16x8_internal_10bpc).end2
4603
; Wrapper around INV_TXFM_FN for the 16x16 size. Arguments 3 and 4 are
; optional and default to 0 and 10. For the dct_dct combination only, extra
; code is emitted after the INV_TXFM_FN expansion: it scales the first
; coefficient, stores eobd over it, and jumps to the shared .dconly3 code of
; the 16x4 function.
4604%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
4605    INV_TXFM_FN          %1, %2, %3, 16x16, %4
4606%ifidn %1_%2, dct_dct
    ; r6d = ([cq] * 181 + 640) >> 10, spread over the next instructions
4607    imul                r6d, [cq], 181
4608    vpbroadcastd         m3, [dconly_%4bpc]
4609    mov                [cq], eobd ; 0
    ; NOTE(review): presumably r3d carries the row count for the dconly
    ; loop - confirm against .dconly3.
4610    or                  r3d, 16
4611    add                 r6d, 640
4612    sar                 r6d, 10
4613    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
4614%endif
4615%endmacro
4616
; Expand INV_TXFM_16X16_FN once per (dct, type2) pair. Only the identity
; pairing passes an eob_offset (28); the others use the macro default of 0.
; All use the default bitdepth argument of 10.
4617INV_TXFM_16X16_FN dct, dct
4618INV_TXFM_16X16_FN dct, identity, 28
4619INV_TXFM_16X16_FN dct, adst
4620INV_TXFM_16X16_FN dct, flipadst
4621
4622cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
4623    vpbroadcastd        m12, [clip_18b_min]
4624    vpbroadcastd        m13, [clip_18b_max]
4625.pass1:
4626    vpbroadcastd        m11, [pd_2048]
4627    vpbroadcastd        m14, [pd_2896]
4628    lea                  r6, [rsp+32*4]
4629    sub                eobd, 36
4630    jl .fast
4631    add                  cq, 32
4632    call .main
4633    sub                  cq, 32
4634    mova                m10, [r6-32*4]
4635    mova                 m9, [r6-32*3]
4636    mova                 m8, [r6-32*2]
4637    psubd               m15, m0, m10 ; out15
4638    paddd                m0, m10     ; out0
4639    psubd               m10, m1, m9  ; out14
4640    paddd                m1, m9      ; out1
4641    psubd                m9, m2, m8  ; out13
4642    paddd                m2, m8      ; out2
4643    REPX       {psrad x, 2}, m0, m1, m2
4644    mova          [r6-32*4], m0
4645    mova          [r6-32*3], m1
4646    mova          [r6-32*2], m2
4647    mova                 m2, [r6-32*1]
4648    mova                 m1, [r6+32*0]
4649    mova                 m0, [r6+32*1]
4650    REPX       {psrad x, 2}, m9, m10, m15
4651    psubd                m8, m3, m2 ; out12
4652    paddd                m3, m2     ; out3
4653    psubd                m2, m4, m1 ; out11
4654    paddd                m4, m1     ; out4
4655    psubd                m1, m5, m0 ; out10
4656    paddd                m5, m0     ; out5
4657    REPX       {psrad x, 2}, m3, m4, m5
4658    mova          [r6-32*1], m3
4659    mova          [r6+32*0], m4
4660    mova          [r6+32*1], m5
4661    mova                 m4, [r6+32*2]
4662    mova                 m3, [r6+32*3]
4663    REPX       {psrad x, 2}, m1, m2, m8
4664    psubd                m5, m6, m4 ; out9
4665    paddd                m6, m4     ; out6
4666    psubd                m4, m7, m3 ; out8
4667    paddd                m7, m3     ; out7
4668    REPX       {psrad x, 2}, m6, m7, m4, m5
4669    mova          [r6+32*2], m6
4670    mova          [r6+32*3], m7
4671    add                  r6, 32*8
4672    mova          [r6-32*4], m4
4673    mova          [r6-32*3], m5
4674    mova          [r6-32*2], m1
4675    mova          [r6-32*1], m2
4676    mova          [r6+32*0], m8
4677    mova          [r6+32*1], m9
4678    mova          [r6+32*2], m10
4679    mova          [r6+32*3], m15
4680.fast:
4681    add                  r6, 32*8
4682    call .main
4683    mova                m14, [r6-32*4]
4684    mova                m13, [r6-32*3]
4685    mova                m12, [r6-32*2]
4686    mova                m11, [r6-32*1]
4687    mova                m10, [r6+32*0]
4688    mova                 m9, [r6+32*1]
4689    mova                 m8, [r6+32*2]
4690    psubd               m15, m0, m14       ; out15
4691    paddd                m0, m14           ; out0
4692    psubd               m14, m1, m13       ; out14
4693    paddd                m1, m13           ; out1
4694    psubd               m13, m2, m12       ; out13
4695    paddd                m2, m12           ; out2
4696    psubd               m12, m3, m11       ; out12
4697    paddd                m3, m11           ; out3
4698    psubd               m11, m4, m10       ; out11
4699    paddd                m4, m10           ; out4
4700    psubd               m10, m5, m9        ; out10
4701    paddd                m5, m9            ; out5
4702    psubd                m9, m6, m8        ; out9
4703    paddd                m6, m8            ; out6
4704    psubd                m8, m7, [r6+32*3] ; out8
4705    paddd                m7, [r6+32*3]     ; out7
4706    sub                  r6, 32*8
4707    REPX       {psrad x, 2}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4708                             m8,  m9,  m10, m11, m12, m13, m14, m15
4709    jmp                tx2q
4710.pass2:
4711    call .transpose
4712    lea                  r6, [pw_5+128]
4713    mova              [rsp], m15
4714    call m(idct_16x16_internal_8bpc).main
4715    mova                 m1, [rsp+32*1]
4716.end:
4717    call .write_16x16
4718    RET
4719ALIGN function_align
4720.write_16x16:
4721    mova [rsp+gprsize+32*0], m8
4722    mova [rsp+gprsize+32*1], m9
4723    mova [rsp+gprsize+32*2], m12
4724    vpbroadcastd        m12, [pw_2048]
4725    pmulhrsw             m0, m12
4726    pmulhrsw             m1, m12
4727    pmulhrsw             m2, m12
4728    pmulhrsw             m3, m12
4729    call m(idct_16x8_internal_10bpc).write_16x4_start
4730.write_16x16_2:
4731    pmulhrsw             m0, m12, m4
4732    pmulhrsw             m1, m12, m5
4733    pmulhrsw             m2, m12, m6
4734    pmulhrsw             m3, m12, m7
4735    call m(idct_16x8_internal_10bpc).write_16x4_zero
4736    pmulhrsw             m0, m12, [rsp+gprsize+32*0]
4737    pmulhrsw             m1, m12, [rsp+gprsize+32*1]
4738    pmulhrsw             m2, m12, m10
4739    pmulhrsw             m3, m12, m11
4740    call m(idct_16x8_internal_10bpc).write_16x4_zero
4741    pmulhrsw             m0, m12, [rsp+gprsize+32*2]
4742    pmulhrsw             m1, m12, m13
4743    pmulhrsw             m2, m12, m14
4744    pmulhrsw             m3, m12, m15
4745    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
; .transpose: packs the dword pass-1 results to words and transposes them
; for pass 2.
;
; In:  m0-m15 = dword vectors; r6 = pointer into the scratch area such that
;      [r6-32*4 .. r6+32*3] and the 32*8 bytes below it hold sixteen more
;      dword vectors; eobd = adjusted end-of-block value (negative selects
;      the fast path).
; Out: m0-m15 = word vectors after the interleave; r6 is left decremented
;      by 32*8 on the slow path; [r6] is used as a one-vector scratch slot.
4746ALIGN function_align
4747.transpose:
4748    test               eobd, eobd
4749    jl .transpose_fast
    ; Pack each register with its memory counterpart (signed saturation to
    ; words). m8-m15 pair with the upper eight stored vectors, m0-m7 with the
    ; lower eight.
4750    packssdw             m8, [r6-32*4]
4751    packssdw             m9, [r6-32*3]
4752    packssdw            m10, [r6-32*2]
4753    packssdw            m11, [r6-32*1]
4754    packssdw            m12, [r6+32*0]
4755    packssdw            m13, [r6+32*1]
4756    packssdw            m14, [r6+32*2]
4757    packssdw            m15, [r6+32*3]
4758    sub                  r6, 32*8
4759    packssdw             m0, [r6-32*4]
4760    packssdw             m1, [r6-32*3]
4761    packssdw             m2, [r6-32*2]
4762    packssdw             m3, [r6-32*1]
4763    packssdw             m4, [r6+32*0]
4764    packssdw             m5, [r6+32*1]
4765    packssdw             m6, [r6+32*2]
4766    packssdw             m7, [r6+32*3]
    ; All 16 registers are live, so m8 is spilled to free a temporary for the
    ; first 8-register interleave (word -> dword -> qword unpacks on m0-m7).
4767    mova               [r6], m8
4768    punpckhwd            m8, m0, m1
4769    punpcklwd            m0, m1
4770    punpcklwd            m1, m2, m3
4771    punpckhwd            m2, m3
4772    punpckhwd            m3, m6, m7
4773    punpcklwd            m6, m7
4774    punpcklwd            m7, m4, m5
4775    punpckhwd            m4, m5
4776    punpckldq            m5, m8, m2
4777    punpckhdq            m8, m2
4778    punpckhdq            m2, m0, m1
4779    punpckldq            m0, m1
4780    punpckhdq            m1, m7, m6
4781    punpckldq            m7, m6
4782    punpckhdq            m6, m4, m3
4783    punpckldq            m4, m3
4784    punpckhqdq           m3, m2, m1
4785    punpcklqdq           m2, m1
4786    punpckhqdq           m1, m0, m7
4787    punpcklqdq           m0, m7
4788    punpcklqdq           m7, m8, m6
4789    punpckhqdq           m8, m6
4790    punpckhqdq           m6, m5, m4
4791    punpcklqdq           m5, m4
    ; Swap roles of the scratch slot: reload the spilled m8 into m4 and park
    ; the finished m8 result there while m8-m15 go through the same
    ; word/dword/qword interleave.
4792    mova                 m4, [r6]
4793    mova               [r6], m8
4794    punpcklwd            m8, m4, m9
4795    punpckhwd            m4, m9
4796    punpcklwd            m9, m10, m11
4797    punpckhwd           m10, m11
4798    punpckhwd           m11, m14, m15
4799    punpcklwd           m14, m15
4800    punpckhwd           m15, m12, m13
4801    punpcklwd           m12, m13
4802    punpckldq           m13, m4, m10
4803    punpckhdq            m4, m10
4804    punpckhdq           m10, m8, m9
4805    punpckldq            m8, m9
4806    punpckhdq            m9, m12, m14
4807    punpckldq           m12, m14
4808    punpckhdq           m14, m15, m11
4809    punpckldq           m15, m11
4810    punpckhqdq          m11, m10, m9
4811    punpcklqdq          m10, m9
4812    punpckhqdq           m9, m8, m12
4813    punpcklqdq           m8, m12
4814    punpcklqdq          m12, m13, m15
4815    punpckhqdq          m13, m15
4816    punpckhqdq          m15, m4, m14
4817    punpcklqdq          m14, m4, m14
    ; Final step: exchange 128-bit lanes between the two interleaved halves.
    ; 0x31 gathers the high lanes of both sources, vinserti128 ..., 1 gathers
    ; the low lanes, and 0x13 gathers the high lanes in swapped source order.
4818    vperm2i128           m4, m0, m8, 0x31
4819    vinserti128          m0, xm8, 1
4820    vinserti128          m8, m5, xm12, 1
4821    vperm2i128          m12, m5, 0x13
4822    vperm2i128           m5, m1, m9, 0x31
4823    vinserti128          m1, xm9, 1
4824    vinserti128          m9, m6, xm13, 1
4825    vperm2i128          m13, m6, 0x13
4826    vperm2i128           m6, m2, m10, 0x31
4827    vinserti128          m2, xm10, 1
4828    vinserti128         m10, m7, xm14, 1
4829    vperm2i128          m14, m7, 0x13
4830    vperm2i128           m7, m3, m11, 0x31
4831    vinserti128          m3, xm11, 1
    ; The vector parked at [r6] supplies the remaining lanes: its low half
    ; becomes the low lane of m11, its high half the low lane of m15.
4832    mova               xm11, [r6]
4833    vinserti128         m11, xm15, 1
4834    vinserti128         m15, [r6+16], 0
4835    ret
; .transpose_fast: taken by .transpose when eobd is negative, and also called
; directly from elsewhere in this file. Delegates the transpose of m0-m7 to
; idct_16x8_internal_10bpc.transpose2 (outside this excerpt) and returns
; m8-m15 cleared to zero.
4836.transpose_fast:
4837    call m(idct_16x8_internal_10bpc).transpose2
4838    pxor                 m8, m8
4839    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
4840    ret
; .main: one 16-point inverse DCT pass over eight columns of coefficients.
;
; In:  cq = coefficient pointer; rows are read at a 64-byte stride.
; The odd-numbered rows (1, 3, ..., 15) are loaded first and given to
; idct_8x16_internal_10bpc.main_oddhalf, then the even-numbered rows go
; through idct_8x8_internal_10bpc.main and
; idct_8x16_internal_10bpc.main_evenhalf (all three helpers are defined
; outside this excerpt, as are their register contracts).
; Out: m0-m7 have a rounding term added; m10 holds that term.
4841ALIGN function_align
4842.main:
4843    mova                 m0, [cq+64* 1]
4844    mova                 m1, [cq+64* 3]
4845    mova                 m2, [cq+64* 5]
4846    mova                 m3, [cq+64* 7]
4847    mova                 m4, [cq+64* 9]
4848    mova                 m5, [cq+64*11]
4849    mova                 m6, [cq+64*13]
4850    mova                 m7, [cq+64*15]
4851    call m(idct_8x16_internal_10bpc).main_oddhalf
4852    mova                 m0, [cq+64* 0]
4853    mova                 m1, [cq+64* 2]
4854    mova                 m2, [cq+64* 4]
4855    mova                 m3, [cq+64* 6]
4856    mova                 m4, [cq+64* 8]
4857    mova                 m5, [cq+64*10]
4858    mova                 m6, [cq+64*12]
4859    mova                 m7, [cq+64*14]
4860    call m(idct_8x8_internal_10bpc).main
4861    call m(idct_8x16_internal_10bpc).main_evenhalf
    ; Derive the rounding constant by shifting m11 instead of loading it.
    ; NOTE(review): the "pd_2" result implies m11 still holds pd_2048 here -
    ; confirm against the helpers above.
4862    psrld               m10, m11, 10 ; pd_2
4863    REPX    {paddd  x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
4864    ret
4865
; 16x16 inverse ADST, 10 bpc: entry point and first pass.
;
; INV_TXFM_16X16_FN is defined outside this excerpt; presumably each
; invocation emits the public entry for one (first, second) transform pair
; that dispatches into the internal function below - confirm at the macro.
;
; Register roles in pass 1 (as used by the visible code):
;   m13/m14 = clip_18b_min/max on entry (the 12 bpc variant loads 20-bit
;             bounds instead and jumps to .pass1)
;   m15     = pd_2896; later shifted right by 10 to obtain the pd_2 rounder
;   r6      = scratch pointer, starts at rsp+32*4
;   eobd    = eob - 36; a negative result skips the second-half work
;   tx2q    = address of the second-pass code, jumped to at the end
4866INV_TXFM_16X16_FN adst, dct
4867INV_TXFM_16X16_FN adst, adst
4868INV_TXFM_16X16_FN adst, flipadst
4869
4870cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
4871    vpbroadcastd        m13, [clip_18b_min]
4872    vpbroadcastd        m14, [clip_18b_max]
4873.pass1:
4874    vpbroadcastd        m15, [pd_2896]
4875    lea                  r6, [rsp+32*4]
4876    sub                eobd, 36
4877    jl .fast
    ; Slow path: first transform the coefficients 32 bytes into each 64-byte
    ; row, and save the finished results in the scratch area.
4878    add                  cq, 32
4879    call .main
4880    sub                  cq, 32
    ; m4-m7 and m9-m12 are rounded with pd_5120 / pd_5119 and shifted by 13;
    ; the psubd form computes (constant - x), i.e. it also negates.
4881    vpbroadcastd         m8, [pd_5120]
4882    paddd                m4, m8
4883    paddd                m6, m8
4884    paddd                m9, m8
4885    paddd               m11, m8
4886    vpbroadcastd         m8, [pd_5119]
4887    psubd                m5, m8, m5
4888    psubd                m7, m8, m7
4889    psubd               m10, m8, m10
4890    psubd               m12, m8, m12
4891    REPX      {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12
4892    mova          [r6+32*0], m4
4893    mova          [r6+32*1], m5
4894    mova          [r6+32*2], m6
4895    mova          [r6+32*3], m7
    ; The remaining eight values use the pd_2 rounder and a shift by 2; four
    ; of them are read back from [r6-32*4 .. r6-32*1] in reverse slot order.
4896    psrld                m4, m15, 10 ; pd_2
4897    paddd                m0, m4
4898    psubd                m1, m4, m1
4899    paddd                m2, m4
4900    psubd                m3, m4, m3
4901    psubd                m7, m4, [r6-32*4]
4902    paddd                m6, m4, [r6-32*3]
4903    psubd                m5, m4, [r6-32*2]
4904    paddd                m4,     [r6-32*1]
4905    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
4906    mova          [r6-32*4], m0
4907    mova          [r6-32*3], m1
4908    mova          [r6-32*2], m2
4909    mova          [r6-32*1], m3
4910    add                  r6, 32*8
4911    mova          [r6-32*4], m9
4912    mova          [r6-32*3], m10
4913    mova          [r6-32*2], m11
4914    mova          [r6-32*1], m12
4915    mova          [r6+32*0], m4
4916    mova          [r6+32*1], m5
4917    mova          [r6+32*2], m6
4918    mova          [r6+32*3], m7
    ; Common path: transform the coefficients at the start of each row and
    ; leave the results in m0-m15 for the second pass.
4919.fast:
4920    add                  r6, 32*8
4921    call .main
    ; m13/m14/m15 are reused as rounding constants from here on.
4922    vpbroadcastd        m14, [pd_5120]
4923    vpbroadcastd        m13, [pd_5119]
4924    psrld               m15, 10 ; pd_2
4925    paddd                m0, m15
4926    psubd                m1, m15, m1
4927    paddd                m2, m15
4928    psubd                m3, m15, m3
4929    paddd                m4, m14
4930    psubd                m5, m13, m5
4931    paddd                m6, m14
4932    psubd                m7, m13, m7
    ; The values held in m9-m12 move down one register (to m8-m11) while being
    ; rounded, which frees m12-m15 for the four values reloaded from memory.
4933    paddd                m8, m14, m9
4934    psubd                m9, m13, m10
4935    paddd               m10, m14, m11
4936    psubd               m11, m13, m12
4937    paddd               m12, m15, [r6-32*1]
4938    psubd               m13, m15, [r6-32*2]
4939    paddd               m14, m15, [r6-32*3]
4940    psubd               m15,      [r6-32*4]
    ; .pass1_end is also the join point for iflipadst_16x16_internal_10bpc.
4941.pass1_end:
4942    REPX      {psrad x, 2 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
4943    REPX      {psrad x, 13}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
4944    sub                  r6, 32*8
4945    jmp                tx2q
; .pass2: second (word-precision) ADST pass and pixel store for 10 bpc.
; After the shared transpose, the arithmetic is delegated to the 8 bpc
; implementation (iadst_16x16_internal_8bpc, defined in another file).
; NOTE(review): r6 = pw_5+128 and the m15 spill to [rsp] presumably match
; the calling convention of the 8 bpc .main - confirm there.
4946.pass2:
4947    call m(idct_16x16_internal_10bpc).transpose
4948    lea                  r6, [pw_5+128]
4949    mova              [rsp], m15
4950    call m(iadst_16x16_internal_8bpc).main
4951    call m(iadst_16x16_internal_8bpc).main_pass2_end
    ; m8, m12 and m13 are spilled so that m12/m13 can hold the multipliers
    ; +pw_2048 and -pw_2048 (m13 = 0 - m12).
4952    mova         [rsp+32*0], m8
4953    mova         [rsp+32*2], m12
4954    mova         [rsp+32*3], m13
4955    vpbroadcastd        m12, [pw_2048]
4956    pxor                m13, m13
4957    psubw               m13, m12
    ; Even-numbered rows are scaled by m12, odd-numbered rows by m13, so every
    ; second row is negated while being rounded.
4958    pmulhrsw             m0, m12
    ; [rsp+32*1] is read before it is overwritten with m9, so the helper
    ; calls above apparently leave this group's m1 input in that slot.
4959    pmulhrsw             m1, m13, [rsp+32*1]
4960    mova         [rsp+32*1], m9
4961    pmulhrsw             m2, m12
4962    pmulhrsw             m3, m13
4963    call m(idct_16x8_internal_10bpc).write_16x4_start
4964    pmulhrsw             m0, m12, m4
4965    pmulhrsw             m1, m13, m5
4966    pmulhrsw             m2, m12, m6
4967    pmulhrsw             m3, m13, m7
4968    call m(idct_16x8_internal_10bpc).write_16x4_zero
4969    pmulhrsw             m0, m12, [rsp+32*0]
4970    pmulhrsw             m1, m13, [rsp+32*1]
4971    pmulhrsw             m2, m12, m10
4972    pmulhrsw             m3, m13, m11
4973    call m(idct_16x8_internal_10bpc).write_16x4_zero
4974    pmulhrsw             m0, m12, [rsp+32*2]
4975    pmulhrsw             m1, m13, [rsp+32*3]
4976    pmulhrsw             m2, m12, m14
4977    pmulhrsw             m3, m13, m15
4978    call m(idct_16x8_internal_10bpc).write_16x4_zero
4979    RET
; .main: loads the sixteen 64-byte-strided coefficient rows in the order the
; 16-point ADST helpers expect and runs them in two halves.
;
; In:  cq = coefficient pointer; m12 is set to pd_2048 here before the first
;      helper call. Other inputs (clip bounds, pd_2896, r6) are whatever
;      iadst_16x8_internal_10bpc.main_part1/.main_part2 require; those
;      helpers are defined outside this excerpt.
; The routine ends with a tail jump, so main_part2 returns to our caller.
4980ALIGN function_align
4981.main:
4982    mova                 m0, [cq+64* 2]
4983    mova                 m1, [cq+64*13]
4984    mova                 m2, [cq+64* 6]
4985    mova                 m3, [cq+64* 9]
4986    mova                 m4, [cq+64*10]
4987    mova                 m5, [cq+64* 5]
4988    mova                 m6, [cq+64*14]
4989    mova                 m7, [cq+64* 1]
4990    vpbroadcastd        m12, [pd_2048]
4991    call m(iadst_16x8_internal_10bpc).main_part1
4992    mova                 m0, [cq+64* 0]
4993    mova                 m1, [cq+64*15]
4994    mova                 m2, [cq+64* 4]
4995    mova                 m3, [cq+64*11]
4996    mova                 m4, [cq+64* 8]
4997    mova                 m5, [cq+64* 7]
4998    mova                 m6, [cq+64*12]
4999    mova                 m7, [cq+64* 3]
5000    jmp m(iadst_16x8_internal_10bpc).main_part2
5001
; 16x16 inverse flipped ADST, 10 bpc: entry point and first pass.
;
; Same computation as iadst_16x16_internal_10bpc pass 1 (it calls that
; function's .main and joins its .pass1_end), but the sixteen results are
; placed in reverse order. Rounding constants and sign handling per value
; are unchanged; only the destination register / slot differs.
; INV_TXFM_16X16_FN is defined outside this excerpt.
5002INV_TXFM_16X16_FN flipadst, dct
5003INV_TXFM_16X16_FN flipadst, adst
5004INV_TXFM_16X16_FN flipadst, flipadst
5005
5006cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5007    vpbroadcastd        m13, [clip_18b_min]
5008    vpbroadcastd        m14, [clip_18b_max]
5009.pass1:
5010    vpbroadcastd        m15, [pd_2896]
5011    lea                  r6, [rsp+32*4]
5012    sub                eobd, 36
5013    jl .fast
    ; Slow path: transform the coefficients 32 bytes into each row first.
5014    add                  cq, 32
5015    call m(iadst_16x16_internal_10bpc).main
5016    sub                  cq, 32
5017    vpbroadcastd         m8, [pd_5120]
5018    paddd               m11, m8
5019    paddd                m9, m8
5020    paddd                m6, m8
5021    paddd                m4, m8
5022    vpbroadcastd         m8, [pd_5119]
5023    psubd               m12, m8, m12
5024    psubd               m10, m8, m10
5025    psubd                m7, m8, m7
5026    psubd                m5, m8, m5
5027    REPX      {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4
    ; Stores run m12 -> m9 (descending) where the non-flipped version stores
    ; m4 -> m7; this is the reversal.
5028    mova          [r6+32*0], m12
5029    mova          [r6+32*1], m11
5030    mova          [r6+32*2], m10
5031    mova          [r6+32*3], m9
5032    psrld                m9, m15, 10 ; pd_2
5033    psubd                m3, m9, m3
5034    paddd                m2, m9
5035    psubd                m1, m9, m1
5036    paddd                m0, m9
5037    psubd               m12, m9, [r6-32*4]
5038    paddd               m11, m9, [r6-32*3]
5039    psubd               m10, m9, [r6-32*2]
5040    paddd                m9,     [r6-32*1]
5041    REPX      {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
5042    mova          [r6-32*4], m12
5043    mova          [r6-32*3], m11
5044    mova          [r6-32*2], m10
5045    mova          [r6-32*1], m9
5046    add                  r6, 32*8
5047    mova          [r6-32*4], m7
5048    mova          [r6-32*3], m6
5049    mova          [r6-32*2], m5
5050    mova          [r6-32*1], m4
5051    mova          [r6+32*0], m3
5052    mova          [r6+32*1], m2
5053    mova          [r6+32*2], m1
5054    mova          [r6+32*3], m0
5055.fast:
5056    add                  r6, 32*8
5057    call m(iadst_16x16_internal_10bpc).main
5058    vpbroadcastd        m14, [pd_5120]
5059    vpbroadcastd        m13, [pd_5119]
5060    psrld               m15, 10 ; pd_2
    ; In-register reversal. The order of these instructions matters: each
    ; destination is a register whose old value has already been consumed
    ; (m8 is free on entry to the sequence, then m7, m9, m6, ...).
5061    psubd                m8, m13, m7
5062    paddd                m7, m14, m9
5063    paddd                m9, m14, m6
5064    psubd                m6, m13, m10
5065    psubd               m10, m13, m5
5066    paddd                m5, m14, m11
5067    paddd               m11, m14, m4
5068    psubd                m4, m13, m12
5069    psubd               m12, m15, m3
5070    paddd                m3, m15, [r6-32*1]
5071    paddd               m13, m15, m2
5072    psubd                m2, m15, [r6-32*2]
5073    psubd               m14, m15, m1
    ; m15 is about to become a result, so keep a copy of pd_2 in m1.
5074    mova                 m1, m15
5075    paddd               m15, m0
5076    psubd                m0, m1, [r6-32*4]
5077    paddd                m1,     [r6-32*3]
    ; Shared tail: applies the final shifts, rewinds r6 and jumps to tx2q.
5078    jmp m(iadst_16x16_internal_10bpc).pass1_end
; .pass2: second pass of the flipped ADST for 10 bpc. Runs the same 8 bpc
; ADST helpers as iadst_16x16_internal_10bpc.pass2, then emits the sixteen
; result rows in reverse order (m15 first, the row held in m0 last).
; m12 = +pw_2048 and m13 = -pw_2048 alternate between successive rows.
5079.pass2:
5080    call m(idct_16x16_internal_10bpc).transpose
5081    lea                  r6, [pw_5+128]
5082    mova              [rsp], m15
5083    call m(iadst_16x16_internal_8bpc).main
5084    call m(iadst_16x16_internal_8bpc).main_pass2_end
    ; m0, m2 and m3 are needed last, so they are spilled; the slot at
    ; [rsp+32*1] is read at the end without being written here, so the
    ; helper calls above apparently left a row there.
5085    mova         [rsp+32*3], m3
5086    mova         [rsp+32*2], m2
5087    mova         [rsp+32*0], m0
    ; Move m13/m12 out of the way before they become the multipliers.
5088    mova                 m2, m13
5089    mova                 m3, m12
5090    vpbroadcastd        m12, [pw_2048]
5091    pxor                m13, m13
5092    psubw               m13, m12
5093    pmulhrsw             m0, m13, m15
5094    pmulhrsw             m1, m12, m14
5095    pmulhrsw             m2, m13
5096    pmulhrsw             m3, m12
    ; m14/m15 are free now; use them to carry m8/m9 across the store call.
5097    mova                m14, m8
5098    mova                m15, m9
5099    call m(idct_16x8_internal_10bpc).write_16x4_start
5100    pmulhrsw             m0, m13, m11
5101    pmulhrsw             m1, m12, m10
5102    pmulhrsw             m2, m13, m15
5103    pmulhrsw             m3, m12, m14
5104    call m(idct_16x8_internal_10bpc).write_16x4_zero
5105    pmulhrsw             m0, m13, m7
5106    pmulhrsw             m1, m12, m6
5107    pmulhrsw             m2, m13, m5
5108    pmulhrsw             m3, m12, m4
5109    call m(idct_16x8_internal_10bpc).write_16x4_zero
5110    pmulhrsw             m0, m13, [rsp+32*3]
5111    pmulhrsw             m1, m12, [rsp+32*2]
5112    pmulhrsw             m2, m13, [rsp+32*1]
5113    pmulhrsw             m3, m12, [rsp+32*0]
5114    call m(idct_16x8_internal_10bpc).write_16x4_zero
5115    RET
5116
; 16x16 identity transform, 10 bpc: entry point and first pass.
;
; Every coefficient x becomes (x * pd_5793 + pd_5120) >> 13.
;   m15 = pd_5793 multiplier, m7 = pd_5120 rounding term
;   r6  = scratch pointer, starts at rsp+32*4
;   eobd = eob - 36; a negative result skips the .righthalf loop
; INV_TXFM_16X16_FN is defined outside this excerpt.
5117INV_TXFM_16X16_FN identity, dct, -92
5118INV_TXFM_16X16_FN identity, identity
5119
5120cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5121    vpbroadcastd        m15, [pd_5793]
5122    vpbroadcastd         m7, [pd_5120]
5123    lea                  r6, [rsp+32*4]
5124    sub                eobd, 36
5125    jl .fast
    ; r3 counts up from -32*8*4 to 0 in steps of 32*8, i.e. four iterations.
    ; Each iteration scales the second 32 bytes of four consecutive 64-byte
    ; rows and stores them at r6 after advancing r6 by 32*4.
5126    mov                  r3, -32*8*4
5127.righthalf:
5128    pmulld               m0, m15, [cq+r3+32*33]
5129    pmulld               m1, m15, [cq+r3+32*35]
5130    pmulld               m2, m15, [cq+r3+32*37]
5131    pmulld               m3, m15, [cq+r3+32*39]
5132    add                  r6, 32*4
5133    REPX      {paddd x, m7}, m0, m1, m2, m3
5134    REPX      {psrad x, 13}, m0, m1, m2, m3
5135    mova          [r6+32*0], m0
5136    mova          [r6+32*1], m1
5137    mova          [r6+32*2], m2
5138    mova          [r6+32*3], m3
5139    add                  r3, 32*8
5140    jl .righthalf
    ; Scale the first 32 bytes of all sixteen rows into m0-m15.
5141.fast:
5142    pmulld               m0, m15, [cq+64* 0]
5143    pmulld               m1, m15, [cq+64* 1]
5144    pmulld               m2, m15, [cq+64* 2]
5145    pmulld               m3, m15, [cq+64* 3]
5146    pmulld               m4, m15, [cq+64* 4]
5147    pmulld               m5, m15, [cq+64* 5]
5148    pmulld               m6, m15, [cq+64* 6]
    ; m7 still holds the rounding term, so row 7's product is computed in m8
    ; and parked at [cq] (row 0 has already been consumed).
5149    pmulld               m8, m15, [cq+64* 7]
5150    mova               [cq], m8
5151    pmulld               m8, m15, [cq+64* 8]
5152    pmulld               m9, m15, [cq+64* 9]
5153    pmulld              m10, m15, [cq+64*10]
5154    pmulld              m11, m15, [cq+64*11]
5155    pmulld              m12, m15, [cq+64*12]
5156    pmulld              m13, m15, [cq+64*13]
5157    pmulld              m14, m15, [cq+64*14]
    ; Last use of the multiplier: m15 is overwritten with row 15's product.
5158    pmulld              m15,      [cq+64*15]
5159    REPX      {paddd x, m7}, m0,  m1,  m2,  m3,  m4,  m5,  m6, \
5160                             m8,  m9,  m10, m11, m12, m13, m14, m15
    ; Adding the parked product to the rounding term turns m7 into row 7.
5161    paddd                m7, [cq]
5162    REPX      {psrad x, 13}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
5163                             m8,  m9,  m10, m11, m12, m13, m14, m15
5164    jmp                tx2q
; .pass2: second identity pass for 10 bpc, on packed words.
; IDTX16 is a macro defined outside this excerpt; it is invoked here as
; IDTX16 <row>, <temp>, <constant register>. Because all sixteen registers
; hold rows after the transpose, m15 and m0 are parked in the (already
; consumed) coefficient buffer to free a constant and a temporary register.
5165.pass2:
5166    call m(idct_16x16_internal_10bpc).transpose
5167
5168    mova          [cq+32*0], m15
5169    mova          [cq+32*1], m0
5170    vpbroadcastd        m15, [pw_1697x16]
5171
5172    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
5173                              8,  9, 10, 11, 12, 13, 14
    ; Row 0: reload it, and park the finished m1 so m1 can act as the temp.
5174    mova                 m0, [cq+32*1]
5175    mova          [cq+32*1], m1
5176    IDTX16                0, 1, 15
    ; Row 15 is processed by hand because m15 currently holds the constant:
    ; m15 = pmulhrsw(constant, x) + 2*x, using saturating word adds.
    ; NOTE(review): presumably the same arithmetic as IDTX16 - confirm at the
    ; macro definition.
5177    mova                 m1, [cq+32*0]
5178    pmulhrsw            m15, m1
5179    paddsw               m1, m1
5180    paddsw              m15, m1
5181    mova                 m1, [cq+32*1]
5182    jmp m(idct_16x16_internal_10bpc).end
5183
; 16x16 inverse DCT, 12 bpc: entry point and second pass.
;
; Pass 1 is shared with the 10 bpc version; only the clip bounds differ
; (clip_20b_min/max in m12/m13 before jumping to the 10 bpc .pass1).
; Pass 2 stays in dword precision and processes the sixteen registers as two
; groups of eight through .pass2_main.
; INV_TXFM_16X16_FN is defined outside this excerpt.
5184INV_TXFM_16X16_FN dct, dct,       0, 12
5185INV_TXFM_16X16_FN dct, identity, 28, 12
5186INV_TXFM_16X16_FN dct, adst,      0, 12
5187INV_TXFM_16X16_FN dct, flipadst,  0, 12
5188
5189cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5190    vpbroadcastd        m12, [clip_20b_min]
5191    vpbroadcastd        m13, [clip_20b_max]
5192    jmp m(idct_16x16_internal_10bpc).pass1
5193.pass2:
    ; Park m8-m15 in the coefficient buffer while m0-m7 are transformed.
5194    mova         [cq+32* 8], m8
5195    mova         [cq+32* 9], m9
5196    mova         [cq+32*10], m10
5197    mova         [cq+32*11], m11
5198    mova         [cq+32*12], m12
5199    mova         [cq+32*13], m13
5200    mova         [cq+32*14], m14
5201    mova         [cq+32*15], m15
5202    call .pass2_main
    ; Pack the sixteen dword results pairwise into eight word vectors and
    ; store them in the eight slots around r6.
5203    packssdw             m0,  m1
5204    packssdw             m1,  m2,  m3
5205    packssdw             m2,  m4,  m5
5206    packssdw             m3,  m6,  m7
5207    packssdw             m4,  m8,  m9
5208    packssdw             m5, m10, m11
5209    packssdw             m6, m12, m13
5210    packssdw             m7, m14, m15
5211    mova          [r6-32*4], m0
5212    mova          [r6-32*3], m1
5213    mova          [r6-32*2], m2
5214    mova          [r6-32*1], m3
5215    mova          [r6+32*0], m4
5216    mova          [r6+32*1], m5
5217    mova          [r6+32*2], m6
5218    mova          [r6+32*3], m7
    ; Second group: bring back the parked registers as m0-m7.
5219    mova                 m0, [cq+32* 8]
5220    mova                 m1, [cq+32* 9]
5221    mova                 m2, [cq+32*10]
5222    mova                 m3, [cq+32*11]
5223    mova                 m4, [cq+32*12]
5224    mova                 m5, [cq+32*13]
5225    mova                 m6, [cq+32*14]
5226    mova                 m7, [cq+32*15]
    ; r5 remembers where the first group's packed results were stored; the
    ; shared .end code reads them back relative to r5. r6 is moved on by
    ; 32*16 for the second .pass2_main call.
5227    mov                  r5, r6
5228    add                  r6, 32*16
5229    call .pass2_main
5230    jmp m(iadst_16x16_internal_12bpc).end
; .write_16x16: 12 bpc variant of the 16x16 store sequence.
; Spills m8, m9 and m12 to the three slots at [rsp+gprsize+32*0..2], loads
; pw_16384 into m12 as the pmulhrsw multiplier, scales and stores the first
; group with the 12 bpc write_16x4_start, then continues in the shared
; 10 bpc tail (.write_16x16_2), which reads the same three spill slots.
; NOTE(review): the +gprsize presumably skips the return address pushed by
; the call that entered this routine - confirm against x86inc conventions.
5231ALIGN function_align
5232.write_16x16:
5233    mova [rsp+gprsize+32*0], m8
5234    mova [rsp+gprsize+32*1], m9
5235    mova [rsp+gprsize+32*2], m12
5236    vpbroadcastd        m12, [pw_16384]
5237    pmulhrsw             m0, m12
5238    pmulhrsw             m1, m12
5239    pmulhrsw             m2, m12
5240    pmulhrsw             m3, m12
5241    call m(idct_16x8_internal_12bpc).write_16x4_start
5242    call m(idct_16x8_internal_10bpc).write_16x4_zero
5243    jmp m(idct_16x16_internal_10bpc).write_16x16_2
; .pass2_main: 16-point inverse DCT over eight dword columns for the 12 bpc
; second pass.
;
; In:  m0-m7 = eight dword vectors (transposed here first); eobd = adjusted
;      end-of-block value; r6 = scratch pointer; cq = buffer used for
;      temporaries (slots 0-7 are overwritten).
; Out: m0-m15 = results after rounding by pd_8 and an arithmetic shift by 4.
;      On the slow path r6 is left decremented by 32*8.
; Inputs are clamped to the clip_18b_min/max range before each helper call.
; The helpers called here are defined outside this excerpt.
5244ALIGN function_align
5245.pass2_main:
5246    call m(idct_8x8_internal_12bpc).transpose_8x8
    ; Even-numbered rows are set aside; odd-numbered rows are clamped into
    ; m0-m3 as the first half of the odd-half input.
5247    mova         [cq+32* 0], m0
5248    mova         [cq+32* 1], m2
5249    mova         [cq+32* 2], m4
5250    mova         [cq+32* 3], m6
5251    vpbroadcastd        m12, [clip_18b_min]
5252    vpbroadcastd        m13, [clip_18b_max]
5253    pmaxsd               m0, m12, m1
5254    pmaxsd               m1, m12, m3
5255    pmaxsd               m2, m12, m5
5256    pmaxsd               m3, m12, m7
5257    REPX    {pminsd x, m13}, m0, m1, m2, m3
5258    test               eobd, eobd
5259    jge .pass2_slow
    ; Fast path (eobd negative): the second half of the input is zero.
5260    pxor                 m4, m4
5261    REPX       {mova x, m4}, m5, m6, m7
5262    jmp .pass2_fast
5263.pass2_slow:
    ; Load the other eight vectors from the scratch area and transpose them;
    ; the register list places the odd-numbered rows in m4-m7.
5264    sub                  r6, 32*8
5265    mova                 m8, [r6-32*4]
5266    mova                 m4, [r6-32*3]
5267    mova                m10, [r6-32*2]
5268    mova                 m5, [r6-32*1]
5269    mova                m12, [r6+32*0]
5270    mova                 m6, [r6+32*1]
5271    mova                m14, [r6+32*2]
5272    mova                 m7, [r6+32*3]
5273    TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15
5274    mova         [cq+32* 4], m8
5275    mova         [cq+32* 5], m10
5276    mova         [cq+32* 6], m12
5277    mova         [cq+32* 7], m14
    ; m12/m13 were used by the transpose above, so reload the clip bounds.
5278    vpbroadcastd        m12, [clip_18b_min]
5279    vpbroadcastd        m13, [clip_18b_max]
5280    REPX    {pmaxsd x, m12}, m4, m5, m6, m7
5281    REPX    {pminsd x, m13}, m4, m5, m6, m7
5282.pass2_fast:
5283    vpbroadcastd        m11, [pd_2048]
5284    vpbroadcastd        m14, [pd_2896]
5285    call m(idct_8x16_internal_10bpc).main_oddhalf
    ; Reload and clamp the even-numbered rows saved earlier.
    ; NOTE(review): relies on m12/m13 still holding the clip bounds after
    ; main_oddhalf - confirm against that helper.
5286    pmaxsd               m0, m12, [cq+32* 0]
5287    pmaxsd               m1, m12, [cq+32* 1]
5288    pmaxsd               m2, m12, [cq+32* 2]
5289    pmaxsd               m3, m12, [cq+32* 3]
5290    REPX    {pminsd x, m13}, m0, m1, m2, m3
5291    test               eobd, eobd
5292    jge .pass2_slow2
5293    pxor                 m4, m4
5294    REPX       {mova x, m4}, m5, m6, m7
5295    jmp .pass2_fast2
5296.pass2_slow2:
5297    pmaxsd               m4, m12, [cq+32* 4]
5298    pmaxsd               m5, m12, [cq+32* 5]
5299    pmaxsd               m6, m12, [cq+32* 6]
5300    pmaxsd               m7, m12, [cq+32* 7]
5301    REPX    {pminsd x, m13}, m4, m5, m6, m7
5302.pass2_fast2:
5303    call m(idct_8x8_internal_10bpc).main
5304    call m(idct_8x16_internal_10bpc).main_evenhalf
    ; Derive the rounding term by shifting m11 (loaded with pd_2048 above).
5305    psrad               m11, 8  ; pd_8
5306    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5307    call m(idct_16x8_internal_10bpc).pass1_rotations
5308    REPX       {psrad x, 4}, m0, m1, m2,  m3,  m4,  m5,  m6,  m7, \
5309                             m8, m9, m10, m11, m12, m13, m14, m15
5310    ret
5311
; 16x16 inverse ADST, 12 bpc: entry point, second pass and shared store tail.
;
; Pass 1 is shared with the 10 bpc version; only the clip bounds differ
; (clip_20b_min/max in m13/m14 before jumping to the 10 bpc .pass1).
; Pass 2 runs in dword precision as two groups of eight registers:
;   .pass2_part1 + rotations -> first group
;   .pass2_part2 + rotations -> second group (first group is packed and
;                               stored, its location kept in r5)
; .pass2_part3 and .end are also entered from other 12 bpc functions in this
; file. INV_TXFM_16X16_FN is defined outside this excerpt.
5312INV_TXFM_16X16_FN adst, dct,      0, 12
5313INV_TXFM_16X16_FN adst, adst,     0, 12
5314INV_TXFM_16X16_FN adst, flipadst, 0, 12
5315
5316cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5317    vpbroadcastd        m13, [clip_20b_min]
5318    vpbroadcastd        m14, [clip_20b_max]
5319    jmp m(iadst_16x16_internal_10bpc).pass1
5320.pass2:
5321    call .pass2_part1
5322    call m(iadst_16x8_internal_10bpc).pass1_rotations
5323    call .pass2_part2
5324    call m(iadst_16x8_internal_10bpc).pass1_rotations
5325.pass2_part3:
    ; Final shifts for the second group: 4 for m0-m3 and m12-m15, 15 for
    ; m4-m11 (the same split that .pass2_part2 applies to the first group).
5326    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
5327    REPX      {psrad x, 15}, m4, m5, m6, m7, m8,  m9,  m10, m11
5328.end:
    ; Pack register pairs to words, working from m15 downwards so that every
    ; destination is a register whose dword contents were already consumed.
5329    packssdw            m15, m14
5330    packssdw            m14, m13, m12
5331    packssdw            m13, m11, m10
5332    packssdw            m12,  m9,  m8
5333    packssdw            m11,  m7,  m6
5334    packssdw            m10,  m5,  m4
5335    packssdw             m7,  m3,  m2
5336    packssdw             m6,  m1,  m0
    ; Each packed register is merged with the matching vector of the first
    ; group stored at r5. 0x33 and 0xcc are complementary dword masks, so the
    ; two blends split the same pair of sources in opposite ways; vpermq then
    ; puts the qwords of each result in order. Four output rows per store call.
5337    vpblendd             m0, m6, [r5-32*4], 0x33
5338    vpblendd             m1, m6, [r5-32*4], 0xcc
5339    vpblendd             m2, m7, [r5-32*3], 0x33
5340    vpblendd             m3, m7, [r5-32*3], 0xcc
5341    vpermq               m0, m0, q3120
5342    vpermq               m1, m1, q2031
5343    vpermq               m2, m2, q3120
5344    vpermq               m3, m3, q2031
5345    call m(idct_16x8_internal_12bpc).write_16x4_start
5346    call m(idct_16x8_internal_10bpc).write_16x4_zero
5347    vpblendd             m0, m10, [r5-32*2], 0x33
5348    vpblendd             m1, m10, [r5-32*2], 0xcc
5349    vpblendd             m2, m11, [r5-32*1], 0x33
5350    vpblendd             m3, m11, [r5-32*1], 0xcc
5351    vpermq               m0, m0, q3120
5352    vpermq               m1, m1, q2031
5353    vpermq               m2, m2, q3120
5354    vpermq               m3, m3, q2031
5355    call m(idct_16x8_internal_10bpc).write_16x4_zero
5356    vpblendd             m0, m12, [r5+32*0], 0x33
5357    vpblendd             m1, m12, [r5+32*0], 0xcc
5358    vpblendd             m2, m13, [r5+32*1], 0x33
5359    vpblendd             m3, m13, [r5+32*1], 0xcc
5360    vpermq               m0, m0, q3120
5361    vpermq               m1, m1, q2031
5362    vpermq               m2, m2, q3120
5363    vpermq               m3, m3, q2031
5364    call m(idct_16x8_internal_10bpc).write_16x4_zero
5365    vpblendd             m0, m14, [r5+32*2], 0x33
5366    vpblendd             m1, m14, [r5+32*2], 0xcc
5367    vpblendd             m2, m15, [r5+32*3], 0x33
5368    vpblendd             m3, m15, [r5+32*3], 0xcc
5369    vpermq               m0, m0, q3120
5370    vpermq               m1, m1, q2031
5371    vpermq               m2, m2, q3120
5372    vpermq               m3, m3, q2031
5373    call m(idct_16x8_internal_10bpc).write_16x4_zero
5374    RET
; .pass2_part1 / .pass2_main: 16-point inverse ADST over eight dword columns
; for the 12 bpc second pass.
;
; .pass2_part1 first parks m8-m15 at cq+32*8..15 and falls into .pass2_main;
; .pass2_part2 jumps to .pass2_main directly after reloading those values.
;
; In:  m0-m7 = eight dword vectors (transposed here first); eobd = adjusted
;      end-of-block value; r6 = scratch pointer; cq slots 0-7 are used for
;      temporaries.
; Out: the state left by iadst_16x8_internal_10bpc.main_part2 (outside this
;      excerpt), plus m14 = pd_17408, m13 = pd_17408 - 1, m15 = pd_8 for the
;      rotations step the callers run next. On the slow path r6 is left
;      decremented by 32*8.
; The row-to-register assignment mirrors the load order used by
; iadst_16x16_internal_10bpc.main (rows 2,13,6,9,10,5,14,1 for part 1 and
; rows 0,15,4,11,8,7,12,3 for part 2, in m0-m7).
5375ALIGN function_align
5376.pass2_part1:
5377    mova         [cq+32* 8], m8
5378    mova         [cq+32* 9], m9
5379    mova         [cq+32*10], m10
5380    mova         [cq+32*11], m11
5381    mova         [cq+32*12], m12
5382    mova         [cq+32*13], m13
5383    mova         [cq+32*14], m14
5384    mova         [cq+32*15], m15
5385.pass2_main:
5386    call m(idct_8x8_internal_12bpc).transpose_8x8
    ; Rows 0, 3, 4 and 7 belong to part 2; set them aside.
5387    mova         [cq+32* 0], m0
5388    mova         [cq+32* 1], m3
5389    mova         [cq+32* 2], m4
5390    mova         [cq+32* 3], m7
5391    vpbroadcastd        m13, [clip_18b_min]
5392    vpbroadcastd        m14, [clip_18b_max]
    ; Clamp rows 2, 6, 5 and 1 into their part-1 registers (m0, m2, m5, m7).
5393    pmaxsd               m0, m13, m2
5394    pmaxsd               m2, m13, m6
5395    pmaxsd               m5, m13, m5
5396    pmaxsd               m7, m13, m1
5397    REPX    {pminsd x, m14}, m0, m2, m5, m7
5398    test               eobd, eobd
5399    jge .pass2_slow
    ; Fast path (eobd negative): rows 8-15 are zero.
5400    pxor                 m1, m1
5401    REPX       {mova x, m1}, m3, m4, m6
5402    jmp .pass2_fast
5403.pass2_slow:
    ; Load and transpose the other eight vectors. The register list places
    ; rows 9, 10, 13 and 14 in m3, m4, m1 and m6 for part 1; rows 8, 11, 12
    ; and 15 (m8, m11, m12, m15) are set aside for part 2.
5404    sub                  r6, 32*8
5405    mova                 m8, [r6-32*4]
5406    mova                 m3, [r6-32*3]
5407    mova                 m4, [r6-32*2]
5408    mova                m11, [r6-32*1]
5409    mova                m12, [r6+32*0]
5410    mova                 m1, [r6+32*1]
5411    mova                 m6, [r6+32*2]
5412    mova                m15, [r6+32*3]
5413    TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14
5414    mova         [cq+32* 4], m8
5415    mova         [cq+32* 5], m11
5416    mova         [cq+32* 6], m12
5417    mova         [cq+32* 7], m15
    ; m13/m14 served as temporaries in the transpose; reload the clip bounds.
5418    vpbroadcastd        m13, [clip_18b_min]
5419    vpbroadcastd        m14, [clip_18b_max]
5420    REPX    {pmaxsd x, m13}, m1, m3, m4, m6
5421    REPX    {pminsd x, m14}, m1, m3, m4, m6
5422.pass2_fast:
5423    vpbroadcastd        m12, [pd_2048]
5424    vpbroadcastd        m15, [pd_2896]
5425    call m(iadst_16x8_internal_10bpc).main_part1
    ; NOTE(review): relies on m13/m14 still holding the clip bounds after
    ; main_part1 - confirm against that helper.
5426    pmaxsd               m0, m13, [cq+32* 0] ;  0
5427    pmaxsd               m7, m13, [cq+32* 1] ;  3
5428    pmaxsd               m2, m13, [cq+32* 2] ;  4
5429    pmaxsd               m5, m13, [cq+32* 3] ;  7
5430    REPX    {pminsd x, m14}, m0, m2, m5, m7
5431    test               eobd, eobd
5432    jge .pass2_slow2
5433    pxor                 m1, m1
5434    REPX       {mova x, m1}, m3, m4, m6
5435    jmp .pass2_fast2
5436.pass2_slow2:
5437    pmaxsd               m4, m13, [cq+32* 4] ;  8
5438    pmaxsd               m3, m13, [cq+32* 5] ; 11
5439    pmaxsd               m6, m13, [cq+32* 6] ; 12
5440    pmaxsd               m1, m13, [cq+32* 7] ; 15
5441    REPX    {pminsd x, m14}, m1, m3, m4, m6
5442.pass2_fast2:
5443    call m(iadst_16x8_internal_10bpc).main_part2
    ; Set up the rounding constants for the caller's rotations step; two of
    ; them are derived from m15 rather than loaded (the existing comments
    ; imply m15 still holds pd_2896 here).
5444    vpbroadcastd        m14, [pd_17408]
5445    psrld               m15, 11              ; pd_1
5446    psubd               m13, m14, m15        ; pd_17407
5447    pslld               m15, 3               ; pd_8
5448    ret
; .pass2_part2: finishes the first group of eight columns and starts the
; second.
; Applies the final shifts to the first group's sixteen results, packs them
; pairwise to words, stores the eight packed vectors around r6, reloads the
; registers parked at cq+32*8..15 by .pass2_part1 as the new m0-m7, records
; the store location in r5 (used later by .end), advances r6 by 32*16 and
; tail-jumps into .pass2_main, whose ret returns to this routine's caller.
5449ALIGN function_align
5450.pass2_part2:
5451    REPX      {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
5452    REPX      {psrad x, 15}, m4, m5, m6, m7, m8,  m9,  m10, m11
5453    packssdw             m0,  m1
5454    packssdw             m1,  m2,  m3
5455    packssdw             m2,  m4,  m5
5456    packssdw             m3,  m6,  m7
5457    packssdw             m4,  m8,  m9
5458    packssdw             m5, m10, m11
5459    packssdw             m6, m12, m13
5460    packssdw             m7, m14, m15
5461    mova          [r6-32*4], m0
5462    mova          [r6-32*3], m1
5463    mova          [r6-32*2], m2
5464    mova          [r6-32*1], m3
5465    mova          [r6+32*0], m4
5466    mova          [r6+32*1], m5
5467    mova          [r6+32*2], m6
5468    mova          [r6+32*3], m7
5469    mova                 m0, [cq+32* 8]
5470    mova                 m1, [cq+32* 9]
5471    mova                 m2, [cq+32*10]
5472    mova                 m3, [cq+32*11]
5473    mova                 m4, [cq+32*12]
5474    mova                 m5, [cq+32*13]
5475    mova                 m6, [cq+32*14]
5476    mova                 m7, [cq+32*15]
5477    mov                  r5, r6
5478    add                  r6, 32*16
5479    jmp .pass2_main
5480
; 16x16 inverse flipped ADST, 12 bpc.
;
; Pass 1 reuses the 10 bpc flipadst code with clip_20b_min/max in m13/m14.
; Pass 2 follows the same sequence as iadst_16x16_internal_12bpc.pass2 and
; reuses its .pass2_part1/.pass2_part2/.pass2_part3; the only difference is
; that the rotations step after each part comes from
; iflipadst_16x8_internal_10bpc (outside this excerpt) instead of the
; iadst one. INV_TXFM_16X16_FN is defined outside this excerpt.
5481INV_TXFM_16X16_FN flipadst, dct,      0, 12
5482INV_TXFM_16X16_FN flipadst, adst,     0, 12
5483INV_TXFM_16X16_FN flipadst, flipadst, 0, 12
5484
5485cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5486    vpbroadcastd        m13, [clip_20b_min]
5487    vpbroadcastd        m14, [clip_20b_max]
5488    jmp m(iflipadst_16x16_internal_10bpc).pass1
5489.pass2:
5490    call m(iadst_16x16_internal_12bpc).pass2_part1
5491    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
5492    call m(iadst_16x16_internal_12bpc).pass2_part2
5493    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
5494    jmp m(iadst_16x16_internal_12bpc).pass2_part3
5495
5496INV_TXFM_16X16_FN identity, dct,    -92, 12
5497INV_TXFM_16X16_FN identity, identity, 0, 12
5498
; IDTX16_12BPC src
; In-place identity scaling of one dword register for the 12 bpc 16x16
; first pass:
;     m<src> = (((m<src> * m7 + m15) >> 12) + m<src>) >> 1
; Requires m7 = multiplier and m15 = rounding term (the function that
; follows loads pd_1697 and pd_5120 into them). Clobbers m6, so it must not
; be invoked with 6, 7 or 15 as the argument.
5499%macro IDTX16_12BPC 1 ; src
5500    pmulld               m6, m7, m%1
5501    paddd                m6, m15
5502    psrad                m6, 12
5503    paddd                m6, m%1
5504    psrad               m%1, m6, 1
5505%endmacro
5506
; iidentity_16x16_internal_12bpc: 16x16 identity transform, 12 bpc.
; Pass 1 scales coefficients and then jumps to tx2q. The explicit sequences
; below compute (x + ((x*1697 + 5120) >> 12)) >> 1; IDTX16_12BPC (defined
; above this function) is applied to the remaining registers.
; [cq+64*n] addresses the first 32 bytes of each 64-byte row; the .righthalf
; loop handles the 32-byte chunks at odd multiples of 32 and is skipped when
; eob < 36. .pass2 is the second-pass entry point.
5507cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
5508    vpbroadcastd         m7, [pd_1697]
5509    vpbroadcastd        m15, [pd_5120]
5510    lea                  r6, [rsp+32*4]
; eobd keeps the value (eob - 36) afterwards; negative means "fast" path
5511    sub                eobd, 36
5512    jl .fast
; r3 counts up from -32*8*4 to 0 in steps of 32*8: four iterations
5513    mov                  r3, -32*8*4
5514.righthalf:
5515    mova                m10, [cq+r3+32*33]
5516    mova                m11, [cq+r3+32*35]
5517    mova                m12, [cq+r3+32*37]
5518    mova                m13, [cq+r3+32*39]
; advance the stack output pointer; the first store lands at rsp+32*8
5519    add                  r6, 32*4
5520    pmulld               m0, m7, m10
5521    pmulld               m1, m7, m11
5522    pmulld               m2, m7, m12
5523    pmulld               m3, m7, m13
5524    REPX     {paddd x, m15}, m0, m1, m2, m3
5525    REPX     {psrad x, 12 }, m0, m1, m2, m3
5526    paddd                m0, m10
5527    paddd                m1, m11
5528    paddd                m2, m12
5529    paddd                m3, m13
5530    REPX     {psrad x, 1  }, m0, m1, m2, m3
5531    mova          [r6+32*0], m0
5532    mova          [r6+32*1], m1
5533    mova          [r6+32*2], m2
5534    mova          [r6+32*3], m3
5535    add                  r3, 32*8
5536    jl .righthalf
5537.fast:
5538    mova                 m0, [cq+64* 0]
5539    mova                 m1, [cq+64* 1]
5540    mova                 m2, [cq+64* 2]
5541    mova                 m3, [cq+64* 3]
5542    mova                 m4, [cq+64* 4]
5543    mova                 m5, [cq+64* 5]
5544    mova                 m8, [cq+64* 6]
5545    mova                 m9, [cq+64* 7]
5546    REPX   {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9
; park results 6 and 7 in the coefficient buffer; they are reloaded into
; m6/m7 right before the jump to tx2q
5547    mova          [cq+64*0], m8
5548    mova          [cq+64*1], m9
5549    mova                 m8, [cq+64* 8]
5550    mova                 m9, [cq+64* 9]
5551    mova                m10, [cq+64*10]
5552    mova                m11, [cq+64*11]
5553    mova                m12, [cq+64*12]
5554    mova                m13, [cq+64*13]
5555    mova                m14, [cq+64*14]
5556    REPX   {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14
; last row is scaled by hand: this consumes the constants in m7 (1697) and
; m15 (5120), leaving the result in m15
5557    mova                 m6, [cq+64*15]
5558    pmulld               m7, m6
5559    paddd                m7, m15
5560    psrad                m7, 12
5561    paddd                m7, m6
5562    mova                 m6, [cq+64*0]
5563    psrad               m15, m7, 1
5564    mova                 m7, [cq+64*1]
5565    jmp                tx2q
5566.pass2:
5567    call m(iidentity_8x16_internal_12bpc).pass2_main
5568    call m(idct_16x16_internal_10bpc).transpose_fast
; NOTE(review): relies on eobd still holding (eob - 36) from pass 1, so a
; negative value means the .righthalf data was never produced - confirm that
; the code reached through tx2q preserves eobd
5569    test               eobd, eobd
5570    jl .pass2_fast
; save the first transposed half in cq while the second half is processed
5571    mova         [cq+32* 8], m0
5572    mova         [cq+32* 9], m1
5573    mova         [cq+32*10], m2
5574    mova         [cq+32*11], m3
5575    mova         [cq+32*12], m4
5576    mova         [cq+32*13], m5
5577    mova         [cq+32*14], m6
5578    mova         [cq+32*15], m7
; load 16 rows from the stack buffer addressed through r6
5579    mova                 m8, [r6-32*4]
5580    mova                 m9, [r6-32*3]
5581    mova                m10, [r6-32*2]
5582    mova                m11, [r6-32*1]
5583    mova                m12, [r6+32*0]
5584    mova                m13, [r6+32*1]
5585    mova                m14, [r6+32*2]
5586    mova                m15, [r6+32*3]
5587    sub                  r6, 32*8
5588    mova                 m0, [r6-32*4]
5589    mova                 m1, [r6-32*3]
5590    mova                 m2, [r6-32*2]
5591    mova                 m3, [r6-32*1]
5592    mova                 m4, [r6+32*0]
5593    mova                 m5, [r6+32*1]
5594    mova                 m6, [r6+32*2]
5595    mova                 m7, [r6+32*3]
5596    call m(iidentity_8x16_internal_12bpc).pass2_main
5597    call m(idct_16x8_internal_10bpc).transpose2
; second half goes to m8-m15, first half is restored into m0-m7
5598    mova                 m8, m0
5599    mova                 m9, m1
5600    mova                m10, m2
5601    mova                m11, m3
5602    mova                m12, m4
5603    mova                m13, m5
5604    mova                m14, m6
5605    mova                m15, m7
5606    mova                 m0, [cq+32* 8]
5607    mova                 m1, [cq+32* 9]
5608    mova                 m2, [cq+32*10]
5609    mova                 m3, [cq+32*11]
5610    mova                 m4, [cq+32*12]
5611    mova                 m5, [cq+32*13]
5612    mova                 m6, [cq+32*14]
5613    mova                 m7, [cq+32*15]
5614.pass2_fast:
5615    call m(idct_16x16_internal_12bpc).write_16x16
5616    RET
5617
; IDCT32_END: final butterfly stage of a 32-point IDCT for one register.
;   %1 = in/out1 register index n (also selects the memory rows used below)
;   %2 = out2, %3-%5 = temporaries, %6 = final right-shift amount
;   %7 = flags, default 1; bit 0 set packs the four dword results into words
; Reads rows relative to r6, r5 and r4; clamps the first butterfly to
; [m12, m13]; adds the rounding constant in m11 before the shift.
5618%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack
5619    mova                m%4, [r6+32*(%1-4)]
5620    mova                m%2, [r5+32*(3-%1)]
5621    mova                m%5, [r4+32*(%1-4)]
5622    psubd               m%3, m%1, m%4 ; idct16 out15 - n
5623    paddd               m%1, m%4      ; idct16 out0  + n
5624    pmaxsd              m%1, m12
5625    pmaxsd              m%3, m12
5626    pminsd              m%1, m13
5627    pminsd              m%3, m13
; rounding is added once here and carried into all four outputs below
5628    paddd               m%1, m11
5629    paddd               m%3, m11
5630    psubd               m%4, m%1, m%2 ; out31 - n
5631    paddd               m%1, m%2      ; out0  + n
5632    paddd               m%2, m%3, m%5 ; out15 - n
5633    psubd               m%3, m%5      ; out16 + n
5634    REPX      {psrad x, %6}, m%1, m%3, m%2, m%4
5635%if %7 & 1
5636    packssdw            m%1, m%3      ; out0  + n, out16 + n
5637    packssdw            m%2, m%4      ; out15 - n, out31 - n
5638%endif
5639%endmacro
5640
; inv_txfm_add_dct_dct_8x32_10bpc: DCT/DCT inverse transform, 8x32, 10 bpc.
; eob == 0 takes the .dconly shortcut. Otherwise .pass1_main is called one to
; four times depending on eob (thresholds 43, 107, 171); each call advances
; cq by 32 bytes. The word-packed results are then fed to the 8bpc 8x32
; second pass (.main_fast or .main) and written out 8x4 at a time in .end.
5641cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob
5642    test               eobd, eobd
5643    jz .dconly
5644    PROLOGUE              0, 7, 16, 32*12, dst, stride, c, eob
5645%undef cmp
5646    vpbroadcastd        m11, [pd_2048]
5647    vpbroadcastd        m12, [clip_18b_min]
5648    vpbroadcastd        m13, [clip_18b_max]
5649    vbroadcasti128      m14, [idct32_shuf]
; r4 keeps the original coefficient pointer; cq is restored from it in .end
5650    mov                  r4, cq
5651    call .pass1_main
5652    mova         [rsp+32*0], m2
5653    mova         [rsp+32*1], m3
5654    cmp                eobd, 43
5655    jge .eob43
; eob < 43: only the first pass-1 block exists, everything else is zero
5656    pxor                 m4, m4
5657    REPX       {mova x, m4}, [rsp+32*2], m2, m3, m11
5658    jmp .pass1_end_fast
5659.eob43:
5660    lea                  r6, [rsp+32*8]
5661    mova          [r6-32*4], m0
5662    mova          [r6-32*3], m1
5663    call .pass1_main
5664    mova         [rsp+32*2], m2
5665    cmp                eobd, 107
5666    jge .eob107
; 43 <= eob < 107: two pass-1 blocks; arrange registers for .main_fast
5667    mova                m11, m3
5668    mova                 m2, m0
5669    mova                 m3, m1
5670    mova                 m0, [r6-32*4]
5671    mova                 m1, [r6-32*3]
5672    pxor                 m4, m4
5673.pass1_end_fast:
5674    vpbroadcastd        m10, [pw_2048]
5675    lea                  r6, [deint_shuf+128]
; m4 is zero on both paths reaching here
5676    REPX       {mova x, m4}, m5, m6, m7
5677    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
5678    jmp .end
5679.eob107:
5680    mova         [rsp+32*3], m3
5681    mova          [r6-32*2], m0
5682    mova          [r6-32*1], m1
5683    call .pass1_main
5684    cmp                eobd, 171
5685    jge .eob171
; 107 <= eob < 171: three pass-1 blocks, the fourth is treated as zero
5686    pshufd              m12, m2, q1032
5687    pshufd              m13, m3, q1032
5688    mova                 m4, m0
5689    mova                 m5, m1
5690    pxor                 m6, m6
5691    REPX       {mova x, m6}, m7, m14, m15
5692    jmp .pass1_end
5693.eob171:
5694    mova          [r6+32*0], m0
5695    mova          [r6+32*1], m1
5696    mova          [r6+32*2], m2
5697    mova          [r6+32*3], m3
5698    call .pass1_main
5699    pshufd              m12, [r6+32*2], q1032 ; out19 out17
5700    pshufd              m13, [r6+32*3], q1032 ; out23 out21
5701    mova                 m4, [r6+32*0]        ; out16 out18
5702    mova                 m5, [r6+32*1]        ; out20 out22
5703    pshufd              m14, m2, q1032        ; out27 out25
5704    pshufd              m15, m3, q1032        ; out31 out29
5705    mova                 m6, m0               ; out24 out26
5706    mova                 m7, m1               ; out28 out30
5707.pass1_end:
5708    mova                 m0, [r6-32*4]        ; out0  out2
5709    mova                 m1, [r6-32*3]        ; out4  out6
5710    mova                 m2, [r6-32*2]        ; out8  out10
5711    mova                 m3, [r6-32*1]        ; out12 out14
5712    lea                  r6, [deint_shuf+128]
5713    mova                m11, [rsp+32*3]       ; out13 out15
5714    vpbroadcastd        m10, [pw_2048]
5715    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
; Output stage: each register pair is lane-permuted, multiplied with pw_2048
; via pmulhrsw and handed to the 8x4 writer. m8-m11 are spilled to the stack
; first because m12 is reused for the pw_2048 constant.
5716.end: ; [rsp+0*32] = m12
5717    vpbroadcastd        m12, [pw_2048]
5718    mov                  cq, r4
5719    mova         [rsp+32*1], m8
5720    mova         [rsp+32*2], m9
5721    mova         [rsp+32*3], m10
5722    mova         [rsp+32*4], m11
5723    vpermq               m0, m0, q3120
5724    vpermq               m1, m1, q2031
5725    pmulhrsw             m0, m12
5726    pmulhrsw             m1, m12
5727    call m(idct_8x8_internal_10bpc).write_8x4_start
5728    vpermq               m0, m2, q3120
5729    vpermq               m1, m3, q2031
5730    pmulhrsw             m0, m12
5731    pmulhrsw             m1, m12
5732    call m(idct_8x8_internal_10bpc).write_8x4
5733    vpermq               m0, m4, q3120
5734    vpermq               m1, m5, q2031
5735    pmulhrsw             m0, m12
5736    pmulhrsw             m1, m12
5737    call m(idct_8x8_internal_10bpc).write_8x4
5738    vpermq               m0, m6, q3120
5739    vpermq               m1, m7, q2031
5740    pmulhrsw             m0, m12
5741    pmulhrsw             m1, m12
5742    call m(idct_8x8_internal_10bpc).write_8x4
5743    vpermq               m0, [rsp+32*1], q3120
5744    vpermq               m1, [rsp+32*2], q2031
5745    pmulhrsw             m0, m12
5746    pmulhrsw             m1, m12
5747    call m(idct_8x8_internal_10bpc).write_8x4
5748    vpermq               m0, [rsp+32*3], q3120
5749    vpermq               m1, [rsp+32*4], q2031
5750    pmulhrsw             m0, m12
5751    pmulhrsw             m1, m12
5752    call m(idct_8x8_internal_10bpc).write_8x4
5753    vpermq               m0, [rsp+32*0], q3120
5754    vpermq               m1, m13, q2031
5755    pmulhrsw             m0, m12
5756    pmulhrsw             m1, m12
5757    call m(idct_8x8_internal_10bpc).write_8x4
5758    vpermq               m0, m14, q3120
5759    vpermq               m1, m15, q2031
5760    pmulhrsw             m0, m12
5761    pmulhrsw             m1, m12
5762    call m(idct_8x8_internal_10bpc).write_8x4
5763    RET
; DC-only path: r6d = (coef*181 + 640) >> 10, the coefficient is cleared, and
; the shared dconly3 tail finishes the job.
5764.dconly:
5765    imul                r6d, [cq], 181
5766    vpbroadcastd         m2, [dconly_10bpc]
5767    mov                [cq], eobd ; 0
; r3 is the eob argument register (zero on this path), so r3d becomes 32;
; presumably the row count expected by dconly3 - confirm against that routine
5768    or                  r3d, 32
5769    add                 r6d, 640
5770    sar                 r6d, 10
5771    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
5772ALIGN function_align
; .pass1_main_part1: first-pass 8-point IDCT on one 8x8 group of dwords.
; Loads eight 32-byte rows at a 128-byte stride from cq, calls
; idct_8x8_internal_10bpc.main, then forms the final butterflies with a
; rounding constant derived from m11 (m11 >> 10) and a right shift by 2.
; Results: out0..out7 in m0..m7. Does not advance cq.
5773.pass1_main_part1:
5774    mova                 m0, [cq+128*0]
5775    mova                 m1, [cq+128*1]
5776    mova                 m2, [cq+128*2]
5777    mova                 m3, [cq+128*3]
5778    mova                 m4, [cq+128*4]
5779    mova                 m5, [cq+128*5]
5780    mova                 m6, [cq+128*6]
5781    mova                 m7, [cq+128*7]
5782    call m(idct_8x8_internal_10bpc).main
5783    psrld                m1, m11, 10 ; pd_2
; add the rounding term once to the register feeding each butterfly pair
5784    REPX      {paddd x, m1}, m0, m6, m5, m3
5785    paddd                m1, m6, m7  ; out1
5786    psubd                m6, m7      ; out6
5787    psubd                m7, m0, m9  ; out7
5788    paddd                m0, m9      ; out0
5789    paddd                m2, m5, m4  ; out2
5790    psubd                m5, m4      ; out5
5791    psubd                m4, m3, m8  ; out4
5792    paddd                m3, m8      ; out3
5793    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
5794    ret
5795ALIGN function_align
; .pass1_main: runs .pass1_main_part1, advances cq by 32 bytes, packs the
; eight dword rows to words, byte-shuffles them with the mask in m14 and
; interleaves them so that m0..m3 hold the row pairs named in the trailing
; comments below.
5796.pass1_main:
5797    call .pass1_main_part1
5798    add                  cq, 32
5799    packssdw             m0, m1
5800    packssdw             m2, m3
5801    packssdw             m4, m5
5802    packssdw             m6, m7
5803    pshufb               m0, m14
5804    pshufb               m2, m14
5805    pshufb               m4, m14
5806    pshufb               m6, m14
5807    punpckhdq            m3, m0, m2
5808    punpckldq            m0, m2
5809    punpckldq            m2, m4, m6
5810    punpckhdq            m4, m6
5811    vperm2i128           m1, m0, m2, 0x31 ; 4 6
5812    vinserti128          m0, xm2, 1       ; 0 2
5813    vinserti128          m2, m3, xm4, 1   ; 1 3
5814    vperm2i128           m3, m4, 0x31     ; 5 7
5815    ret
; Odd half of the 32-point IDCT, part 1. Four entry points:
;   .main_oddhalf_part1_fast_rect2 / .main_oddhalf_part1_rect2: first add m11
;       and shift right by 12 on the inputs, then fall through
;       (presumably the caller has pre-scaled the inputs - not visible here)
;   .main_oddhalf_part1_fast: only m0-m3 carry inputs (lower half zero)
;   .main_oddhalf_part1: eight inputs in m0-m7
; Expects m11 = rounding constant, m12/m13 = clip min/max, r6 = output base.
; Stores t16a, t17, t18a, t19, t28, t29a, t30, t31a to [r6-32*4 .. r6+32*3].
5816.main_oddhalf_part1_fast_rect2:
5817    REPX     {paddd x, m11}, m0, m1, m2, m3
5818    REPX     {psrad x, 12 }, m0, m1, m2, m3
5819.main_oddhalf_part1_fast: ; lower half zero
; with half the inputs zero each rotation reduces to two plain multiplies
5820    vpbroadcastd         m7, [pd_4091]
5821    vpbroadcastd         m8, [pd_201]
5822    vpbroadcastd         m6, [pd_m1380]
5823    vpbroadcastd         m9, [pd_3857]
5824    vpbroadcastd         m5, [pd_3703]
5825    vpbroadcastd        m10, [pd_1751]
5826    vpbroadcastd         m4, [pd_m2751]
5827    vpbroadcastd        m15, [pd_3035]
5828    pmulld               m7, m0
5829    pmulld               m0, m8
5830    pmulld               m6, m1
5831    pmulld               m1, m9
5832    pmulld               m5, m2
5833    pmulld               m2, m10
5834    pmulld               m4, m3
5835    pmulld               m3, m15
5836    jmp .main_oddhalf_part1_fast2
5837.main_oddhalf_part1_rect2:
5838    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5839    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
5840.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
; rounding argument is "_" here: round and shift are applied jointly at
; .main_oddhalf_part1_fast2, which both entry paths share
5841    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  201, 4091 ; t16a, t31a
5842    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
5843    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
5844    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
5845.main_oddhalf_part1_fast2:
5846    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
5847    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
5848    psubd                m8, m0, m4 ; t17
5849    paddd                m0, m4     ; t16
5850    psubd                m4, m6, m2 ; t18
5851    paddd                m6, m2     ; t19
5852    psubd                m2, m1, m5 ; t29
5853    paddd                m1, m5     ; t28
5854    psubd                m5, m7, m3 ; t30
5855    paddd                m7, m3     ; t31
; clamp intermediates to the [m12, m13] range after each butterfly stage
5856    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5857    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
5858    vpbroadcastd        m15, [pd_4017]
5859    vpbroadcastd        m10, [pd_799]
5860    ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
5861    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a
5862    psubd                m3, m0, m6 ; t19a
5863    paddd                m0, m6     ; t16a
5864    psubd                m6, m7, m1 ; t28a
5865    paddd                m7, m1     ; t31a
5866    psubd                m1, m5, m4 ; t18
5867    paddd                m5, m4     ; t17
5868    psubd                m4, m8, m2 ; t29
5869    paddd                m8, m2     ; t30
5870    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5871    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5872    vpbroadcastd        m15, [pd_3784]
5873    vpbroadcastd        m10, [pd_1567]
5874    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
5875    ITX_MULSUB_2D         6, 3, 2, 9, _, 11, 10, 15 ; t19,  t28
; store the eight partial results for .main_oddhalf_part2 to pick up
5876    mova          [r6-32*4], m0
5877    mova          [r6-32*3], m5
5878    mova          [r6-32*2], m4
5879    mova          [r6-32*1], m6
5880    mova          [r6+32*0], m3
5881    mova          [r6+32*1], m1
5882    mova          [r6+32*2], m8
5883    mova          [r6+32*3], m7
5884    ret
; Odd half of the 32-point IDCT, part 2. Entry points mirror part 1
; (_fast_rect2, _fast, _rect2, plain). Computes t20..t27 from the inputs in
; m0-m7 (m0-m3 for the fast variants) and merges them with the eight values
; that part 1 stored at [r6-32*4 .. r6+32*3].
; Expects m11 = rounding constant, m12/m13 = clip min/max, m14 = multiplier
; used for the final rotations (callers visible in this file load pd_2896).
; On return: r4 = incoming r6 (t16..t23 stored around it), r5 = r4 + 32*8
; (t24..t31 stored around it), r6 = r4 + 32*16.
5885.main_oddhalf_part2_fast_rect2:
5886    REPX     {paddd x, m11}, m0, m1, m2, m3
5887    REPX     {psrad x, 12 }, m0, m1, m2, m3
5888.main_oddhalf_part2_fast: ; lower half zero
5889    vpbroadcastd         m7, [pd_m601]
5890    vpbroadcastd         m8, [pd_4052]
5891    vpbroadcastd         m6, [pd_3973]
5892    vpbroadcastd         m9, [pd_995]
5893    vpbroadcastd         m5, [pd_m2106]
5894    vpbroadcastd        m10, [pd_3513]
5895    vpbroadcastd         m4, [pd_3290]
5896    vpbroadcastd        m15, [pd_2440]
5897    pmulld               m7, m0
5898    pmulld               m0, m8
5899    pmulld               m6, m1
5900    pmulld               m1, m9
5901    pmulld               m5, m2
5902    pmulld               m2, m10
5903    pmulld               m4, m3
5904    pmulld               m3, m15
5905    jmp .main_oddhalf_part2_fast2
5906.main_oddhalf_part2_rect2:
5907    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5908    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
5909.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
5910    ITX_MULSUB_2D         7, 0, 8, 9, 10, _, 4052,  601 ; t23a, t24a
5911    ITX_MULSUB_2D         1, 6, 8, 9, 10, _,  995, 3973 ; t20a, t27a
5912    ITX_MULSUB_2D         5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
5913    ITX_MULSUB_2D         3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
5914.main_oddhalf_part2_fast2:
; shared round and shift for both the fast and the full entry paths
5915    REPX     {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
5916    REPX     {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
5917    psubd                m8, m0, m4 ; t25
5918    paddd                m0, m4     ; t24
5919    psubd                m4, m6, m2 ; t26
5920    paddd                m6, m2     ; t27
5921    psubd                m2, m1, m5 ; t21
5922    paddd                m1, m5     ; t20
5923    psubd                m5, m7, m3 ; t22
5924    paddd                m7, m3     ; t23
5925    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5926    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
5927    vpbroadcastd        m15, [pd_2276]
5928    vpbroadcastd        m10, [pd_3406]
5929    ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
5930    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a
5931    psubd                m3, m0, m6 ; t27a
5932    paddd                m0, m6     ; t24a
5933    psubd                m6, m7, m1 ; t20a
5934    paddd                m7, m1     ; t23a
5935    psubd                m1, m5, m4 ; t21
5936    paddd                m5, m4     ; t22
5937    psubd                m4, m8, m2 ; t26
5938    paddd                m8, m2     ; t25
5939    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5940    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5941    vpbroadcastd        m15, [pd_3784]
5942    vpbroadcastd        m10, [pd_1567]
5943    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a
5944    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 2 ; t27,  t20
; merge with the part-1 results, two stored rows at a time
5945    mova                 m9, [r6-32*4] ; t16a
5946    mova                m10, [r6-32*3] ; t17
5947    psubd                m2, m9, m7    ; t23
5948    paddd                m9, m7        ; t16
5949    psubd                m7, m10, m5   ; t22a
5950    paddd               m10, m5        ; t17a
5951    REPX    {pmaxsd x, m12}, m9, m10, m2, m7
5952    REPX    {pminsd x, m13}, m9, m10, m2, m7
5953    mova          [r6-32*4], m9
5954    mova          [r6-32*3], m10
5955    mova                 m9, [r6-32*2] ; t18a
5956    mova                m10, [r6-32*1] ; t19
5957    psubd                m5, m9, m1    ; t21
5958    paddd                m9, m1        ; t18
5959    psubd                m1, m10, m6   ; t20a
5960    paddd               m10, m6        ; t19a
5961    REPX    {pmaxsd x, m12}, m9, m10, m5, m1
5962    REPX    {pminsd x, m13}, m9, m10, m5, m1
5963    mova          [r6-32*2], m9
5964    mova          [r6-32*1], m10
5965    mova                 m9, [r6+32*0] ; t28
5966    mova                m10, [r6+32*1] ; t29a
5967    psubd                m6, m9, m3    ; t27a
5968    paddd                m9, m3        ; t28a
5969    psubd                m3, m10, m4   ; t26
5970    paddd               m10, m4        ; t29
5971    REPX    {pmaxsd x, m12}, m9, m10, m6, m3
5972    REPX    {pminsd x, m13}, m9, m10, m6, m3
; rotation by the m14 multiplier: multiply all four terms, add the rounding
; to one operand of each pair, then sum/difference and shift by 12
5973    REPX    {pmulld x, m14}, m6, m3, m1, m5
5974    paddd                m6, m11
5975    paddd                m3, m11
5976    psubd                m4, m6, m1    ; t20
5977    paddd                m6, m1        ; t27
5978    psubd                m1, m3, m5    ; t21a
5979    paddd                m3, m5        ; t26a
5980    REPX    {psrad  x, 12 }, m4, m1, m3, m6
5981    mova          [r6+32*0], m4
5982    mova          [r6+32*1], m1
5983    mova                 m4, [r6+32*2] ; t30
5984    mova                 m1, [r6+32*3] ; t31a
5985    psubd                m5, m4, m8    ; t25a
5986    paddd                m4, m8        ; t30a
5987    psubd                m8, m1, m0    ; t24
5988    paddd                m1, m0        ; t31
5989    REPX    {pmaxsd x, m12}, m8, m5, m4, m1
5990    REPX    {pminsd x, m13}, m8, m5, m4, m1
5991    REPX    {pmulld x, m14}, m5, m8, m7, m2
5992    paddd                m5, m11
5993    paddd                m8, m11
5994    psubd                m0, m5, m7    ; t22
5995    paddd                m5, m7        ; t25
5996    psubd                m7, m8, m2    ; t23a
5997    paddd                m2, m8        ; t24a
5998    REPX    {psrad  x, 12 }, m0, m7, m2, m5
5999    mova          [r6+32*2], m0
6000    mova          [r6+32*3], m7
; r4 = base of the t16..t23 block; the next 8 rows receive t24..t31
6001    mov                  r4, r6
6002    add                  r6, 32*8
6003    mova          [r6-32*4], m2
6004    mova          [r6-32*3], m5
6005    mova          [r6-32*2], m3
6006    mova          [r6-32*1], m6
6007    mova          [r6+32*0], m9
6008    mova          [r6+32*1], m10
6009    mova          [r6+32*2], m4
6010    mova          [r6+32*3], m1
; r5 = base of the t24..t31 block; r6 moves on to the following 8 rows
6011    mov                  r5, r6
6012    add                  r6, 32*8
6013    ret
6014ALIGN function_align
; .main_end: final 32-point butterflies via IDCT32_END (shift 2, results
; packed to words), using the rows addressed through r4, r5 and r6.
; Word-interleaved pairs for rows 16..31 are stored around r5; rows 0..15
; stay in m0-m7 and fall through into .transpose.
; m11 is shifted right by 10 on entry (pd_2 per the existing comment), so the
; caller's m11 is modified.
6015.main_end:
6016    psrld               m11, 10 ; pd_2
6017    IDCT32_END            0, 15, 8, 9, 10, 2
6018    IDCT32_END            1, 14, 8, 9, 10, 2
6019    punpckhwd            m8, m0, m1   ; 16 17
6020    punpcklwd            m0, m1       ;  0  1
6021    punpcklwd            m1, m14, m15 ; 14 15
6022    punpckhwd           m14, m15      ; 30 31
6023    mova          [r5+32*3], m8
6024    mova          [r5+32*2], m14
6025    IDCT32_END            2, 15, 8, 9, 10, 2
6026    IDCT32_END            3, 14, 8, 9, 10, 2
6027    punpckhwd            m8, m2, m3   ; 18 19
6028    punpcklwd            m2, m3       ;  2  3
6029    punpcklwd            m3, m14, m15 ; 12 13
6030    punpckhwd           m14, m15      ; 28 29
6031    mova          [r5+32*1], m8
6032    mova          [r5+32*0], m14
6033    IDCT32_END            4, 15, 8, 9, 10, 2
6034    IDCT32_END            5, 14, 8, 9, 10, 2
6035    punpckhwd            m8, m4, m5   ; 20 21
6036    punpcklwd            m4, m5       ;  4  5
6037    punpcklwd            m5, m14, m15 ; 10 11
6038    punpckhwd           m14, m15      ; 26 27
6039    mova          [r5-32*1], m8
6040    mova          [r5-32*2], m14
6041    IDCT32_END            6, 15, 8, 9, 10, 2
6042    IDCT32_END            7, 14, 8, 9, 10, 2
6043    punpckhwd            m8, m6, m7   ; 22 23
6044    punpcklwd            m6, m7       ;  6  7
6045    punpcklwd            m7, m14, m15 ;  8  9
6046    punpckhwd           m14, m15      ; 24 25
6047    mova          [r5-32*3], m8
6048    mova          [r5-32*4], m14
; .transpose: dword/qword unpacks followed by 128-bit lane permutes over
; m0-m7; m15 is used as scratch. Also reachable as a separate entry point.
6049.transpose:
6050    punpckhdq           m15, m3, m1
6051    punpckldq            m3, m1
6052    punpckhdq            m1, m4, m6
6053    punpckldq            m4, m6
6054    punpckhdq            m6, m0, m2
6055    punpckldq            m0, m2
6056    punpckhdq            m2, m7, m5
6057    punpckldq            m7, m5
6058    punpcklqdq           m5, m2, m15
6059    punpckhqdq           m2, m15
6060    punpckhqdq          m15, m7, m3
6061    punpcklqdq           m7, m3
6062    punpckhqdq           m3, m6, m1
6063    punpcklqdq           m6, m1
6064    punpckhqdq           m1, m0, m4
6065    punpcklqdq           m0, m4
6066    vperm2i128           m4, m0, m7, 0x31
6067    vinserti128          m0, xm7, 1
6068    vperm2i128           m7, m3, m2, 0x31
6069    vinserti128          m3, xm2, 1
6070    vinserti128          m2, m6, xm5, 1
6071    vperm2i128           m6, m5, 0x31
6072    vperm2i128           m5, m1, m15, 0x31
6073    vinserti128          m1, xm15, 1
6074    ret
6075
; inv_txfm_add_identity_identity_8x32_10bpc: identity/identity inverse
; transform, 8x32, 10 bpc. Each loop iteration packs eight dword coefficient
; rows (128-byte stride) to words, applies (x + 5) >> 3 with a saturating
; add, and lets .main_zero clear the coefficients and add the result to 8
; rows of dst. m7 holds the pixel maximum, m6 is zero.
; NOTE(review): .pass1 looks like an entry point for a variant that loads a
; different pixel maximum into m7 first - confirm against the rest of the file.
6076cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob
6077    vpbroadcastd         m7, [pixel_10bpc_max]
6078.pass1:
6079    vpbroadcastd         m5, [pw_5]
6080    pxor                 m6, m6
; round eob up to the loop bound using a byte add; if the low byte overflows
; (carry set), the original eob saved in r6d is restored instead
6081    mov                 r6d, eobd
6082    add                eobb, 21
6083    cmovc              eobd, r6d ; 43, 107, 171 -> 64, 128, 192
; row offsets used by .main: r6 = stride*3, r5 = stride*5, r4 = stride*7
6084    lea                  r6, [strideq*3]
6085    lea                  r5, [strideq*5]
6086    lea                  r4, [strideq+r6*2] ; strideq*7
6087.loop:
6088    mova                 m0, [cq+128*0]
6089    packssdw             m0, [cq+128*1]
6090    mova                 m1, [cq+128*2]
6091    packssdw             m1, [cq+128*3]
6092    mova                 m2, [cq+128*4]
6093    packssdw             m2, [cq+128*5]
6094    mova                 m3, [cq+128*6]
6095    packssdw             m3, [cq+128*7]
6096    REPX     {paddsw x, m5}, m0, m1, m2, m3
6097    REPX     {psraw  x, 3 }, m0, m1, m2, m3
6098    call .main_zero
; next 32-byte coefficient column group, next 8 destination rows
6099    add                  cq, 32
6100    lea                dstq, [dstq+strideq*8]
; one iteration per 64 units of the adjusted eob
6101    sub                eobd, 64
6102    jge .loop
6103    RET
6104ALIGN function_align
; .main_zero: stores m6 to the eight coefficient rows just consumed (m6 is
; zero in the caller visible in this file), then falls through to .main.
; .main: transposes the word data in m0-m3, adds it to eight rows of dst,
; clamps to [m6, m7] and stores the rows back.
; Expects r6 = stride*3, r5 = stride*5, r4 = stride*7. Clobbers m4.
6105.main_zero:
6106    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
6107.main:
6108    punpckhwd            m4, m0, m1
6109    punpcklwd            m0, m1
6110    punpckhwd            m1, m2, m3
6111    punpcklwd            m2, m3
6112    punpckhwd            m3, m0, m4
6113    punpcklwd            m0, m4
6114    punpckhwd            m4, m2, m1
6115    punpcklwd            m2, m1
6116    punpckhqdq           m1, m0, m2
6117    punpcklqdq           m0, m2
6118    punpcklqdq           m2, m3, m4
6119    punpckhqdq           m3, m4
; each ymm pairs dst row n (low lane) with dst row n+4 (high lane)
6120    mova                xm4, [dstq+strideq*0]
6121    vinserti128          m4, [dstq+strideq*4], 1
6122    paddw                m0, m4
6123    mova                xm4, [dstq+strideq*1]
6124    vinserti128          m4, [dstq+r5       ], 1
6125    paddw                m1, m4
6126    mova                xm4, [dstq+strideq*2]
6127    vinserti128          m4, [dstq+r6*2     ], 1
6128    paddw                m2, m4
6129    mova                xm4, [dstq+r6       ]
6130    vinserti128          m4, [dstq+r4       ], 1
6131    paddw                m3, m4
6132    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
6133    REPX     {pminsw x, m7}, m0, m1, m2, m3
6134    mova         [dstq+strideq*0], xm0
6135    vextracti128 [dstq+strideq*4], m0, 1
6136    mova         [dstq+strideq*1], xm1
6137    vextracti128 [dstq+r5       ], m1, 1
6138    mova         [dstq+strideq*2], xm2
6139    vextracti128 [dstq+r6*2     ], m2, 1
6140    mova         [dstq+r6       ], xm3
6141    vextracti128 [dstq+r4       ], m3, 1
6142    ret
6143
; inv_txfm_add_dct_dct_8x32_12bpc: DCT/DCT inverse transform, 8x32, 12 bpc.
; eob == 0 takes the .dconly shortcut. Pass 1 runs .pass1_main one to four
; times (eob thresholds 43, 107, 171) with 20-bit clip bounds; each call
; rewrites its coefficients in place and advances cq by 32 bytes.
; Pass 2 reloads the coefficients with 18-bit clip bounds and reuses the
; 10bpc 8x32 odd-half routines plus the 8x16 and 8x8 IDCT helpers.
; .pass2_fast handles the case where only the first 64 bytes of each
; 128-byte row were produced by pass 1.
6144cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob
6145    test               eobd, eobd
6146    jz .dconly
6147    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
6148%undef cmp
6149    vpbroadcastd        m11, [pd_2048]
6150    vpbroadcastd        m12, [clip_20b_min]
6151    vpbroadcastd        m13, [clip_20b_max]
; r4 keeps the original coefficient pointer for pass 2
6152    mov                  r4, cq
6153    lea                  r6, [rsp+32*4]
6154    call .pass1_main
6155    cmp                eobd, 43
6156    jge .eob43
6157    jmp .pass2_fast
6158.eob43:
6159    call .pass1_main
6160    cmp                eobd, 107
6161    jge .eob107
6162.pass2_fast:
6163    mov                  cq, r4
6164    vpbroadcastd        m12, [clip_18b_min]
6165    vpbroadcastd        m13, [clip_18b_max]
; pmaxsd with a memory operand loads and applies the lower clip in one step
6166    pmaxsd               m0, m12, [cq+128*1+ 0]
6167    pmaxsd               m1, m12, [cq+128*7+ 0]
6168    pmaxsd               m2, m12, [cq+128*1+32]
6169    pmaxsd               m3, m12, [cq+128*7+32]
6170    REPX    {pminsd x, m13}, m0, m1, m2, m3
6171    vpbroadcastd        m14, [pd_2896]
6172    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
6173    pmaxsd               m0, m12, [cq+128*3+ 0]
6174    pmaxsd               m1, m12, [cq+128*5+ 0]
6175    pmaxsd               m2, m12, [cq+128*3+32]
6176    pmaxsd               m3, m12, [cq+128*5+32]
6177    REPX    {pminsd x, m13}, m0, m1, m2, m3
6178    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
6179    pmaxsd               m0, m12, [cq+128*2+ 0]
6180    pmaxsd               m1, m12, [cq+128*6+ 0]
6181    pmaxsd               m2, m12, [cq+128*2+32]
6182    pmaxsd               m3, m12, [cq+128*6+32]
6183    REPX    {pminsd x, m13}, m0, m1, m2, m3
6184    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
6185    pmaxsd               m0, m12, [cq+128*0+ 0]
6186    pmaxsd               m1, m12, [cq+128*4+ 0]
6187    pmaxsd               m2, m12, [cq+128*0+32]
6188    pmaxsd               m3, m12, [cq+128*4+32]
6189    REPX    {pminsd x, m13}, m0, m1, m2, m3
; upper four inputs of the 8-point IDCT are zero on the fast path
6190    pxor                 m4, m4
6191    REPX       {mova x, m4}, m5, m6, m7
6192    call m(idct_8x8_internal_10bpc).main
6193    call m(idct_8x16_internal_10bpc).main_evenhalf
6194    jmp .pass2_end
6195.eob107:
6196    call .pass1_main
6197    cmp                eobd, 171
6198    jge .eob171
6199    jmp .pass2
6200.eob171:
6201    call .pass1_main
6202.pass2:
6203    mov                  cq, r4
6204    vpbroadcastd        m12, [clip_18b_min]
6205    vpbroadcastd        m13, [clip_18b_max]
6206    pmaxsd               m0, m12, [cq+128*1+ 0]
6207    pmaxsd               m1, m12, [cq+128*7+ 0]
6208    pmaxsd               m2, m12, [cq+128*1+32]
6209    pmaxsd               m3, m12, [cq+128*7+32]
6210    pmaxsd               m4, m12, [cq+128*1+64]
6211    pmaxsd               m5, m12, [cq+128*7+64]
6212    pmaxsd               m6, m12, [cq+128*1+96]
6213    pmaxsd               m7, m12, [cq+128*7+96]
6214    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
6215    vpbroadcastd        m14, [pd_2896]
6216    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
6217    pmaxsd               m0, m12, [cq+128*3+ 0]
6218    pmaxsd               m1, m12, [cq+128*5+ 0]
6219    pmaxsd               m2, m12, [cq+128*3+32]
6220    pmaxsd               m3, m12, [cq+128*5+32]
6221    pmaxsd               m4, m12, [cq+128*3+64]
6222    pmaxsd               m5, m12, [cq+128*5+64]
6223    pmaxsd               m6, m12, [cq+128*3+96]
6224    pmaxsd               m7, m12, [cq+128*5+96]
6225    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
6226    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
6227    pmaxsd               m0, m12, [cq+128*2+ 0]
6228    pmaxsd               m1, m12, [cq+128*6+ 0]
6229    pmaxsd               m2, m12, [cq+128*2+32]
6230    pmaxsd               m3, m12, [cq+128*6+32]
6231    pmaxsd               m4, m12, [cq+128*2+64]
6232    pmaxsd               m5, m12, [cq+128*6+64]
6233    pmaxsd               m6, m12, [cq+128*2+96]
6234    pmaxsd               m7, m12, [cq+128*6+96]
6235    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
6236    call m(idct_8x16_internal_10bpc).main_oddhalf
6237    pmaxsd               m0, m12, [cq+128*0+ 0]
6238    pmaxsd               m1, m12, [cq+128*4+ 0]
6239    pmaxsd               m2, m12, [cq+128*0+32]
6240    pmaxsd               m3, m12, [cq+128*4+32]
6241    pmaxsd               m4, m12, [cq+128*0+64]
6242    pmaxsd               m5, m12, [cq+128*4+64]
6243    pmaxsd               m6, m12, [cq+128*0+96]
6244    pmaxsd               m7, m12, [cq+128*4+96]
6245    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
6246    call m(idct_8x8_internal_10bpc).main
6247    call m(idct_8x16_internal_10bpc).main_evenhalf
; Final butterflies with shift 4; m11 becomes the matching rounding constant.
; Register pairs are qword-interleaved; rows 16..31 are stored around r5,
; rows 0..15 stay in registers, with m15 taking over rows 14/15 from m1.
6248.pass2_end:
6249    psrld               m11, 8 ; pd_8
6250    IDCT32_END            0, 15, 8, 9, 10, 4
6251    IDCT32_END            1, 14, 8, 9, 10, 4
6252    punpckhqdq           m8, m0, m1   ; 16 17 (interleaved)
6253    punpcklqdq           m0, m1       ;  0  1 (interleaved)
6254    punpcklqdq           m1, m14, m15 ; 14 15 (interleaved)
6255    punpckhqdq          m14, m15      ; 30 31 (interleaved)
6256    mova          [r5+32*3], m8
6257    mova          [r5+32*2], m14
6258    IDCT32_END            2, 15, 8, 9, 10, 4
6259    IDCT32_END            3, 14, 8, 9, 10, 4
6260    punpckhqdq            m8, m2, m3   ; 18 19 (interleaved)
6261    punpcklqdq            m2, m3       ;  2  3 (interleaved)
6262    punpcklqdq            m3, m14, m15 ; 12 13 (interleaved)
6263    punpckhqdq           m14, m15      ; 28 29 (interleaved)
6264    mova          [r5+32*1], m8
6265    mova          [r5+32*0], m14
6266    IDCT32_END            4, 15, 8, 9, 10, 4
6267    IDCT32_END            5, 14, 8, 9, 10, 4
6268    punpckhqdq            m8, m4, m5   ; 20 21 (interleaved)
6269    punpcklqdq            m4, m5       ;  4  5 (interleaved)
6270    punpcklqdq            m5, m14, m15 ; 10 11 (interleaved)
6271    punpckhqdq           m14, m15      ; 26 27 (interleaved)
6272    mova          [r5-32*1], m8
6273    mova          [r5-32*2], m14
6274    IDCT32_END            6, 15, 8, 9, 10, 4
6275    IDCT32_END            7, 14, 8, 9, 10, 4
6276    punpckhqdq            m8, m6, m7   ; 22 23 (interleaved)
6277    punpcklqdq            m6, m7       ;  6  7 (interleaved)
6278    punpcklqdq            m7, m14, m15 ;  8  9 (interleaved)
6279    punpckhqdq           m14, m15      ; 24 25 (interleaved)
6280    mova          [r5-32*3], m8
6281    mova          [r5-32*4], m14
6282    mova                m15, m1
; Output stage: write_8x4 takes its input in m0/m1, so every call is
; preceded by two lane permutes; rows go out in ascending order
; (0-1/2-3, 4-5/6-7, 8-9/10-11, 12-13/14-15, then the stored 16..31).
6283.end:
6284    vpermq               m0, m0, q3120
6285    vpermq               m1, m2, q3120
6286    call m(idct_8x8_internal_12bpc).write_8x4_start
6287    call m(idct_8x8_internal_10bpc).write_8x4
6288    vpermq               m0, m4, q3120
6289    vpermq               m1, m6, q3120
6290    call m(idct_8x8_internal_10bpc).write_8x4
6291    vpermq               m0, m7, q3120
6292    vpermq               m1, m5, q3120
6293    call m(idct_8x8_internal_10bpc).write_8x4
6294    vpermq               m0, m3, q3120
6295    vpermq               m1, m15, q3120
6296    call m(idct_8x8_internal_10bpc).write_8x4
6297    vpermq               m0, [r5+32*3], q3120
6298    vpermq               m1, [r5+32*1], q3120
6299    call m(idct_8x8_internal_10bpc).write_8x4
6300    vpermq               m0, [r5-32*1], q3120
6301    vpermq               m1, [r5-32*3], q3120
6302    call m(idct_8x8_internal_10bpc).write_8x4
6303    vpermq               m0, [r5-32*4], q3120
6304    vpermq               m1, [r5-32*2], q3120
6305    call m(idct_8x8_internal_10bpc).write_8x4
6306    vpermq               m0, [r5+32*0], q3120
6307    vpermq               m1, [r5+32*2], q3120
6308    call m(idct_8x8_internal_10bpc).write_8x4
6309    RET
; DC-only path: same arithmetic as the 10bpc variant, (coef*181 + 640) >> 10,
; but with the 12bpc dconly constant loaded into m2.
6310.dconly:
6311    imul                r6d, [cq], 181
6312    vpbroadcastd         m2, [dconly_12bpc]
6313    mov                [cq], eobd ; 0
; r3 is the eob argument register (zero on this path), so r3d becomes 32;
; presumably the row count expected by dconly3 - confirm against that routine
6314    or                  r3d, 32
6315    add                 r6d, 640
6316    sar                 r6d, 10
6317    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3
6318ALIGN function_align
6319.pass1_main:
6320    call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1
6321    TRANSPOSE_8X8_DWORD   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15
6322    mova         [cq+128*0], m0
6323    mova         [cq+128*1], m1
6324    mova         [cq+128*2], m2
6325    mova         [cq+128*3], m3
6326    mova         [cq+128*4], m4
6327    mova         [cq+128*5], m5
6328    mova         [cq+128*6], m6
6329    mova         [cq+128*7], m7
6330    add                  cq, 32
6331    ret
6332ALIGN function_align
6333.main_end:
6334    psrld               m11, 10 ; pd_2
6335    IDCT32_END            0, 15, 8, 9, 10, 2, 0
6336    mova         [cq+32*16], m8
6337    mova         [cq+32*31], m9
6338    IDCT32_END            1, 14, 8, 9, 10, 2, 0
6339    mova         [cq+32*17], m8
6340    mova         [cq+32*30], m9
6341    mova         [cq+32*14], m14
6342    IDCT32_END            2, 14, 8, 9, 10, 2, 0
6343    mova         [cq+32*18], m8
6344    mova         [cq+32*29], m9
6345    mova         [cq+32*13], m14
6346    IDCT32_END            3, 14, 8, 9, 10, 2, 0
6347    mova         [cq+32*19], m8
6348    mova         [cq+32*28], m9
6349    mova         [cq+32*12], m14
6350    IDCT32_END            4, 14, 8, 9, 10, 2, 0
6351    mova         [cq+32*20], m8
6352    mova         [cq+32*27], m9
6353    mova         [cq+32* 0], m0
6354    mova         [cq+32* 1], m1
6355    mova         [cq+32* 2], m2
6356    IDCT32_END            5, 10, 0, 1, 2, 2, 0
6357    mova         [cq+32*21], m0
6358    mova         [cq+32*26], m1
6359    IDCT32_END            6, 9, 0, 1, 2, 2, 0
6360    mova         [cq+32*22], m0
6361    mova         [cq+32*25], m1
6362    IDCT32_END            7, 8, 0, 1, 2, 2, 0
6363    mova         [cq+32*23], m0
6364    mova         [cq+32*24], m1
6365    mova                 m0, [cq+32* 0]
6366    mova                 m1, [cq+32* 1]
6367    mova                 m2, [cq+32* 2]
6368    mova                m11, m14
6369    mova                m12, [cq+32*12]
6370    mova                m13, [cq+32*13]
6371    mova                m14, [cq+32*14]
6372    ret
6373
; 12 bpc entry point of the identity/identity 8x32 inverse transform-add.
; The only work done here is loading m7 with the broadcast pixel_12bpc_max
; constant; control then transfers (jmp, not call) to .pass1 of the 10 bpc
; function, which is defined outside this excerpt.
6374cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob
6375    vpbroadcastd         m7, [pixel_12bpc_max]
6376    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1
6377
; DCT/DCT 32x8 inverse transform-add, 10 bpc.
; Arguments (cglobal names): dst, stride, c (coefficients), eob.
;   eob == 0 -> DC-only path (.dconly/.dconly2/.dconly_loop)
;   eob != 0 -> .full, which re-declares the frame with 16 vector registers
;               and 32*24 bytes of stack.
; Entry labels used from other functions in this file:
;   .dconly  - jumped to by the 12 bpc 32x8 function
;   .dconly2 - jumped to by the 32x16 10 bpc function
;   .pass1   - called by the 12 bpc 32x8 function
6378cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob
6379    test               eobd, eobd
6380    jnz .full
; DC-only: r6d = c[0] * 181, m3 = dconly_10bpc bias, then clear c[0]
; (eobd is known to be zero here, so it doubles as the zero source).
6381    imul                r6d, [cq], 181
6382    vpbroadcastd         m3, [dconly_10bpc]
6383    mov                [cq], eobd ; 0
; r3d becomes the row counter consumed by .dconly_loop. Assuming r3 is the
; eob argument register (x86inc positional naming), it is zero on this path,
; so the OR yields exactly 8.
6384    or                  r3d, 8
6385.dconly:
; r6d = (r6d + 640) >> 10
6386    add                 r6d, 640
6387    sar                 r6d, 10
6388.dconly2:
; r6d = (r6d * 181 + 2176) >> 12, then bias by m3 and broadcast as words.
6389    imul                r6d, 181
6390    add                 r6d, 2176
6391    sar                 r6d, 12
6392    movd                xm0, r6d
6393    paddsw              xm0, xm3
6394    vpbroadcastw         m0, xm0
6395.dconly_loop:
; One row per iteration: two 32-byte vectors of 16-bit pixels.
; NOTE(review): the signed-saturating add followed by the unsigned-saturating
; subtract of the same m3 bias presumably clamps the result to the valid
; pixel range; the value of dconly_10bpc is not visible here - confirm.
6396    paddsw               m1, m0, [dstq+32*0]
6397    paddsw               m2, m0, [dstq+32*1]
6398    psubusw              m1, m3
6399    psubusw              m2, m3
6400    mova        [dstq+32*0], m1
6401    mova        [dstq+32*1], m2
6402    add                dstq, strideq
6403    dec                 r3d
6404    jg .dconly_loop
6405    RET
6406.full:
6407    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
; r6 points into the stack scratch area; m12/m13 hold the 18-bit clip
; bounds. Both are set up for the helpers called from .pass1 / main_end.
6408    lea                  r6, [rsp+32*4]
6409    vpbroadcastd        m12, [clip_18b_min]
6410    vpbroadcastd        m13, [clip_18b_max]
6411    call .pass1
6412    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
; Second pass, left 16 columns: r6 is repointed at a constant table for the
; 8 bpc helper, m11 = pw_2048 scale, r4 keeps the original dst pointer.
6413    lea                  r6, [deint_shuf+128]
6414    vpbroadcastd        m11, [pw_2048]
6415    mov                  r4, dstq
6416    call .pass2
; Reload the remaining first-pass outputs for the right 16 columns.
; r5 is not assigned in this function; presumably it is left pointing at
; the scratch area by the helpers called above - TODO confirm.
6417    mova                 m0, [r5+32*3] ; 16 17
6418    mova                 m1, [r5+32*2] ; 30 31
6419    mova                 m2, [r5+32*1] ; 18 19
6420    mova                 m3, [r5+32*0] ; 28 29
6421    mova                 m4, [r5-32*1] ; 20 21
6422    mova                 m5, [r5-32*2] ; 26 27
6423    mova                 m6, [r5-32*3] ; 22 23
6424    mova                 m7, [r5-32*4] ; 24 25
6425    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
; dst advances by 32 bytes from the saved pointer for the right half.
6426    lea                dstq, [r4+32]
6427    call .pass2
6428    RET
6429ALIGN function_align
6430.pass2:
; Runs the 8 bpc 16x8 IDCT core on m0-m7, scales each result by m11 with
; pmulhrsw, and writes it through the 16x8 10 bpc write helpers. The final
; jmp is a tail call, so the helper's ret returns to .pass2's caller.
6431    call m(idct_16x8_internal_8bpc).main
6432    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
6433    call m(idct_16x8_internal_10bpc).write_16x4_start
6434    pmulhrsw             m0, m11, m4
6435    pmulhrsw             m1, m11, m5
6436    pmulhrsw             m2, m11, m6
6437    pmulhrsw             m3, m11, m7
6438    jmp m(idct_16x8_internal_10bpc).write_16x4_zero
6439ALIGN function_align
6440.pass1:
; First pass. Coefficient rows (32 bytes apart) are fed to the helpers in
; four groups of eight; m11 = pd_2048 and m14 = pd_2896 are loaded for them.
; Expects r6, m12 and m13 to have been set by the caller (see .full).
; Group 1: rows 1, 7, 9, 15, 17, 23, 25, 31
6441    mova                 m0, [cq+32* 1]
6442    mova                 m1, [cq+32* 7]
6443    mova                 m2, [cq+32* 9]
6444    mova                 m3, [cq+32*15]
6445    mova                 m4, [cq+32*17]
6446    mova                 m5, [cq+32*23]
6447    mova                 m6, [cq+32*25]
6448    mova                 m7, [cq+32*31]
6449    vpbroadcastd        m11, [pd_2048]
6450    vpbroadcastd        m14, [pd_2896]
6451    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
; Group 2: rows 3, 5, 11, 13, 19, 21, 27, 29
6452    mova                 m0, [cq+32* 3]
6453    mova                 m1, [cq+32* 5]
6454    mova                 m2, [cq+32*11]
6455    mova                 m3, [cq+32*13]
6456    mova                 m4, [cq+32*19]
6457    mova                 m5, [cq+32*21]
6458    mova                 m6, [cq+32*27]
6459    mova                 m7, [cq+32*29]
6460    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
; Group 3: rows 2, 6, 10, ..., 30
6461    mova                 m0, [cq+32* 2]
6462    mova                 m1, [cq+32* 6]
6463    mova                 m2, [cq+32*10]
6464    mova                 m3, [cq+32*14]
6465    mova                 m4, [cq+32*18]
6466    mova                 m5, [cq+32*22]
6467    mova                 m6, [cq+32*26]
6468    mova                 m7, [cq+32*30]
6469    call m(idct_8x16_internal_10bpc).main_oddhalf
; Group 4: rows 0, 4, 8, ..., 28
6470    mova                 m0, [cq+32* 0]
6471    mova                 m1, [cq+32* 4]
6472    mova                 m2, [cq+32* 8]
6473    mova                 m3, [cq+32*12]
6474    mova                 m4, [cq+32*16]
6475    mova                 m5, [cq+32*20]
6476    mova                 m6, [cq+32*24]
6477    mova                 m7, [cq+32*28]
6478    call m(idct_8x8_internal_10bpc).main
6479    call m(idct_8x16_internal_10bpc).main_evenhalf
6480    ret
6481
; Identity/identity 32x8 inverse transform-add, 10 bpc.
; Arguments (cglobal names): dst, stride, c (coefficients), eob.
; m7 is loaded with the pixel ceiling; everything from .pass1 onwards is
; shared with the 12 bpc variant, which jumps to .pass1 with its own m7.
6482cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob
6483    vpbroadcastd         m7, [pixel_10bpc_max]
6484.pass1:
; m5 = pw_4096 multiplier for pmulhrsw, m6 = zero (used to clear coefs).
6485    vpbroadcastd         m5, [pw_4096]
6486    pxor                 m6, m6
; Bias eob by 21 using an 8-bit add; if that add carries out of the low
; byte, cmovc restores the unbiased value. The result is the loop budget:
; .loop runs once and then repeats for as long as subtracting 64 leaves it
; non-negative.
6487    mov                 r6d, eobd
6488    add                eobb, 21
6489    cmovc              eobd, r6d
; r6 = stride*3, r5 = stride*5, r4 = stride*7. These are not read in this
; block; presumably they are consumed by the 8x32 .main helper - confirm.
6490    lea                  r6, [strideq*3]
6491    lea                  r5, [strideq*5]
6492    lea                  r4, [strideq+r6*2] ; strideq*7
6493.loop:
; Each iteration consumes 32*8 bytes of dword coefficients: pairs of
; 32-byte vectors are packed to words with signed saturation into m0-m3,
; the source memory is zeroed, and the packed values are scaled by m5.
6494    mova                 m0, [cq+32*0]
6495    packssdw             m0, [cq+32*1]
6496    mova                 m1, [cq+32*2]
6497    packssdw             m1, [cq+32*3]
6498    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
6499    add                  cq, 32*8
6500    mova                 m2, [cq-32*4]
6501    packssdw             m2, [cq-32*3]
6502    mova                 m3, [cq-32*2]
6503    packssdw             m3, [cq-32*1]
6504    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
6505    REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
6506    call m(inv_txfm_add_identity_identity_8x32_10bpc).main
; Advance dst by 16 bytes for the next iteration.
6507    add                dstq, 16
6508    sub                eobd, 64
6509    jge .loop
6510    RET
6511
; DCT/DCT 32x8 inverse transform-add, 12 bpc.
; Arguments (cglobal names): dst, stride, c (coefficients), eob.
;   eob == 0 -> DC-only: same setup as the 10 bpc function but with the
;               dconly_12bpc bias, then jumps into the 10 bpc .dconly code.
;   eob != 0 -> .full: reuses the 10 bpc .pass1 with 20-bit clip bounds and
;               finishes with the 12 bpc helpers.
6512cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob
6513    test               eobd, eobd
6514    jnz .full
6515    imul                r6d, [cq], 181
6516    vpbroadcastd         m3, [dconly_12bpc]
6517    mov                [cq], eobd ; 0
; Row count of 8 for the shared DC-only loop (r3d is zero before the OR,
; assuming r3 is the eob argument register).
6518    or                  r3d, 8
6519    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
6520.full:
6521    PROLOGUE              0, 7, 16, 32*24, dst, stride, c, eob
6522    lea                  r6, [rsp+32*4]
6523    vpbroadcastd        m12, [clip_20b_min]
6524    vpbroadcastd        m13, [clip_20b_max]
6525    call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1
6526    call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end
; r4 keeps the original dst pointer across the first pass2_main call.
6527    mov                  r4, dstq
6528    call m(idct_16x8_internal_12bpc).pass2_main
; Load the second group of 16 vectors for the right half.
; NOTE(review): the trailing comments label these as outputs 16..31 while
; the offsets are cq+32*0..15; presumably cq has been advanced by the
; preceding pass2_main call - confirm against idct_16x8_internal_12bpc.
6529    mova                 m0, [cq+32* 0] ; 16
6530    mova                 m1, [cq+32* 1] ; 17
6531    mova                 m2, [cq+32* 2] ; 18
6532    mova                 m3, [cq+32* 3] ; 19
6533    mova                 m4, [cq+32* 4] ; 20
6534    mova                 m5, [cq+32* 5] ; 21
6535    mova                 m6, [cq+32* 6] ; 22
6536    mova                 m7, [cq+32* 7] ; 23
6537    mova                 m8, [cq+32* 8] ; 24
6538    mova                 m9, [cq+32* 9] ; 25
6539    mova                m10, [cq+32*10] ; 26
6540    mova                m11, [cq+32*11] ; 27
6541    mova                m12, [cq+32*12] ; 28
6542    mova                m13, [cq+32*13] ; 29
6543    mova                m14, [cq+32*14] ; 30
6544    mova                m15, [cq+32*15] ; 31
; dst advances by 32 bytes from the saved pointer for the right half.
6545    lea                dstq, [r4+32]
6546    call m(idct_16x8_internal_12bpc).pass2_main
6547    RET
6548
; 12 bpc entry point of the identity/identity 32x8 inverse transform-add.
; Loads m7 with the broadcast pixel_12bpc_max constant and jumps to .pass1
; of the 10 bpc function, which performs all remaining work.
6549cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob
6550    vpbroadcastd         m7, [pixel_12bpc_max]
6551    jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1
6552
; IDCT32_PASS2_END: final butterfly, scale, add-to-destination and clamp for
; one pair of output rows.
;   %1 - register index holding the first input (clobbered)
;   %2 - address expression of the second input, loaded into m%4
;   %3 - register index receiving the sum (%1 + [%2])
;   %4 - register index used as a temporary for the loaded value
;   %5 - offset added to dstq for the row that receives the sum
;   %6 - offset added to r2   for the row that receives the difference
; Register expectations at expansion time:
;   m15 - pmulhrsw multiplier applied to both results
;   m7  - upper clamp bound
;   m6  - lower clamp bound; it is zeroed by the expansion whose %1 is 0,
;         so that expansion must come first and m6 must not be overwritten
;         by later expansions.
6553%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
6554    mova                m%4, [%2]
; Saturating sum into m%3 and difference into m%1.
6555    paddsw              m%3, m%1, m%4
6556    psubsw              m%1, m%4
6557%if %1 == 0
6558    pxor                 m6, m6
6559%endif
6560    pmulhrsw            m%3, m15
6561    pmulhrsw            m%1, m15
; Add the existing destination pixels, clamp to [m6, m7], store back.
6562    paddw               m%3, [dstq+%5]
6563    paddw               m%1, [r2+%6]
6564    pmaxsw              m%3, m6
6565    pmaxsw              m%1, m6
6566    pminsw              m%3, m7
6567    pminsw              m%1, m7
6568    mova          [dstq+%5], m%3
6569    mova            [r2+%6], m%1
6570%endmacro
6571
; DCT/DCT 16x32 inverse transform-add, 10 bpc.
; Arguments (cglobal names): dst, stride, c (coefficients), eob.
;   eob == 0 -> .dconly, which finishes in the 16x4 10 bpc DC-only code.
;   eob != 0 -> first pass via up to four calls to .main, the number being
;               chosen by eob thresholds (44, 151, 279), followed by a
;               second pass built from the 8 bpc 16x32/16x16 helpers and
;               .pass2_end.
; Local helpers:
;   .main      - first pass over one slice of the coefficient block
;   .pass2_end - final butterflies, scaling, clamping and stores
6572cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob
6573    test               eobd, eobd
6574    jz .dconly
6575    PROLOGUE              0, 8, 16, 32*36, dst, stride, c, eob
6576%undef cmp
; Constants expected by .main and the helpers it calls.
6577    vpbroadcastd        m11, [pd_2048]
6578    vpbroadcastd        m12, [clip_18b_min]
6579    vpbroadcastd        m13, [clip_18b_max]
6580    vpbroadcastd        m14, [pd_2896]
; Scratch pointers into the stack frame: r6 = rsp+32*16 (also the store
; cursor used by .main), r4 = r6+32*8, r5 = r6+32*16.
6581    lea                  r6, [rsp+32*16]
6582    lea                  r4, [r6+32*8]
6583    lea                  r5, [r6+32*16]
6584    call .main
6585    sub                eobd, 44
6586    jge .eob44
; eob < 44: only one slice was processed. Combine the register results of
; .main into m0-m3, zero m4-m7 and the next scratch slot, take the fast path.
6587    vperm2i128           m2, m0, m3, 0x31 ;  5
6588    vinserti128          m0, xm3, 1       ;  1
6589    vperm2i128           m3, m1, m4, 0x31 ;  7
6590    vinserti128          m1, xm4, 1       ;  3
6591    pxor                 m4, m4
6592    REPX       {mova x, m4}, m5, m6, m7
6593    REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
6594    jmp .fast
6595.dconly:
; DC-only: r6d = ((c[0] * 181 + 128) >> 8) * 181, coefficient cleared,
; r3d = 32 (eobd is zero on this path), remainder handled by the shared
; 16x4 DC-only code with the dconly_10bpc bias in m3.
6596    imul                r6d, [cq], 181
6597    vpbroadcastd         m3, [dconly_10bpc]
6598    mov                [cq], eobd ; 0
6599    or                  r3d, 32
6600    add                 r6d, 128
6601    sar                 r6d, 8
6602    imul                r6d, 181
6603    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
6604.eob44:
; Spill the first slice's register results to [r4+16*0..7] (low lanes
; first, then high lanes) and process a second slice.
6605    mova          [r4+16*0], xm0
6606    mova          [r4+16*1], xm3
6607    mova          [r4+16*2], xm1
6608    mova          [r4+16*3], xm4
6609    vextracti128  [r4+16*4], m0, 1
6610    vextracti128  [r4+16*5], m3, 1
6611    vextracti128  [r4+16*6], m1, 1
6612    vextracti128  [r4+16*7], m4, 1
6613    call .main
6614    sub                eobd, 107
6615    jge .eob151
; 44 <= eob < 151: two slices. Second slice goes to m4-m7, first slice is
; reloaded from r4 into m0-m3.
6616    vperm2i128           m7, m1, m4, 0x31 ; 15
6617    vinserti128          m5, m1, xm4, 1   ; 11
6618    vperm2i128           m6, m0, m3, 0x31 ; 13
6619    vinserti128          m4, m0, xm3, 1   ;  9
6620    mova                 m0, [r4+32*0]
6621    mova                 m1, [r4+32*1]
6622    mova                 m2, [r4+32*2]
6623    mova                 m3, [r4+32*3]
6624.fast:
; Reduced odd-half computation (8 bpc helper) on m0-m7; r6 is repointed at
; a constant table for it. m8-m15 are then zeroed before the shared idct16.
6625    lea                  r6, [pw_5+128]
6626    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
6627    pxor                 m8, m8
6628    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
6629    jmp .idct16
6630.eob151:
; Spill the second slice's register results to [r4-16*8..-1] and process a
; third slice.
6631    mova          [r4-16*8], xm0
6632    mova          [r4-16*7], xm3
6633    mova          [r4-16*6], xm1
6634    mova          [r4-16*5], xm4
6635    vextracti128  [r4-16*4], m0, 1
6636    vextracti128  [r4-16*3], m3, 1
6637    vextracti128  [r4-16*2], m1, 1
6638    vextracti128  [r4-16*1], m4, 1
6639    call .main
6640    sub                eobd, 128
6641    jge .eob279
; 151 <= eob < 279: three slices. Third slice goes to m8-m11; m12-m15 and
; the next scratch slot are zeroed.
6642    vperm2i128          m10, m0, m3, 0x31 ; 21
6643    vinserti128          m8, m0, xm3, 1   ; 17
6644    vperm2i128          m11, m1, m4, 0x31 ; 23
6645    vinserti128          m9, m1, xm4, 1   ; 19
6646    pxor                m12, m12
6647    REPX      {mova x, m12}, m13, m14, m15
6648    REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
6649    jmp .full
6650.eob279:
; eob >= 279: spill the third slice to [r5+16*0..7], process the fourth
; slice into m12-m15 and reload the third into m8-m11.
6651    mova          [r5+16*0], xm0
6652    mova          [r5+16*1], xm3
6653    mova          [r5+16*2], xm1
6654    mova          [r5+16*3], xm4
6655    vextracti128  [r5+16*4], m0, 1
6656    vextracti128  [r5+16*5], m3, 1
6657    vextracti128  [r5+16*6], m1, 1
6658    vextracti128  [r5+16*7], m4, 1
6659    call .main
6660    vperm2i128          m14, m0, m3, 0x31 ; 29
6661    vinserti128         m12, m0, xm3, 1   ; 25
6662    vperm2i128          m15, m1, m4, 0x31 ; 31
6663    vinserti128         m13, m1, xm4, 1   ; 27
6664    mova                 m8, [r5+32*0]
6665    mova                 m9, [r5+32*1]
6666    mova                m10, [r5+32*2]
6667    mova                m11, [r5+32*3]
6668.full:
; Full odd-half computation: m0-m7 come from the r4 spill area, m8-m15 were
; set up by the path that jumped here.
6669    mova                 m0, [r4+32*0]
6670    mova                 m1, [r4+32*1]
6671    mova                 m2, [r4+32*2]
6672    mova                 m3, [r4+32*3]
6673    mova                 m4, [r4-32*4]
6674    mova                 m5, [r4-32*3]
6675    mova                 m6, [r4-32*2]
6676    mova                 m7, [r4-32*1]
6677    lea                  r6, [pw_5 + 128]
6678    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
; Load the idct16 inputs for m8-m15 from the scratch area around rsp+32*8.
6679    lea                  r3, [rsp+32*8]
6680    mova                 m8, [r3+32*0]
6681    mova                 m9, [r3+32*1]
6682    mova                m10, [r3+32*2]
6683    mova                m11, [r3+32*3]
6684    mova                m12, [r3-32*4]
6685    mova                m13, [r3-32*3]
6686    mova                m14, [r3-32*2]
6687    mova                m15, [r3-32*1]
6688.idct16:
; Load m0-m7 from the scratch area around rsp+32*16, pass m15 through
; [rsp] and run the 8 bpc 16x16 IDCT core.
6689    lea                  r3, [rsp+32*16]
6690    mova                 m0, [r3+32*0]
6691    mova                 m1, [r3+32*1]
6692    mova                 m2, [r3+32*2]
6693    mova                 m3, [r3+32*3]
6694    mova                 m4, [r3-32*4]
6695    mova                 m5, [r3-32*3]
6696    mova                 m6, [r3-32*2]
6697    mova                 m7, [r3-32*1]
6698    mova              [rsp], m15
6699    call m(idct_16x16_internal_8bpc).main
; r2 = dst + stride*19 and r3 = stride*3, so [r2+r3*4] addresses row 31
; while [dstq] addresses row 0; .pass2_end walks dstq down and r2 up.
6700    imul                 r2, strideq, 19
6701    lea                  r3, [strideq*3]
6702    add                  r2, dstq
6703    call .pass2_end
6704    RET
6705ALIGN function_align
6706.main:
; First pass over one slice of coefficients (rows are 128 bytes apart).
; In:  cq, r6 (scratch store cursor), m11 = pd_2048, m12/m13 = clip bounds,
;      m14 = pd_2896.
; Out: four 16-byte-split vectors stored at [r6+16*0..7], the other four
;      results left in m0, m3, m1, m4 for the caller to place; the slice's
;      coefficients zeroed; cq advanced by 32; r6 decreased by 32*4.
; Clobbers r7 and m0-m10, m15.
; Odd rows 1, 3, ..., 15 pre-multiplied by m14:
6707    pmulld               m0, m14, [cq+128* 1]
6708    pmulld               m1, m14, [cq+128* 3]
6709    pmulld               m2, m14, [cq+128* 5]
6710    pmulld               m3, m14, [cq+128* 7]
6711    pmulld               m4, m14, [cq+128* 9]
6712    pmulld               m5, m14, [cq+128*11]
6713    pmulld               m6, m14, [cq+128*13]
6714    pmulld               m7, m14, [cq+128*15]
6715    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
; Even rows 0, 2, ..., 14 pre-multiplied by m14:
6716    pmulld               m0, m14, [cq+128* 0]
6717    pmulld               m1, m14, [cq+128* 2]
6718    pmulld               m2, m14, [cq+128* 4]
6719    pmulld               m3, m14, [cq+128* 6]
6720    pmulld               m4, m14, [cq+128* 8]
6721    pmulld               m5, m14, [cq+128*10]
6722    pmulld               m6, m14, [cq+128*12]
6723    pmulld               m7, m14, [cq+128*14]
6724    call m(idct_8x8_internal_10bpc).main_rect2
6725    call m(idct_8x16_internal_10bpc).main_evenhalf
; Final butterflies against the values stored at [r6-32*4 .. r6+32*3]
; (presumably written by main_oddhalf_rect2 - confirm), with +1 rounding
; and an arithmetic shift right by 1, then packing to 16-bit words.
6726    psrld               m15, m11, 11 ; pd_1
6727    mova                 m8, [r6-32*4]
6728    mova                 m9, [r6-32*3]
6729    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
6730    psubd               m10, m0, m8 ; out15
6731    paddd                m0, m8     ; out0
6732    mova                 m8, [r6-32*2]
6733    paddd               m15, m1, m9 ; out1
6734    psubd                m1, m9     ; out14
6735    mova                 m9, [r6-32*1]
6736    REPX       {psrad x, 1}, m0, m15, m10, m1
6737    packssdw             m0, m15
6738    packssdw             m1, m10
6739    psubd               m10, m2, m8 ; out13
6740    paddd                m2, m8     ; out2
6741    mova                 m8, [r6+32*0]
6742    paddd               m15, m3, m9 ; out3
6743    psubd                m3, m9     ; out12
6744    mova                 m9, [r6+32*1]
6745    REPX       {psrad x, 1}, m2, m15, m10, m3
6746    packssdw             m2, m15
6747    packssdw             m3, m10
6748    psubd               m10, m4, m8 ; out11
6749    paddd                m4, m8     ; out4
6750    mova                 m8, [r6+32*2]
6751    paddd               m15, m5, m9 ; out5
6752    psubd                m5, m9     ; out10
6753    mova                 m9, [r6+32*3]
6754    REPX       {psrad x, 1}, m4, m10, m15, m5
6755    packssdw             m4, m15
6756    packssdw             m5, m10
6757    psubd               m10, m6, m8 ; out9
6758    paddd                m6, m8     ; out6
6759    paddd               m15, m7, m9 ; out7
6760    psubd                m7, m9     ; out8
6761    REPX       {psrad x, 1}, m6, m10, m15, m7
6762    packssdw             m6, m15
6763    packssdw             m7, m10
; Word interleave, stage 1.
6764    punpckhwd            m8, m0, m2
6765    punpcklwd            m0, m2
6766    punpckhwd            m2, m3, m1
6767    punpcklwd            m3, m1
6768    punpckhwd            m1, m4, m6
6769    punpcklwd            m4, m6
6770    punpcklwd            m6, m7, m5
6771    punpckhwd            m7, m5
; Zero the 16 coefficient rows of this slice (four rows per iteration,
; r7 = 128*13, 128*9, 128*5, 128*1), using m5 as the zero source while it
; is free, then step cq to the next slice.
6772    pxor                 m5, m5
6773    mov                 r7d, 128*13
6774.main_zero_loop:
6775    mova      [cq+r7-128*1], m5
6776    mova      [cq+r7+128*0], m5
6777    mova      [cq+r7+128*1], m5
6778    mova      [cq+r7+128*2], m5
6779    sub                 r7d, 128*4
6780    jg .main_zero_loop
6781    add                  cq, 32
; Word interleave, stage 2, followed by the qword interleave.
6782    punpcklwd            m5, m3, m2
6783    punpckhwd            m3, m2
6784    punpcklwd            m2, m4, m1
6785    punpckhwd            m4, m1
6786    punpckhwd            m1, m0, m8
6787    punpcklwd            m0, m8
6788    punpckhwd            m8, m6, m7
6789    punpcklwd            m6, m7
6790    punpcklqdq           m7, m1, m4
6791    punpckhqdq           m1, m4
6792    punpckhqdq           m4, m8, m3
6793    punpcklqdq           m8, m3
6794    punpckhqdq           m3, m6, m5
6795    punpcklqdq           m6, m5
6796    punpcklqdq           m5, m0, m2
6797    punpckhqdq           m0, m2
; Store m5-m8 lane-split (all low lanes, then all high lanes) at r6 and
; move the store cursor down; m0, m3, m1, m4 are returned in registers.
6798    mova          [r6+16*0], xm5
6799    mova          [r6+16*1], xm6
6800    mova          [r6+16*2], xm7
6801    mova          [r6+16*3], xm8
6802    vextracti128  [r6+16*4], m5, 1
6803    vextracti128  [r6+16*5], m6, 1
6804    vextracti128  [r6+16*6], m7, 1
6805    vextracti128  [r6+16*7], m8, 1
6806    sub                  r6, 32*4
6807    ret
6808ALIGN function_align
6809.pass2_end:
; Final stage of the second pass. IDCT32_PASS2_END needs m6 (zero), m7
; (pixel max) and m15 (scale), so the idct16 results living in m6, m7 and
; m15 are first spilled to the stack and later re-read into m1/m2.
; The slot [rsp+gprsize+32*1] is read but not written here; presumably it
; holds an output stored by idct_16x16_internal_8bpc.main - confirm.
; Expects r2/r3 as set up by the caller and r4/r5 pointing at the odd-half
; results.
6810    mova [rsp+gprsize+32*0], m6
6811    mova [rsp+gprsize+32*2], m7
6812    mova [rsp+gprsize+32*3], m15
6813    vpbroadcastd        m15, [pw_2048]
6814    vpbroadcastd         m7, [pixel_10bpc_max]
; Each macro call writes one row through dstq and one through r2; the first
; call (coef index 0) also zeroes m6 for the lower clamp.
6815    IDCT32_PASS2_END      0, r5+32*3, 1, 6, strideq*0, r3*4
6816    IDCT32_PASS2_END      4, r5-32*1, 0, 1, strideq*4, strideq*8
6817    IDCT32_PASS2_END      8, r4+32*3, 0, 4, strideq*8, strideq*4
6818    IDCT32_PASS2_END     12, r4-32*1, 0, 4, r3*4,      strideq*0
; Step dstq one row down and r2 one row up, then repeat.
6819    add                dstq, strideq
6820    sub                  r2, strideq
6821    mova                 m1, [rsp+gprsize+32*1]
6822    IDCT32_PASS2_END      1, r5+32*2, 0, 4, strideq*0, r3*4
6823    IDCT32_PASS2_END      5, r5-32*2, 0, 4, strideq*4, strideq*8
6824    IDCT32_PASS2_END      9, r4+32*2, 0, 4, strideq*8, strideq*4
6825    IDCT32_PASS2_END     13, r4-32*2, 0, 4, r3*4,      strideq*0
6826    add                dstq, strideq
6827    sub                  r2, strideq
; m1 = spilled m6 from above.
6828    mova                 m1, [rsp+gprsize+32*0]
6829    IDCT32_PASS2_END      2, r5+32*1, 0, 4, strideq*0, r3*4
6830    IDCT32_PASS2_END      1, r5-32*3, 0, 4, strideq*4, strideq*8
6831    IDCT32_PASS2_END     10, r4+32*1, 0, 4, strideq*8, strideq*4
6832    IDCT32_PASS2_END     14, r4-32*3, 0, 4, r3*4,      strideq*0
6833    add                dstq, strideq
6834    sub                  r2, strideq
; m1 = spilled m7, m2 = spilled m15 from above.
6835    mova                 m1, [rsp+gprsize+32*2]
6836    mova                 m2, [rsp+gprsize+32*3]
6837    IDCT32_PASS2_END      3, r5+32*0, 0, 4, strideq*0, r3*4
6838    IDCT32_PASS2_END      1, r5-32*4, 0, 4, strideq*4, strideq*8
6839    IDCT32_PASS2_END     11, r4+32*0, 0, 4, strideq*8, strideq*4
6840    IDCT32_PASS2_END      2, r4-32*4, 0, 4, r3*4,      strideq*0
6841    ret
6842
; Identity/identity 16x32 inverse transform-add, 10 bpc.
; Arguments (cglobal names): dst, stride, c (coefficients), eob.
; m7 is loaded with the pixel ceiling; everything from .pass1 onwards is
; shared with the 12 bpc variant, which jumps to .pass1 with its own m7.
; The block is handled as up to eight calls to .main. After the first call
; they go in pairs, and eob is checked against running thresholds
; (36, 143, 271, 399) to stop early.
6843cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
6844    vpbroadcastd         m7, [pixel_10bpc_max]
6845.pass1:
; Constants for .main: m8 = pw_2896x8, m9 = pw_1697x16, m11 = pw_8192,
; m10 = 2 * m11 (pw_16384), m6 = zero. r6 = stride*5 is not read in this
; block; presumably it is used by the external write helper - confirm.
6846    vpbroadcastd         m8, [pw_2896x8]
6847    vpbroadcastd         m9, [pw_1697x16]
6848    vpbroadcastd        m11, [pw_8192]
6849    lea                  r6, [strideq*5]
6850    pxor                 m6, m6
6851    paddw               m10, m11, m11 ; pw_16384
; r5 remembers the dst pointer at the start of the current band of rows.
6852    mov                  r5, dstq
6853    call .main
6854    sub                eobd, 36
6855    jl .ret
; Second tile of the band: coefficients 128*8 bytes further on, dst 16
; bytes to the right.
6856    add                  cq, 128*8
6857    lea                dstq, [r5+16]
6858    call .main
; Next band: cq nets +32 relative to the band start, dst moves down eight
; rows and r5 is updated to match.
6859    sub                  cq, 128*8-32
6860    lea                dstq, [r5+strideq*8]
6861    mov                  r5, dstq
6862    call .main
6863    sub                eobd, 107 ; eob < 143
6864    jl .ret
6865    add                  cq, 128*8
6866    lea                dstq, [r5+16]
6867    call .main
6868    sub                  cq, 128*8-32
6869    lea                dstq, [r5+strideq*8]
6870    mov                  r5, dstq
6871    call .main
6872    sub                eobd, 128 ; eob < 271
6873    jl .ret
6874    add                  cq, 128*8
6875    lea                dstq, [r5+16]
6876    call .main
6877    sub                  cq, 128*8-32
6878    lea                dstq, [r5+strideq*8]
6879    mov                  r5, dstq
6880    call .main
6881    sub                eobd, 128 ; eob < 399
6882    jl .ret
6883    add                  cq, 128*8
6884    lea                dstq, [r5+16]
6885    call .main
6886.ret:
6887    RET
6888ALIGN function_align
6889.main:
; Processes one tile: eight 32-byte rows of dword coefficients, 128 bytes
; apart, are packed pairwise to words with signed saturation, scaled by m8,
; run through the IDTX16 macro (defined outside this excerpt), scaled by
; m11, and the source rows are zeroed.
6890    mova                 m0, [cq+128*0]
6891    packssdw             m0, [cq+128*1]
6892    mova                 m1, [cq+128*2]
6893    packssdw             m1, [cq+128*3]
6894    mova                 m2, [cq+128*4]
6895    packssdw             m2, [cq+128*5]
6896    mova                 m3, [cq+128*6]
6897    packssdw             m3, [cq+128*7]
6898    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
6899    REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
6900    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
6901    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
6902.main2:
; Interleave m0-m3 (word, word, then qword unpacks) and hand the results
; to the 8x8 identity write helper in two batches; the second is a tail
; call, so the helper's ret returns to .main's caller.
6903    punpckhwd            m4, m0, m1
6904    punpcklwd            m0, m1
6905    punpckhwd            m1, m2, m3
6906    punpcklwd            m2, m3
6907    punpckhwd            m3, m0, m4
6908    punpcklwd            m0, m4
6909    punpcklwd            m4, m2, m1
6910    punpckhwd            m2, m1
6911    punpckhqdq           m1, m0, m4
6912    punpcklqdq           m0, m4
6913    call m(iidentity_8x8_internal_10bpc).write_2x8x2
6914    punpcklqdq           m0, m3, m2
6915    punpckhqdq           m1, m3, m2
6916    jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
6917
; 12 bpc entry point of the identity/identity 16x32 inverse transform-add.
; Loads m7 with the broadcast pixel_12bpc_max constant and jumps to .pass1
; of the 10 bpc function, which performs all remaining work.
6918cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
6919    vpbroadcastd         m7, [pixel_12bpc_max]
6920    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
6921
6922cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
6923    test               eobd, eobd
6924    jz .dconly
6925    PROLOGUE              0, 8, 16, 32*40, dst, stride, c, eob
6926%undef cmp
6927    vpbroadcastd        m12, [clip_18b_min]
6928    vpbroadcastd        m13, [clip_18b_max]
6929    lea                  r6, [rsp+32*4]
6930    call .main
6931    cmp                eobd, 36
6932    jge .full
6933    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
6934    pxor                 m8, m8
6935    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
6936    lea                  r6, [pw_5+128]
6937    mov                  r7, dstq
6938    call m(idct_16x16_internal_8bpc).main
6939    call .write_16x16
6940    mova                 m0, [r5+32*3]
6941    mova                 m1, [r5+32*2]
6942    mova                 m2, [r5+32*1]
6943    mova                 m3, [r5+32*0]
6944    mova                 m4, [r5-32*1]
6945    mova                 m5, [r5-32*2]
6946    mova                 m6, [r5-32*3]
6947    mova                 m7, [r5-32*4]
6948    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
6949    pxor                 m8, m8
6950    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
6951    jmp .end
6952.dconly:
6953    imul                r6d, [cq], 181
6954    vpbroadcastd         m3, [dconly_10bpc]
6955    mov                [cq], eobd ; 0
6956    or                  r3d, 16
6957    add                 r6d, 128
6958    sar                 r6d, 8
6959    imul                r6d, 181
6960    add                 r6d, 384
6961    sar                 r6d, 9
6962    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
6963.full:
6964    add                  cq, 32
6965    mova          [r4+32*3], m0
6966    mova          [r4+32*2], m1
6967    mova          [r4+32*1], m2
6968    mova          [r4+32*0], m3
6969    mova          [r4-32*1], m4
6970    mova          [r4-32*2], m5
6971    mova          [r4-32*3], m6
6972    mova          [r4-32*4], m7
6973    call .main
6974    sub                  r4, 32*16 ; topleft 16x8
6975    call .transpose_16x16
6976    lea                  r6, [pw_5+128]
6977    mov                  r7, dstq
6978    call m(idct_16x16_internal_8bpc).main
6979    call .write_16x16
6980    mova                 m0, [r5+32*3]
6981    mova                 m1, [r5+32*2]
6982    mova                 m2, [r5+32*1]
6983    mova                 m3, [r5+32*0]
6984    mova                 m4, [r5-32*1]
6985    mova                 m5, [r5-32*2]
6986    mova                 m6, [r5-32*3]
6987    mova                 m7, [r5-32*4]
6988    add                  r4, 32*8 ; bottomleft 16x8
6989    call .transpose_16x16
6990.end:
6991    lea                dstq, [r7+32]
6992    call m(idct_16x16_internal_8bpc).main
6993    call .write_16x16
6994    RET
6995ALIGN function_align
6996.transpose_16x16:
6997    punpckhdq            m8, m3, m1
6998    punpckldq            m3, m1
6999    punpckhdq            m1, m0, m2
7000    punpckldq            m0, m2
7001    punpckhdq            m2, m7, m5
7002    punpckldq            m7, m5
7003    punpckhdq            m5, m4, m6
7004    punpckldq            m4, m6
7005    punpckhqdq           m6, m0, m4
7006    punpcklqdq           m0, m4
7007    punpckhqdq           m4, m1, m5
7008    punpcklqdq           m1, m5
7009    punpckhqdq           m5, m7, m3
7010    punpcklqdq           m7, m3
7011    punpckhqdq           m3, m2, m8
7012    punpcklqdq           m2, m8
7013    vinserti128          m8, m0, xm7, 1
7014    vperm2i128          m12, m0, m7, 0x31
7015    vinserti128          m9, m6, xm5, 1
7016    vperm2i128          m13, m6, m5, 0x31
7017    vinserti128         m10, m1, xm2, 1
7018    vperm2i128          m14, m1, m2, 0x31
7019    vinserti128         m11, m4, xm3, 1
7020    vperm2i128          m15, m4, m3, 0x31
7021    mova                 m0, [r4+32*3]
7022    mova                 m1, [r4+32*2]
7023    mova                 m2, [r4+32*1]
7024    mova                 m3, [r4+32*0]
7025    mova                 m4, [r4-32*1]
7026    mova                 m5, [r4-32*2]
7027    mova                 m6, [r4-32*3]
7028    mova                 m7, [r4-32*4]
7029    mova      [rsp+gprsize], m15
7030    jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
7031ALIGN function_align
; .main: process one 32-byte-wide slice of 32 coefficient rows (row stride is
; 64 bytes). Every row is multiplied by the broadcast pd_2896 constant (m14)
; while being loaded, and the rows are handed in four groups of eight to the
; "_rect2" variants of the shared idct helpers. Afterwards the consumed
; coefficients are cleared and execution falls through into .main_end.
; The enclosing cglobal header is above this excerpt; register/stack
; conventions of the called helpers are defined outside this excerpt.
7032.main:
7033    vpbroadcastd        m14, [pd_2896]
7034    vpbroadcastd        m11, [pd_2048]
    ; rows 1, 7, 9, 15, 17, 23, 25, 31
7035    pmulld               m0, m14, [cq+64* 1]
7036    pmulld               m1, m14, [cq+64* 7]
7037    pmulld               m2, m14, [cq+64* 9]
7038    pmulld               m3, m14, [cq+64*15]
7039    pmulld               m4, m14, [cq+64*17]
7040    pmulld               m5, m14, [cq+64*23]
7041    pmulld               m6, m14, [cq+64*25]
7042    pmulld               m7, m14, [cq+64*31]
7043    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
    ; rows 3, 5, 11, 13, 19, 21, 27, 29
7044    pmulld               m0, m14, [cq+64* 3]
7045    pmulld               m1, m14, [cq+64* 5]
7046    pmulld               m2, m14, [cq+64*11]
7047    pmulld               m3, m14, [cq+64*13]
7048    pmulld               m4, m14, [cq+64*19]
7049    pmulld               m5, m14, [cq+64*21]
7050    pmulld               m6, m14, [cq+64*27]
7051    pmulld               m7, m14, [cq+64*29]
7052    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
    ; rows 2, 6, 10, ..., 30
7053    pmulld               m0, m14, [cq+64* 2]
7054    pmulld               m1, m14, [cq+64* 6]
7055    pmulld               m2, m14, [cq+64*10]
7056    pmulld               m3, m14, [cq+64*14]
7057    pmulld               m4, m14, [cq+64*18]
7058    pmulld               m5, m14, [cq+64*22]
7059    pmulld               m6, m14, [cq+64*26]
7060    pmulld               m7, m14, [cq+64*30]
7061    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
    ; rows 0, 4, 8, ..., 28
7062    pmulld               m0, m14, [cq+64* 0]
7063    pmulld               m1, m14, [cq+64* 4]
7064    pmulld               m2, m14, [cq+64* 8]
7065    pmulld               m3, m14, [cq+64*12]
7066    pmulld               m4, m14, [cq+64*16]
7067    pmulld               m5, m14, [cq+64*20]
7068    pmulld               m6, m14, [cq+64*24]
7069    pmulld               m7, m14, [cq+64*28]
7070    call m(idct_8x8_internal_10bpc).main_rect2
7071    call m(idct_8x16_internal_10bpc).main_evenhalf
    ; Clear the 32-byte slice that was just read in rows 0..31: four rows per
    ; iteration, r7 steps down from 64*30 by 64*4 until it is no longer > 0.
7072    pxor                 m8, m8
7073    mov                 r7d, 64*30
7074.main_zero_loop:
7075    mova       [cq+r7-64*2], m8
7076    mova       [cq+r7-64*1], m8
7077    mova       [cq+r7+64*0], m8
7078    mova       [cq+r7+64*1], m8
7079    sub                 r7d, 64*4
7080    jg .main_zero_loop
; .main_end: finish with the IDCT32_END macro (defined outside this excerpt),
; interleave the resulting words pairwise and keep the low halves in m0-m7
; while the high halves are written to the eight 32-byte slots around r5
; ([r5-32*4] .. [r5+32*3]). The numbers in the trailing comments are the
; original authors' labels for the values held by each register (presumably
; output row indices).
7081.main_end:
7082    psrld               m11, 11 ; pd_1
7083    IDCT32_END            0, 15, 8, 9, 10, 1
7084    IDCT32_END            1, 14, 8, 9, 10, 1
7085    punpckhwd            m8, m0, m1   ; 16 17
7086    punpcklwd            m0, m1       ;  0  1
7087    punpcklwd            m1, m14, m15 ; 14 15
7088    punpckhwd           m14, m15      ; 30 31
7089    mova          [r5+32*3], m8
7090    mova          [r5+32*2], m14
7091    IDCT32_END            2, 15, 8, 9, 10, 1
7092    IDCT32_END            3, 14, 8, 9, 10, 1
7093    punpckhwd            m8, m2, m3   ; 18 19
7094    punpcklwd            m2, m3       ;  2  3
7095    punpcklwd            m3, m14, m15 ; 12 13
7096    punpckhwd           m14, m15      ; 28 29
7097    mova          [r5+32*1], m8
7098    mova          [r5+32*0], m14
7099    IDCT32_END            4, 15, 8, 9, 10, 1
7100    IDCT32_END            5, 14, 8, 9, 10, 1
7101    punpckhwd            m8, m4, m5   ; 20 21
7102    punpcklwd            m4, m5       ;  4  5
7103    punpcklwd            m5, m14, m15 ; 10 11
7104    punpckhwd           m14, m15      ; 26 27
7105    mova          [r5-32*1], m8
7106    mova          [r5-32*2], m14
7107    IDCT32_END            6, 15, 8, 9, 10, 1
7108    IDCT32_END            7, 14, 8, 9, 10, 1
7109    punpckhwd            m8, m6, m7   ; 22 23
7110    punpcklwd            m6, m7       ;  6  7
7111    punpcklwd            m7, m14, m15 ;  8  9
7112    punpckhwd           m14, m15      ; 24 25
7113    mova          [r5-32*3], m8
7114    mova          [r5-32*4], m14
7115    ret
7116ALIGN function_align
; .write_16x16: scale sixteen rows by pw_2048 (pmulhrsw) and emit them through
; four invocations of idct_16x8_internal_10bpc.write_16x4 (the last one as a
; tail jump, so its ret returns to our caller).
; In:  m0, m2-m15 hold rows; row 1 is taken from [rsp+gprsize+32*1].
; m8, m9 and m12 are needed as scratch for zero / pixel max / multiplier, so
; their row data is first parked in the stack slots [rsp+gprsize+32*0..2].
; r3 is set to stride*3; how write_16x4 consumes m8/m9/r3 is defined outside
; this excerpt.
7117.write_16x16:
7118    mova                 m1, [rsp+gprsize+32*1]
7119    mova [rsp+gprsize+32*0], m8
7120    mova [rsp+gprsize+32*1], m9
7121    mova [rsp+gprsize+32*2], m12
7122    vpbroadcastd        m12, [pw_2048]
7123    vpbroadcastd         m9, [pixel_10bpc_max]
7124    lea                  r3, [strideq*3]
7125    pxor                 m8, m8
    ; rows in m0-m3
7126    pmulhrsw             m0, m12
7127    pmulhrsw             m1, m12
7128    pmulhrsw             m2, m12
7129    pmulhrsw             m3, m12
7130    call m(idct_16x8_internal_10bpc).write_16x4
    ; rows in m4-m7
7131    pmulhrsw             m0, m12, m4
7132    pmulhrsw             m1, m12, m5
7133    pmulhrsw             m2, m12, m6
7134    pmulhrsw             m3, m12, m7
7135    call m(idct_16x8_internal_10bpc).write_16x4
    ; rows formerly in m8/m9 (now on the stack) and m10/m11
7136    pmulhrsw             m0, m12, [rsp+gprsize+32*0]
7137    pmulhrsw             m1, m12, [rsp+gprsize+32*1]
7138    pmulhrsw             m2, m12, m10
7139    pmulhrsw             m3, m12, m11
7140    call m(idct_16x8_internal_10bpc).write_16x4
    ; row formerly in m12 (now on the stack) and m13-m15
7141    pmulhrsw             m0, m12, [rsp+gprsize+32*2]
7142    pmulhrsw             m1, m12, m13
7143    pmulhrsw             m2, m12, m14
7144    pmulhrsw             m3, m12, m15
7145    jmp m(idct_16x8_internal_10bpc).write_16x4
7146
; inv_txfm_add_identity_identity_32x16_10bpc(dst, stride, c, eob)
; Identity/identity inverse transform + add, 10-bit pixel clamp (m7).
; The block is processed in pieces by repeated calls to .main; after the
; first piece, and then after every second piece, the remaining eob budget is
; checked and the function returns early once eob drops below the cumulative
; thresholds 36, 143, 271 and 399.
; The .pass1 label is the shared entry used by the 12bpc variant, which only
; differs in the value loaded into m7.
; Register roles set up here: m8 = pw_2896x8, m9 = pw_1697x16, m10 = pw_4096,
; m6 = 0, r6 = stride*5, r5 = original dst pointer.
7147cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
7148    vpbroadcastd         m7, [pixel_10bpc_max]
7149.pass1:
7150    vpbroadcastd         m8, [pw_2896x8]
7151    vpbroadcastd         m9, [pw_1697x16]
7152    vpbroadcastd        m10, [pw_4096]
7153    lea                  r6, [strideq*5]
7154    pxor                 m6, m6
7155    mov                  r5, dstq
7156    call .main
7157    sub                eobd, 36
7158    jl .ret
    ; next 32-byte coefficient slice, four destination rows further down
7159    add                  cq, 32
7160    lea                dstq, [dstq+strideq*4]
7161    call .main
    ; advance eight coefficient rows (back to the first slice), and restart at
    ; the top of the destination 16 bytes to the right
7162    add                  cq, 64*8-32
7163    lea                dstq, [r5+16*1]
7164    call .main
7165    sub                eobd, 107 ; eob < 143
7166    jl .ret
7167    add                  cq, 32
7168    lea                dstq, [dstq+strideq*4]
7169    call .main
7170    add                  cq, 64*8-32
7171    lea                dstq, [r5+16*2]
7172    call .main
7173    sub                eobd, 128 ; eob < 271
7174    jl .ret
7175    add                  cq, 32
7176    lea                dstq, [dstq+strideq*4]
7177    call .main
7178    add                  cq, 64*8-32
7179    lea                dstq, [r5+16*3]
7180    call .main
7181    sub                eobd, 128 ; eob < 399
7182    jl .ret
7183    add                  cq, 32
7184    lea                dstq, [dstq+strideq*4]
7185    call .main
7186.ret:
7187    RET
7188ALIGN function_align
; .main: load eight 32-byte coefficient rows (64-byte row stride), pack each
; pair of dword rows to words with signed saturation, then apply in order:
; pmulhrsw by m8 (pw_2896x8), doubling via saturating add, the IDTX16 macro
; (defined outside this excerpt) using m9 (pw_1697x16), and pmulhrsw by m10
; (pw_4096). The eight source rows are overwritten with zero (m6) and control
; is passed to the shared 16x32 identity .main2 tail, whose ret returns to our
; caller.
7189.main:
7190    mova                 m0, [cq+64*0]
7191    packssdw             m0, [cq+64*1]
7192    mova                 m1, [cq+64*2]
7193    packssdw             m1, [cq+64*3]
7194    mova                 m2, [cq+64*4]
7195    packssdw             m2, [cq+64*5]
7196    mova                 m3, [cq+64*6]
7197    packssdw             m3, [cq+64*7]
7198    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
7199    REPX  {paddsw   x, x  }, m0, m1, m2, m3
7200    REPX  {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
7201    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3
7202    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
7203    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
7204
; 12-bit variant: identical to the 10bpc function except that m7 is loaded
; with pixel_12bpc_max before joining the shared .pass1 code path.
7205cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
7206    vpbroadcastd         m7, [pixel_12bpc_max]
7207    jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
7208
; inv_txfm_add_dct_dct_32x32_10bpc(dst, stride, c, eob)
; Flow:
;   eob == 0 -> .dconly (DC-only shortcut shared with the 32x8 function).
;   otherwise a 32*83-byte stack frame is allocated and .main is called up to
;   four times; after each call the remaining calls are skipped (-> .fast) if
;   eob is below 36 / 136 / 300 respectively.
;   .fast zero-fills the part of the intermediate buffer that was not
;   produced, from r6 up to rsp+32*71.
;   .pass2 runs the second pass twice (once per 32-byte-wide half of the
;   destination) using the 8bpc 16-bit helpers and the 16x32 .pass2_end
;   write-out routine.
; m12/m13 hold clip_18b_min/max for the first pass helpers.
; NOTE(review): r4, r5 and r6 are read by .main/.fast/.pass2 but are advanced
; only inside helpers outside this excerpt -- confirm their contract there.
7209cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
7210    test               eobd, eobd
7211    jz .dconly
7212    PROLOGUE              0, 8, 16, 32*83, dst, stride, c, eob
7213%undef cmp
7214    vpbroadcastd        m12, [clip_18b_min]
7215    vpbroadcastd        m13, [clip_18b_max]
7216    lea                  r6, [rsp+32*7]
7217    call .main
7218    cmp                eobd, 36
7219    jl .fast
7220    call .main
7221    cmp                eobd, 136
7222    jl .fast
7223    call .main
7224    cmp                eobd, 300
7225    jl .fast
7226    call .main
7227    jmp .pass2
; DC-only path: r6d = coefficient[0] * 181, the coefficient is cleared (eobd
; is known to be 0 here), r3d gets the value 32 OR-ed in, and the shared
; 32x8 .dconly code finishes the job.
7228.dconly:
7229    imul                r6d, [cq], 181
7230    vpbroadcastd         m3, [dconly_10bpc]
7231    mov                [cq], eobd ; 0
7232    or                  r3d, 32
7233    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
; Zero 8 x 32 bytes per iteration until r6 reaches rsp+32*71.
7234.fast:
7235    lea                  r4, [rsp+32*71]
7236    pxor                 m0, m0
7237.fast_loop:
7238    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
7239    add                  r6, 32*8
7240    cmp                  r6, r4
7241    jl .fast_loop
7242.pass2:
    ; first half: input base rsp+32*3
7243    lea                  r3, [rsp+32*3]
7244    mov                  r4, r6
7245    lea                  r5, [r6+32*8]
7246    lea                  r6, [pw_5+128]
7247    call .pass2_oddhalf
7248    call .pass2_evenhalf
    ; r2 = dst + stride*19, r3 = stride*3 for the write-out routine
7249    imul                 r2, strideq, 19
7250    lea                  r3, [strideq*3]
7251    add                  r2, dstq
7252    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
    ; second half: adjust dstq/r2 relative to where pass2_end left them and
    ; move both 32 bytes to the right; input base rsp+32*11
7253    sub                dstq, r3
7254    lea                  r2, [r2+r3+32]
7255    add                dstq, 32
7256    lea                  r3, [rsp+32*11]
7257    call .pass2_oddhalf
7258    call .pass2_evenhalf
7259    lea                  r3, [strideq*3]
7260    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
7261    RET
7262ALIGN function_align
; .main: first pass over one 32-byte-wide slice of the 32 coefficient rows
; (128-byte row stride). Rows are loaded unscaled in four groups of eight and
; passed to the shared idct helpers, then the 8x32 .main_end routine is run.
; Afterwards the consumed slice is cleared in all 32 rows, cq is advanced by
; 32 bytes to the next slice, m0-m7 are stored to [r4-32*4 .. r4+32*3], and
; the eight registers' worth of data found at [r5-32*4 .. r5+32*3] is
; transposed with the 8x32 .transpose helper and written back in place.
; Requires m12/m13 = clip bounds from the caller; loads m11 = pd_2048 and
; m14 = pd_2896 itself.
7263.main:
    ; rows 1, 7, 9, 15, 17, 23, 25, 31
7264    mova                 m0, [cq+128* 1]
7265    mova                 m1, [cq+128* 7]
7266    mova                 m2, [cq+128* 9]
7267    mova                 m3, [cq+128*15]
7268    mova                 m4, [cq+128*17]
7269    mova                 m5, [cq+128*23]
7270    mova                 m6, [cq+128*25]
7271    mova                 m7, [cq+128*31]
7272    vpbroadcastd        m11, [pd_2048]
7273    vpbroadcastd        m14, [pd_2896]
7274    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
    ; rows 3, 5, 11, 13, 19, 21, 27, 29
7275    mova                 m0, [cq+128* 3]
7276    mova                 m1, [cq+128* 5]
7277    mova                 m2, [cq+128*11]
7278    mova                 m3, [cq+128*13]
7279    mova                 m4, [cq+128*19]
7280    mova                 m5, [cq+128*21]
7281    mova                 m6, [cq+128*27]
7282    mova                 m7, [cq+128*29]
7283    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
    ; rows 2, 6, 10, ..., 30
7284    mova                 m0, [cq+128* 2]
7285    mova                 m1, [cq+128* 6]
7286    mova                 m2, [cq+128*10]
7287    mova                 m3, [cq+128*14]
7288    mova                 m4, [cq+128*18]
7289    mova                 m5, [cq+128*22]
7290    mova                 m6, [cq+128*26]
7291    mova                 m7, [cq+128*30]
7292    call m(idct_8x16_internal_10bpc).main_oddhalf
    ; rows 0, 4, 8, ..., 28
7293    mova                 m0, [cq+128* 0]
7294    mova                 m1, [cq+128* 4]
7295    mova                 m2, [cq+128* 8]
7296    mova                 m3, [cq+128*12]
7297    mova                 m4, [cq+128*16]
7298    mova                 m5, [cq+128*20]
7299    mova                 m6, [cq+128*24]
7300    mova                 m7, [cq+128*28]
7301    call m(idct_8x8_internal_10bpc).main
7302    call m(idct_8x16_internal_10bpc).main_evenhalf
7303    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
    ; Clear rows 0..31 of this slice, four rows per iteration; r7 steps down
    ; from 128*29 by 128*4 until it is no longer > 0.
7304    pxor                m15, m15
7305    mov                 r7d, 128*29
7306.main_zero_loop:
7307    mova      [cq+r7-128*1], m15
7308    mova      [cq+r7+128*0], m15
7309    mova      [cq+r7+128*1], m15
7310    mova      [cq+r7+128*2], m15
7311    sub                 r7d, 128*4
7312    jg .main_zero_loop
7313    add                  cq, 32
    ; store the register-resident half of the result around r4
7314    mova          [r4-32*4], m0
7315    mova          [r4-32*3], m1
7316    mova          [r4-32*2], m2
7317    mova          [r4-32*1], m3
7318    mova          [r4+32*0], m4
7319    mova          [r4+32*1], m5
7320    mova          [r4+32*2], m6
7321    mova          [r4+32*3], m7
    ; reload the other half from around r5 in descending address order
7322    mova                 m0, [r5+32*3]
7323    mova                 m1, [r5+32*2]
7324    mova                 m2, [r5+32*1]
7325    mova                 m3, [r5+32*0]
7326    mova                 m4, [r5-32*1]
7327    mova                 m5, [r5-32*2]
7328    mova                 m6, [r5-32*3]
7329    mova                 m7, [r5-32*4]
7330    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
    ; and write it back in ascending address order
7331    mova          [r5-32*4], m0
7332    mova          [r5-32*3], m1
7333    mova          [r5-32*2], m2
7334    mova          [r5-32*1], m3
7335    mova          [r5+32*0], m4
7336    mova          [r5+32*1], m5
7337    mova          [r5+32*2], m6
7338    mova          [r5+32*3], m7
7339    ret
7340ALIGN function_align
; .pass2_oddhalf: gather the sixteen odd-numbered second-pass inputs into
; m0-m15 from the intermediate buffer at r3 and tail-jump to the 8bpc 16x32
; .main_oddhalf routine (its ret returns to our caller).
; The buffer slots are 32 bytes each; the trailing comments give the logical
; input index. Indices advance by 8 for every 16 slots of address
; (slots 0-7 -> inputs 0-7, slots 16-23 -> inputs 8-15, and so on).
7341.pass2_oddhalf:
7342    mova                 m0, [r3+32* 1] ;  1
7343    mova                 m1, [r3+32* 3] ;  3
7344    mova                 m2, [r3+32* 5] ;  5
7345    mova                 m3, [r3+32* 7] ;  7
7346    mova                 m4, [r3+32*17] ;  9
7347    mova                 m5, [r3+32*19] ; 11
7348    mova                 m6, [r3+32*21] ; 13
7349    mova                 m7, [r3+32*23] ; 15
7350    mova                 m8, [r3+32*33] ; 17
7351    mova                 m9, [r3+32*35] ; 19
7352    mova                m10, [r3+32*37] ; 21
7353    mova                m11, [r3+32*39] ; 23
7354    mova                m12, [r3+32*49] ; 25
7355    mova                m13, [r3+32*51] ; 27
7356    mova                m14, [r3+32*53] ; 29
7357    mova                m15, [r3+32*55] ; 31
7358    jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
7359ALIGN function_align
; .pass2_evenhalf: gather the sixteen even-numbered second-pass inputs into
; m0-m15 from the intermediate buffer at r3 (same slot layout as
; .pass2_oddhalf), additionally spill m15 to [rsp+gprsize], and tail-jump to
; the 8bpc idct_16x16 .main routine (its ret returns to our caller).
; NOTE(review): the m15 spill presumably matches a stack-passed operand that
; idct_16x16_internal_8bpc.main expects -- confirm in that routine.
7360.pass2_evenhalf:
7361    mova                 m0, [r3+32* 0] ;  0
7362    mova                 m1, [r3+32* 2] ;  2
7363    mova                 m2, [r3+32* 4] ;  4
7364    mova                 m3, [r3+32* 6] ;  6
7365    mova                 m4, [r3+32*16] ;  8
7366    mova                 m5, [r3+32*18] ; 10
7367    mova                 m6, [r3+32*20] ; 12
7368    mova                 m7, [r3+32*22] ; 14
7369    mova                 m8, [r3+32*32] ; 16
7370    mova                 m9, [r3+32*34] ; 18
7371    mova                m10, [r3+32*36] ; 20
7372    mova                m11, [r3+32*38] ; 22
7373    mova                m12, [r3+32*48] ; 24
7374    mova                m13, [r3+32*50] ; 26
7375    mova                m14, [r3+32*52] ; 28
7376    mova                m15, [r3+32*54] ; 30
7377    mova      [rsp+gprsize], m15
7378    jmp m(idct_16x16_internal_8bpc).main
7379
; inv_txfm_add_identity_identity_32x32_10bpc(dst, stride, c, eob)
; Identity/identity inverse transform + add, 10-bit pixel clamp (m7).
; The block is handled in pieces by .main (process at the current cq/dstq)
; and .main2 (step cq/dstq to the next piece first, then process). Between
; groups of calls, eob is compared against 36, 136, 300, 535, 755 and 911 and
; the function returns as soon as eob is below the current threshold.
; Setup: m5 = pw_8192, m6 = 0, r6 = stride*3, r5 = stride*5, r4 = stride*7;
; r7 keeps a destination base pointer between groups.
; .pass1 is the shared entry used by the 12bpc variant.
; The digit lists in the trailing comments are retained unchanged from the
; original authors.
7380cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
7381%undef cmp
7382    vpbroadcastd         m7, [pixel_10bpc_max]
7383.pass1:
7384    vpbroadcastd         m5, [pw_8192]
7385    pxor                 m6, m6
7386    lea                  r6, [strideq*3]
7387    lea                  r5, [strideq*5]
7388    lea                  r4, [strideq+r6*2] ; strideq*7
7389    call .main                              ; 0
7390    cmp                eobd, 36
7391    jl .ret
7392    add                  cq, 128*8          ; 0 1
7393    mov                  r7, dstq           ; 1
7394    add                dstq, 16
7395    call .main
7396    call .main2
7397    cmp                eobd, 136
7398    jl .ret
7399    add                  cq, 128*16-32      ; 0 1 2
7400    lea                dstq, [r7+16*2]      ; 1 2
7401    call .main                              ; 2
7402    call .main2
7403    call .main2
7404    cmp                eobd, 300
7405    jl .ret
7406    add                  cq, 128*24-64      ; 0 1 2 3
7407    add                  r7, 16*3           ; 1 2 3
7408    mov                dstq, r7             ; 2 3
7409    call .main                              ; 3
7410    call .main2
7411    call .main2
7412    call .main2
7413    cmp                eobd, 535
7414    jl .ret
7415    add                  cq, 128*24-64      ; 0 1 2 3
7416    lea                dstq, [r7+strideq*8] ; 1 2 3 4
7417    mov                  r7, dstq           ; 2 3 4
7418    call .main                              ; 3 4
7419    call .main2
7420    call .main2
7421    cmp                eobd, 755
7422    jl .ret
7423    add                  cq, 128*16-32      ; 0 1 2 3
7424    lea                dstq, [r7+strideq*8] ; 1 2 3 4
7425    call .main                              ; 2 3 4 5
7426    call .main2                             ; 3 4 5
7427    cmp                eobd, 911
7428    jl .ret
7429    add                  cq, 128*8          ; 0 1 2 3
7430    add                dstq, 16             ; 1 2 3 4
7431    call .main                              ; 2 3 4 5
7432.ret:                                       ; 3 4 5 6
7433    RET
7434ALIGN function_align
; .main2: step to the next piece -- cq moves back eight coefficient rows and
; forward one 32-byte slice (net -(128*8-32) bytes), dstq moves eight rows
; down and 16 bytes to the left -- then fall through into .main.
7435.main2:
7436    sub                  cq, 128*8-32
7437    lea                dstq, [dstq+strideq*8-16]
; .main: load eight 32-byte coefficient rows (128-byte row stride), pack each
; pair of dword rows to words with signed saturation, scale with pmulhrsw by
; m5 (pw_8192), and tail-jump to the 8x32 identity .main_zero routine, whose
; ret returns to our caller.
7438.main:
7439    mova                 m0, [cq+128*0]
7440    packssdw             m0, [cq+128*1]
7441    mova                 m1, [cq+128*2]
7442    packssdw             m1, [cq+128*3]
7443    mova                 m2, [cq+128*4]
7444    packssdw             m2, [cq+128*5]
7445    mova                 m3, [cq+128*6]
7446    packssdw             m3, [cq+128*7]
7447    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
7448    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
7449
; 12-bit variant: identical to the 10bpc function except that m7 is loaded
; with pixel_12bpc_max before joining the shared .pass1 code path.
7450cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
7451    vpbroadcastd         m7, [pixel_12bpc_max]
7452    jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
7453
; IDCT64_PART2_END: final butterflies of the 64-point second pass for one
; group of four output rows, followed by add-to-destination with clamping.
;   %1      output index n; its parity selects which of r4/r5 each operand is
;           loaded from and which of dstq/r2 is used as base for each row
;   %2, %3  source registers (both are overwritten; %2 ends up holding
;           pixel_10bpc_max)
;   %4-%6   temporaries (overwritten)
;   %7-%10  address offsets added to the dstq/r2 bases for the four rows
; Expects m14 to hold the pmulhrsw scale factor on entry.
; All arithmetic is on 16-bit lanes with signed saturation; results are
; clamped to [0, pixel_10bpc_max] before being stored.
7454%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
7455%if %1 & 1
7456    mova                m%5, [r5-32*(51-%1)] ; idct16 out 0+n
7457    mova                m%4, [r4-32*(14+%1)] ; idct32 out31-n
7458%else
7459    mova                m%5, [r4-32*(45-%1)]
7460    mova                m%4, [r5-32*(20+%1)]
7461%endif
7462    paddsw              m%6, m%5, m%4 ; idct32 out 0+n
7463    psubsw              m%5, m%4      ; idct32 out31-n
7464    paddsw              m%4, m%5, m%3 ; out31-n
7465    psubsw              m%5, m%3      ; out32+n
7466    paddsw              m%3, m%6, m%2 ; out 0+n
7467    psubsw              m%6, m%2      ; out63-n
7468    REPX  {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
    ; odd n: rows 1 and 3 go through r2, rows 2 and 4 through dstq;
    ; even n: the other way round
7469%if %1 & 1
7470    %define %%d0 r2
7471    %define %%d1 dstq
7472%else
7473    %define %%d0 dstq
7474    %define %%d1 r2
7475%endif
    ; add the existing destination pixels
7476    paddw               m%3, [%%d0+%7 ]
7477    paddw               m%4, [%%d1+%8 ]
7478    paddw               m%5, [%%d0+%9 ]
7479    paddw               m%6, [%%d1+%10]
    ; clamp below at 0 and above at pixel_10bpc_max, reusing m%2 for both
    ; bounds
7480    pxor                m%2, m%2
7481    REPX    {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
7482    vpbroadcastd        m%2, [pixel_10bpc_max]
7483    REPX    {pminsw x, m%2}, m%3, m%4, m%5, m%6
7484    mova         [%%d0+%7 ], m%3
7485    mova         [%%d1+%8 ], m%4
7486    mova         [%%d0+%9 ], m%5
7487    mova         [%%d1+%10], m%6
7488%endmacro
7489
; inv_txfm_add_dct_dct_16x64_10bpc(dst, stride, c, eob)
; Flow:
;   eob == 0 -> .dconly shortcut.
;   otherwise a 32*98-byte stack frame is allocated and the first pass
;   (.main) runs up to four times; remaining runs are skipped (-> .fast) once
;   eob is below the cumulative thresholds 44, 151 and 279.
;   .fast zero-fills the unproduced part of the intermediate buffer, from r6
;   up to rsp+32*38.
;   .pass2 feeds the intermediate rows to 16-bit (8bpc) helper routines in
;   four groups (in0/4/../28, in2/6/../30, and two groups of odd inputs), then
;   .main_part2_pass2 performs the final stage and writes to dst.
; First-pass constants: m11 = pd_2048, m12/m13 = clip_18b_min/max,
; m14 = pd_2896.
7490cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
7491    test               eobd, eobd
7492    jz .dconly
7493    PROLOGUE              0, 10, 16, 32*98, dst, stride, c, eob
7494%undef cmp
7495    vpbroadcastd        m11, [pd_2048]
7496    vpbroadcastd        m12, [clip_18b_min]
7497    vpbroadcastd        m13, [clip_18b_max]
7498    vpbroadcastd        m14, [pd_2896]
7499    lea                  r6, [rsp+32*6]
7500    call .main
7501    sub                eobd, 44
7502    jl .fast
7503    call .main
7504    sub                eobd, 107
7505    jl .fast
7506    call .main
7507    sub                eobd, 128
7508    jl .fast
7509    call .main
7510    jmp .pass2
; DC-only path: r6d = (coefficient[0] * 181 + 640) >> 10, the coefficient is
; cleared (eobd is known to be 0 here), r3d gets the value 64 OR-ed in, and
; the shared 16x4 .dconly3 code finishes the job.
7511.dconly:
7512    imul                r6d, [cq], 181
7513    vpbroadcastd         m3, [dconly_10bpc]
7514    mov                [cq], eobd ; 0
7515    or                  r3d, 64
7516    add                 r6d, 640
7517    sar                 r6d, 10
7518    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
; Zero 8 x 32 bytes per iteration until r6 reaches rsp+32*38.
7519.fast:
7520    lea                  r4, [rsp+32*38]
7521    pxor                 m0, m0
7522.fast_loop:
7523    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
7524    add                  r6, 32*8
7525    cmp                  r6, r4
7526    jl .fast_loop
7527.pass2:
7528    lea                  r6, [pw_5+128]
    ; group 1: inputs 0, 4, ..., 28 in m0-m7; m8-m14 and the [rsp] slot are
    ; zeroed before calling the 16-point routine
    ; NOTE(review): [rsp] is presumably where the callee expects its 16th
    ; operand -- confirm in idct_16x16_internal_8bpc.main
7529    mova                 m0, [rsp+32* 2] ; in0
7530    mova                 m1, [rsp+32* 6] ; in4
7531    mova                 m2, [rsp+32*10] ; in8
7532    mova                 m3, [rsp+32*14] ; in12
7533    mova                 m4, [rsp+32*18] ; in16
7534    mova                 m5, [rsp+32*22] ; in20
7535    mova                 m6, [rsp+32*26] ; in24
7536    mova                 m7, [rsp+32*30] ; in28
7537    pxor                 m8, m8
7538    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
7539    mova              [rsp], m8
7540    call m(idct_16x16_internal_8bpc).main
    ; m1 is reloaded from the stack, then all sixteen results are stored in
    ; two batches of eight starting at rsp+32*34
7541    mova                 m1, [rsp+32*1]
7542    lea                  r4, [rsp+32*38]
7543    mova          [r4-32*4], m0
7544    mova          [r4-32*3], m1
7545    mova          [r4-32*2], m2
7546    mova          [r4-32*1], m3
7547    mova          [r4+32*0], m4
7548    mova          [r4+32*1], m5
7549    mova          [r4+32*2], m6
7550    mova          [r4+32*3], m7
7551    add                  r4, 32*8
7552    mova          [r4-32*4], m8
7553    mova          [r4-32*3], m9
7554    mova          [r4-32*2], m10
7555    mova          [r4-32*1], m11
7556    mova          [r4+32*0], m12
7557    mova          [r4+32*1], m13
7558    mova          [r4+32*2], m14
7559    mova          [r4+32*3], m15
    ; group 2: inputs 2, 6, ..., 30
7560    mova                 m0, [rsp+32* 4] ; in2
7561    mova                 m1, [rsp+32* 8] ; in6
7562    mova                 m2, [rsp+32*12] ; in10
7563    mova                 m3, [rsp+32*16] ; in14
7564    mova                 m4, [rsp+32*20] ; in18
7565    mova                 m5, [rsp+32*24] ; in22
7566    mova                 m6, [rsp+32*28] ; in26
7567    mova                 m7, [rsp+32*32] ; in30
7568    lea                  r5, [r4+32*16]
7569    add                  r4, 32*8
7570    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; group 3: odd inputs 1, 31, 17, 15, 9, 23, 25, 7; r6 points 8 bytes
    ; before idct64_mul
7571    mova                 m0, [rsp+32* 3] ; in1
7572    mova                 m1, [rsp+32*33] ; in31
7573    mova                 m2, [rsp+32*19] ; in17
7574    mova                 m3, [rsp+32*17] ; in15
7575    mova                 m4, [rsp+32*11] ; in9
7576    mova                 m5, [rsp+32*25] ; in23
7577    mova                 m6, [rsp+32*27] ; in25
7578    mova                 m7, [rsp+32* 9] ; in7
7579    lea                  r6, [idct64_mul - 8]
7580    add                  r4, 32*16
7581    add                  r5, 32*32
7582    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    ; group 4: odd inputs 5, 27, 21, 11, 13, 19, 29, 3
7583    mova                 m0, [rsp+32* 7] ; in5
7584    mova                 m1, [rsp+32*29] ; in27
7585    mova                 m2, [rsp+32*23] ; in21
7586    mova                 m3, [rsp+32*13] ; in11
7587    mova                 m4, [rsp+32*15] ; in13
7588    mova                 m5, [rsp+32*21] ; in19
7589    mova                 m6, [rsp+32*31] ; in29
7590    mova                 m7, [rsp+32* 5] ; in3
7591    add                  r6, 8
7592    add                  r4, 32*8
7593    sub                  r5, 32*8
7594    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    ; stride multiples used as row offsets by the write-out stage:
    ; r8 = stride*4, r9 = stride*5, r3 = stride*6, r7 = stride*7
7595    lea                  r8, [strideq*4]
7596    lea                  r9, [strideq*5]
7597    lea                  r3, [r9+strideq*1] ; stride*6
7598    lea                  r7, [r9+strideq*2] ; stride*7
7599    call .main_part2_pass2
7600    RET
7601ALIGN function_align
; .main: first pass over one 32-byte-wide slice of coefficient rows 0..15
; (128-byte row stride). Odd rows go through the idct_8x16 .main_oddhalf
; helper, even rows through idct_8x8 .main and idct_8x16 .main_evenhalf.
; The consumed slice is then cleared in rows 0..15 and cq advances by 32
; bytes. The final stage combines m0-m7 (after adding a rounding constant of
; 2) with eight values read from [r6-32*4 .. r6+32*3] as sum/difference
; pairs, shifts right by 2, packs to 16 bit with signed saturation,
; transposes via idct_16x8 .transpose3 and stores the eight result registers
; back to the same r6 window; r6 is then advanced by 32*8.
; Expects m11 = pd_2048 (used to derive the rounding constant), plus whatever
; the helpers require (m12-m14 are set by the function entry).
; NOTE(review): the values read from around r6 are presumably left there by
; .main_oddhalf -- confirm in that helper.
7602.main:
7603    mova                 m0, [cq+128* 1]
7604    mova                 m1, [cq+128* 3]
7605    mova                 m2, [cq+128* 5]
7606    mova                 m3, [cq+128* 7]
7607    mova                 m4, [cq+128* 9]
7608    mova                 m5, [cq+128*11]
7609    mova                 m6, [cq+128*13]
7610    mova                 m7, [cq+128*15]
7611    call m(idct_8x16_internal_10bpc).main_oddhalf
7612    mova                 m0, [cq+128* 0]
7613    mova                 m1, [cq+128* 2]
7614    mova                 m2, [cq+128* 4]
7615    mova                 m3, [cq+128* 6]
7616    mova                 m4, [cq+128* 8]
7617    mova                 m5, [cq+128*10]
7618    mova                 m6, [cq+128*12]
7619    mova                 m7, [cq+128*14]
7620    call m(idct_8x8_internal_10bpc).main
7621    call m(idct_8x16_internal_10bpc).main_evenhalf
    ; Clear rows 0..15 of this slice, four rows per iteration; r7 steps down
    ; from 128*13 by 128*4 until it is no longer > 0.
7622    pxor                m15, m15
7623    mov                 r7d, 128*13
7624.main_zero_loop:
7625    mova      [cq+r7-128*1], m15
7626    mova      [cq+r7+128*0], m15
7627    mova      [cq+r7+128*1], m15
7628    mova      [cq+r7+128*2], m15
7629    sub                 r7d, 128*4
7630    jg .main_zero_loop
7631    add                  cq, 32
    ; The rounding term is added once to m0-m7, so both the sum and the
    ; difference formed from each of them carry it before the >> 2.
7632    psrld               m15, m11, 10 ; pd_2
7633    mova                 m8, [r6-32*4]
7634    mova                 m9, [r6+32*3]
7635    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
7636    psubd               m10, m0, m8 ; out15
7637    paddd                m0, m8     ; out0
7638    mova                 m8, [r6-32*3]
7639    psubd               m15, m7, m9 ; out8
7640    paddd                m7, m9     ; out7
7641    mova                 m9, [r6+32*2]
7642    REPX       {psrad x, 2}, m0, m15, m10, m7
7643    packssdw             m0, m15
7644    packssdw             m7, m10
7645    psubd               m10, m1, m8 ; out14
7646    paddd                m1, m8     ; out1
7647    mova                 m8, [r6-32*2]
7648    psubd               m15, m6, m9 ; out9
7649    paddd                m6, m9     ; out6
7650    mova                 m9, [r6+32*1]
7651    REPX       {psrad x, 2}, m1, m15, m10, m6
7652    packssdw             m1, m15
7653    packssdw             m6, m10
7654    psubd               m10, m2, m8 ; out13
7655    paddd                m2, m8     ; out2
7656    mova                 m8, [r6-32*1]
7657    psubd               m15, m5, m9 ; out10
7658    paddd                m5, m9     ; out5
7659    mova                 m9, [r6+32*0]
7660    REPX       {psrad x, 2}, m2, m15, m10, m5
7661    packssdw             m2, m15
7662    packssdw             m5, m10
7663    psubd               m10, m3, m8 ; out12
7664    paddd                m3, m8     ; out3
7665    psubd               m15, m4, m9 ; out11
7666    paddd                m4, m9     ; out4
7667    REPX       {psrad x, 2}, m3, m15, m10, m4
7668    packssdw             m3, m15
7669    packssdw             m4, m10
7670    call m(idct_16x8_internal_10bpc).transpose3
7671    mova          [r6-32*4], m0
7672    mova          [r6-32*3], m1
7673    mova          [r6-32*2], m2
7674    mova          [r6-32*1], m3
7675    mova          [r6+32*0], m4
7676    mova          [r6+32*1], m5
7677    mova          [r6+32*2], m6
7678    mova          [r6+32*3], m7
7679    add                  r6, 32*8
7680    ret
; .main_part2_pass2: final second-pass stage plus write-out.
; Loads the 16-bit constant pairs needed by the 8bpc
; inv_txfm_add_dct_dct_16x64 .main_part2_internal routine (m11-m13 once,
; m14 on every iteration because it is replaced by pw_2048 for the
; IDCT64_PART2_END scaling), sets r6 = pw_5+128 and r2 = dst + r7.
; Each iteration expands IDCT64_PART2_END four times, then moves dstq one
; row down and r2 one row up; the loop ends when r4 equals r5.
; The row offsets passed to the macro are built from r3/r7/r8/r9, which the
; entry code of this function sets to stride*6/7/4/5.
; NOTE(review): r4 and r5 are not modified in the visible loop body, so they
; are presumably advanced inside .main_part2_internal -- confirm there.
7681.main_part2_pass2:
7682    vpbroadcastd        m11, [pw_1567_3784]
7683    vpbroadcastd        m12, [pw_m3784_1567]
7684    vpbroadcastd        m13, [pw_2896_2896]
7685    lea                  r6, [pw_5+128]
7686    lea                  r2, [dstq+r7]
7687.main_part2_pass2_loop:
7688    vpbroadcastd        m14, [pw_m2896_2896]
7689    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
7690    vpbroadcastd        m14, [pw_2048]
7691    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*8, r7*8
7692    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*8, r7*8
7693    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*4, r9*8, r3*8
7694    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*4, r9*8, r3*8
7695    add                dstq, strideq
7696    sub                  r2, strideq
7697    cmp                  r4, r5
7698    jne .main_part2_pass2_loop
7699    ret
7700ALIGN function_align
; .main_part1_rect2: alternate entry that first rounds m0-m3 with
; (x + m11) >> 12 and then falls through into .main_part1.
; .main_part1: 32-bit-precision idct64 stage for four inputs held in m0-m3.
;   r5  points at a table of dword constants; twelve are consumed per call
;       and r5 is advanced by 4*12 bytes
;   r6  output pointer; eight results are stored to [r6-32*4 .. r6+32*3] and
;       r6 is advanced by 32*8
;   m11 rounding constant added before each >> 12
;   m12 / m13 lower / upper clamp applied after every add/sub stage
; Clobbers m0-m10 and m15. ITX_MULSUB_2D is a macro defined outside this
; excerpt.
7701.main_part1_rect2:
7702    REPX     {paddd x, m11}, m0, m1, m2, m3
7703    REPX     {psrad x, 12 }, m0, m1, m2, m3
7704.main_part1: ; idct64 steps 1-5
7705    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
7706    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
7707    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
7708    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    ; first eight constants: each input is multiplied by two of them
7709    vpbroadcastd         m7, [r5+4*0]
7710    vpbroadcastd         m8, [r5+4*1]
7711    vpbroadcastd         m6, [r5+4*2]
7712    vpbroadcastd         m9, [r5+4*3]
7713    vpbroadcastd         m5, [r5+4*4]
7714    vpbroadcastd        m10, [r5+4*5]
7715    vpbroadcastd         m4, [r5+4*6]
7716    vpbroadcastd        m15, [r5+4*7]
7717    pmulld               m7, m0     ; t63a
7718    pmulld               m0, m8     ; t32a
7719    pmulld               m6, m1     ; t62a
7720    pmulld               m1, m9     ; t33a
7721    pmulld               m5, m2     ; t61a
7722    pmulld               m2, m10    ; t34a
7723    pmulld               m4, m3     ; t60a
7724    pmulld               m3, m15    ; t35a
    ; constants 8/9 feed the first pair of ITX_MULSUB_2D expansions
7725    vpbroadcastd        m10, [r5+4*8]
7726    vpbroadcastd        m15, [r5+4*9]
7727    REPX     {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
7728    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
7729    psubd                m8, m0, m1 ; t33
7730    paddd                m0, m1     ; t32
7731    psubd                m1, m7, m6 ; t62
7732    paddd                m7, m6     ; t63
7733    psubd                m6, m3, m2 ; t34
7734    paddd                m3, m2     ; t35
7735    psubd                m2, m4, m5 ; t61
7736    paddd                m4, m5     ; t60
7737    REPX    {pmaxsd x, m12}, m8, m1, m6, m2
7738    REPX    {pminsd x, m13}, m8, m1, m6, m2
7739    ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
7740    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
7741    REPX    {pmaxsd x, m12}, m0, m3, m7, m4
7742    REPX    {pminsd x, m13}, m0, m3, m7, m4
    ; constants 10/11 feed the second pair of ITX_MULSUB_2D expansions
7743    vpbroadcastd        m10, [r5+4*10]
7744    vpbroadcastd        m15, [r5+4*11]
7745    psubd                m5, m0, m3 ; t35a
7746    paddd                m0, m3     ; t32a
7747    psubd                m3, m7, m4 ; t60a
7748    paddd                m7, m4     ; t63a
7749    psubd                m4, m1, m6 ; t34
7750    paddd                m1, m6     ; t33
7751    psubd                m6, m8, m2 ; t61
7752    paddd                m8, m2     ; t62
7753    REPX    {pmaxsd x, m12}, m5, m3, m4, m6
7754    REPX    {pminsd x, m13}, m5, m3, m4, m6
7755    ITX_MULSUB_2D         3, 5, 2, 9, _, 11, 10, 15 ; t35,  t60
7756    ITX_MULSUB_2D         6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
7757    REPX    {pmaxsd x, m12}, m0, m7, m1, m8
7758    REPX    {pminsd x, m13}, m0, m7, m1, m8
7759    add                  r5, 4*12
    ; results are stored pairwise from both ends of the 8-slot window
7760    mova          [r6-32*4], m0
7761    mova          [r6+32*3], m7
7762    mova          [r6-32*3], m1
7763    mova          [r6+32*2], m8
7764    mova          [r6-32*2], m6
7765    mova          [r6+32*1], m4
7766    mova          [r6-32*1], m3
7767    mova          [r6+32*0], m5
7768    add                  r6, 32*8
7769    ret
; .main_part2: 32-bit-precision idct64 stage operating in place on the
; buffer written by .main_part1.
; On entry r6 is the pointer left behind by .main_part1; r5 is set to
; r6+32*3 and r6 is moved back by 32*4, after which the loop walks r6 upward
; and r5 downward by 32 bytes per iteration until r6 is no longer below r5.
; Each iteration loads eight values at negative offsets from r6/r5, combines
; them, and stores the eight results back to the same eight addresses.
; Expects m11 = rounding constant, m12 / m13 = lower / upper clamp, and
; m14 = multiplier used for the final pmulld stage; loads m10 = pd_1567 and
; m15 = pd_3784 itself. Clobbers m0-m10 and m15.
7770.main_part2: ; idct64 steps 6-9
7771    lea                  r5, [r6+32*3]
7772    sub                  r6, 32*4
7773    vpbroadcastd        m10, [pd_1567]
7774    vpbroadcastd        m15, [pd_3784]
7775.main_part2_loop:
7776    mova                 m0, [r6-32*32] ; t32a
7777    mova                 m1, [r5-32*24] ; t39a
7778    mova                 m2, [r5-32*32] ; t63a
7779    mova                 m3, [r6-32*24] ; t56a
7780    mova                 m4, [r6-32*16] ; t40a
7781    mova                 m5, [r5-32* 8] ; t47a
7782    mova                 m6, [r5-32*16] ; t55a
7783    mova                 m7, [r6-32* 8] ; t48a
7784    psubd                m8, m0, m1 ; t39
7785    paddd                m0, m1     ; t32
7786    psubd                m1, m2, m3 ; t56
7787    paddd                m2, m3     ; t63
7788    psubd                m3, m5, m4 ; t40
7789    paddd                m5, m4     ; t47
7790    psubd                m4, m7, m6 ; t55
7791    paddd                m7, m6     ; t48
7792    REPX    {pmaxsd x, m12}, m8, m1, m3, m4
7793    REPX    {pminsd x, m13}, m8, m1, m3, m4
7794    ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
7795    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
7796    REPX    {pmaxsd x, m12}, m0, m2, m5, m7
7797    REPX    {pminsd x, m13}, m0, m5, m2, m7
7798    psubd                m6, m2, m7 ; t48a
7799    paddd                m2, m7     ; t63a
7800    psubd                m7, m0, m5 ; t47a
7801    paddd                m0, m5     ; t32a
7802    psubd                m5, m8, m4 ; t55
7803    paddd                m8, m4     ; t56
7804    psubd                m4, m1, m3 ; t40
7805    paddd                m1, m3     ; t39
7806    REPX    {pmaxsd x, m12}, m6, m7, m5, m4
7807    REPX    {pminsd x, m13}, m6, m7, m5, m4
    ; scale the four difference terms by m14; the rounding constant is added
    ; to one operand of each pair only, so each sum/difference below carries
    ; it exactly once before the >> 12
7808    REPX    {pmulld x, m14}, m6, m7, m5, m4
7809    REPX    {pmaxsd x, m12}, m2, m0, m8, m1
7810    REPX    {pminsd x, m13}, m2, m0, m8, m1
7811    paddd                m6, m11
7812    paddd                m5, m11
7813    psubd                m3, m6, m7 ; t47
7814    paddd                m6, m7     ; t48
7815    psubd                m7, m5, m4 ; t40a
7816    paddd                m5, m4     ; t55a
7817    REPX      {psrad x, 12}, m3, m6, m7, m5
7818    mova         [r5-32* 8], m2
7819    mova         [r6-32*32], m0
7820    mova         [r6-32* 8], m8
7821    mova         [r5-32*32], m1
7822    mova         [r5-32*24], m3
7823    mova         [r6-32*16], m6
7824    mova         [r6-32*24], m7
7825    mova         [r5-32*16], m5
7826    add                  r6, 32
7827    sub                  r5, 32
7828    cmp                  r6, r5
7829    jl .main_part2_loop
7830    ret
7831
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x64_10bpc(dst, stride, c, eob)
;
; eob == 0 branches to .dconly, which scales the single coefficient at [cq]
; and tail-jumps into the 32x8 function's .dconly2 with r3d = 64.
; Otherwise PROLOGUE re-declares the function with 11 GPRs, 16 vector
; registers and 32*134 bytes of stack. .main is then called one to four
; times (eob thresholds 36, 136, 300), .fast zero-fills scratch from r6 up to
; rsp+32*70, and .pass2 runs the 8bpc idct16 / idct32 odd-half / idct64
; helpers plus the 10bpc main_part2_pass2 helper over that scratch.
; ---------------------------------------------------------------------------
7832cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
7833    test               eobd, eobd
7834    jz .dconly
7835    PROLOGUE              0, 11, 16, 32*134, dst, stride, c, eob
7836%undef cmp
        ; m12/m13 hold the 18-bit clip bounds for the helpers called below.
7837    vpbroadcastd        m12, [clip_18b_min]
7838    vpbroadcastd        m13, [clip_18b_max]
7839    lea                  r6, [rsp+32*6]
        ; One .main call per eob tier; each call also advances cq by 32 bytes.
7840    call .main
7841    cmp                eobd, 36
7842    jl .fast
7843    call .main
7844    cmp                eobd, 136
7845    jl .fast
7846    call .main
7847    cmp                eobd, 300
7848    jl .fast
7849    call .main
7850    jmp .pass2
7851.dconly:
        ; r6d = ((((dc*181 + 128) >> 8) * 181) + 384) >> 9
        ; eob (r3d) is zero on this path, so the OR leaves r3d = 64.
        ; NOTE(review): m3 and r3d are presumably consumed by the 32x8
        ; .dconly2 code (bias constant and row count) - confirm there.
7852    imul                r6d, [cq], 181
7853    vpbroadcastd         m3, [dconly_10bpc]
7854    mov                [cq], eobd ; 0
7855    or                  r3d, 64
7856    add                 r6d, 128
7857    sar                 r6d, 8
7858    imul                r6d, 181
7859    add                 r6d, 384
7860    sar                 r6d, 9
7861    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
7862.fast:
        ; Zero eight 32-byte slots per iteration, [r6-32*4] .. [r6+32*3],
        ; until r6 reaches rsp+32*70.
        ; NOTE(review): relies on r6 having been advanced by the helpers
        ; called from .main; that update is not visible in this function.
7863    lea                  r4, [rsp+32*70]
7864    pxor                 m0, m0
7865.fast_loop:
7866    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
7867    add                  r6, 32*8
7868    cmp                  r6, r4
7869    jl .fast_loop
7870.pass2:
        ; r10 = read cursor into the scratch (starts at rsp).
        ; r8/r9/r3/r7 = stride*4 / *5 / *6 / *7.
        ; NOTE(review): r6 = pw_5+128 is presumably the constant base the
        ; 8bpc helpers address relative to - confirm in the 8bpc source.
7871    lea                  r6, [pw_5 + 128]
7872    mov                 r10, rsp
7873    lea                  r8, [strideq*4]
7874    lea                  r9, [strideq*5]
7875    lea                  r3, [r9+strideq*1] ; stride*6
7876    lea                  r7, [r9+strideq*2] ; stride*7
7877.pass2_loop:
        ; Stage 1: inputs 0,4,...,28 go to the 8bpc idct16 with m8-m14 and
        ; the [rsp] slot zeroed.
        ; NOTE(review): [rsp] presumably carries the helper's 16th input and
        ; [rsp+32*1] its second output - confirm in idct_16x16_internal_8bpc.
7878    mova                 m0, [r10+32* 2] ; in0
7879    mova                 m1, [r10+32* 6] ; in4
7880    mova                 m2, [r10+32*18] ; in8
7881    mova                 m3, [r10+32*22] ; in12
7882    mova                 m4, [r10+32*34] ; in16
7883    mova                 m5, [r10+32*38] ; in20
7884    mova                 m6, [r10+32*50] ; in24
7885    mova                 m7, [r10+32*54] ; in28
7886    pxor                 m8, m8
7887    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
7888    mova              [rsp], m8
7889    call m(idct_16x16_internal_8bpc).main
7890    mova                 m1, [rsp+32*1]
        ; Park the 16 results in two groups of eight slots from rsp+32*66.
7891    lea                  r4, [rsp+32*70]
7892    mova          [r4-32*4], m0
7893    mova          [r4-32*3], m1
7894    mova          [r4-32*2], m2
7895    mova          [r4-32*1], m3
7896    mova          [r4+32*0], m4
7897    mova          [r4+32*1], m5
7898    mova          [r4+32*2], m6
7899    mova          [r4+32*3], m7
7900    add                  r4, 32*8
7901    mova          [r4-32*4], m8
7902    mova          [r4-32*3], m9
7903    mova          [r4-32*2], m10
7904    mova          [r4-32*1], m11
7905    mova          [r4+32*0], m12
7906    mova          [r4+32*1], m13
7907    mova          [r4+32*2], m14
7908    mova          [r4+32*3], m15
        ; Stage 2: inputs 2,6,...,30 go to the 8bpc idct32 odd half (fast
        ; variant); r4/r5 are set up as that helper's scratch pointers.
7909    mova                 m0, [r10+32* 4] ; in2
7910    mova                 m1, [r10+32* 8] ; in6
7911    mova                 m2, [r10+32*20] ; in10
7912    mova                 m3, [r10+32*24] ; in14
7913    mova                 m4, [r10+32*36] ; in18
7914    mova                 m5, [r10+32*40] ; in22
7915    mova                 m6, [r10+32*52] ; in26
7916    mova                 m7, [r10+32*56] ; in30
7917    lea                  r5, [r4+32*16]
7918    add                  r4, 32*8
7919    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
        ; Stage 3: the odd inputs go to the 8bpc idct64 main_part1 in two
        ; batches; r6 walks the idct64_mul table by 8 bytes between batches.
7920    mova                 m0, [r10+32* 3] ; in1
7921    mova                 m1, [r10+32*57] ; in31
7922    mova                 m2, [r10+32*35] ; in17
7923    mova                 m3, [r10+32*25] ; in15
7924    mova                 m4, [r10+32*19] ; in9
7925    mova                 m5, [r10+32*41] ; in23
7926    mova                 m6, [r10+32*51] ; in25
7927    mova                 m7, [r10+32* 9] ; in7
7928    lea                  r6, [idct64_mul - 8]
7929    add                  r4, 32*16
7930    add                  r5, 32*32
7931    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
7932    mova                 m0, [r10+32* 7] ; in5
7933    mova                 m1, [r10+32*53] ; in27
7934    mova                 m2, [r10+32*39] ; in21
7935    mova                 m3, [r10+32*21] ; in11
7936    mova                 m4, [r10+32*23] ; in13
7937    mova                 m5, [r10+32*37] ; in19
7938    mova                 m6, [r10+32*55] ; in29
7939    mova                 m7, [r10+32* 5] ; in3
7940    add                  r6, 8
7941    add                  r4, 32*8
7942    sub                  r5, 32*8
7943    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
7944    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
        ; Advance the read cursor by eight slots and move dst 32 bytes to
        ; the right, rewinding it by stride*4 first.
        ; NOTE(review): the dst rewind and the rsp+32*16 loop bound depend on
        ; how main_part2_pass2 leaves dstq and r4 - confirm in that helper.
7945    add                 r10, 32*8
7946    sub                  r4, 32*98 ; rsp+32*16
7947    sub                dstq, r8
7948    add                dstq, 32
7949    cmp                 r10, r4
7950    jl .pass2_loop
7951    RET
7952ALIGN function_align
7953.main:
        ; Processes one 32-byte-wide column group of the coefficient buffer
        ; (rows at a 128-byte pitch). Every input row is multiplied by
        ; pd_2896 (m14) before being handed to a *_rect2 helper entry point.
        ; NOTE(review): the matching round/shift is presumably done inside
        ; the *_rect2 helpers - confirm.
7954    vpbroadcastd        m14, [pd_2896]
7955    vpbroadcastd        m11, [pd_2048]
        ; Rows 1,7,9,15,17,23,25,31 -> idct32 odd half, part 1.
7956    pmulld               m0, m14, [cq+128* 1]
7957    pmulld               m1, m14, [cq+128* 7]
7958    pmulld               m2, m14, [cq+128* 9]
7959    pmulld               m3, m14, [cq+128*15]
7960    pmulld               m4, m14, [cq+128*17]
7961    pmulld               m5, m14, [cq+128*23]
7962    pmulld               m6, m14, [cq+128*25]
7963    pmulld               m7, m14, [cq+128*31]
7964    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
        ; Rows 3,5,11,13,19,21,27,29 -> idct32 odd half, part 2.
7965    pmulld               m0, m14, [cq+128* 3]
7966    pmulld               m1, m14, [cq+128* 5]
7967    pmulld               m2, m14, [cq+128*11]
7968    pmulld               m3, m14, [cq+128*13]
7969    pmulld               m4, m14, [cq+128*19]
7970    pmulld               m5, m14, [cq+128*21]
7971    pmulld               m6, m14, [cq+128*27]
7972    pmulld               m7, m14, [cq+128*29]
7973    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
        ; Rows 2,6,...,30 -> idct16 odd half.
7974    pmulld               m0, m14, [cq+128* 2]
7975    pmulld               m1, m14, [cq+128* 6]
7976    pmulld               m2, m14, [cq+128*10]
7977    pmulld               m3, m14, [cq+128*14]
7978    pmulld               m4, m14, [cq+128*18]
7979    pmulld               m5, m14, [cq+128*22]
7980    pmulld               m6, m14, [cq+128*26]
7981    pmulld               m7, m14, [cq+128*30]
7982    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
        ; Rows 0,4,...,28 stay in m0-m7 for the idct8 call after the loop.
7983    pmulld               m0, m14, [cq+128* 0]
7984    pmulld               m1, m14, [cq+128* 4]
7985    pmulld               m2, m14, [cq+128* 8]
7986    pmulld               m3, m14, [cq+128*12]
7987    pmulld               m4, m14, [cq+128*16]
7988    pmulld               m5, m14, [cq+128*20]
7989    pmulld               m6, m14, [cq+128*24]
7990    pmulld               m7, m14, [cq+128*28]
        ; Clear the rows just consumed: r7 runs 128*29, 128*25, ..., 128*1
        ; and each pass stores zero to four rows, covering rows 31 down to 0
        ; of this column group.
7991    pxor                m15, m15
7992    mov                 r7d, 128*29
7993.main_zero_loop:
7994    mova      [cq+r7-128*1], m15
7995    mova      [cq+r7+128*0], m15
7996    mova      [cq+r7+128*1], m15
7997    mova      [cq+r7+128*2], m15
7998    sub                 r7d, 128*4
7999    jg .main_zero_loop
        ; Step cq to the next 32-byte column group for the following call.
8000    add                  cq, 32
8001    call m(idct_8x8_internal_10bpc).main_rect2
8002    call m(idct_8x16_internal_10bpc).main_evenhalf
8003    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
        ; Transpose m0-m7 and store them at r4; then reload eight slots from
        ; r5 in descending address order, transpose, and write them back in
        ; ascending order.
        ; NOTE(review): r4/r5 are presumably set by main_end - confirm.
8004    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
8005    mova          [r4-32*4], m0
8006    mova          [r4-32*3], m1
8007    mova          [r4-32*2], m2
8008    mova          [r4-32*1], m3
8009    mova          [r4+32*0], m4
8010    mova          [r4+32*1], m5
8011    mova          [r4+32*2], m6
8012    mova          [r4+32*3], m7
8013    mova                 m0, [r5+32*3]
8014    mova                 m1, [r5+32*2]
8015    mova                 m2, [r5+32*1]
8016    mova                 m3, [r5+32*0]
8017    mova                 m4, [r5-32*1]
8018    mova                 m5, [r5-32*2]
8019    mova                 m6, [r5-32*3]
8020    mova                 m7, [r5-32*4]
8021    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
8022    mova          [r5-32*4], m0
8023    mova          [r5-32*3], m1
8024    mova          [r5-32*2], m2
8025    mova          [r5-32*1], m3
8026    mova          [r5+32*0], m4
8027    mova          [r5+32*1], m5
8028    mova          [r5+32*2], m6
8029    mova          [r5+32*3], m7
8030    ret
8031
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x16_10bpc(dst, stride, c, eob)
;
; eob == 0 falls through into the DC-only path, which adds one broadcast
; value to 16 rows of 4*32 bytes each. Otherwise .normal re-declares the
; function with 8 GPRs, 16 vector registers and 32*96 bytes of stack, runs
; .main + .shift_transpose once or twice (eob threshold 36) and then .pass2.
;
; Several labels here are entry points for the larger 64-wide functions:
;   .dconly          - jumped to by the 64x64 function
;   .dconly2         - jumped to by the 64x32 function
;   .main_end        - called by the 64x64 function
;   .main_end2       - called by the 64x32 function (m15 preset by caller)
;   .shift_transpose - tail-jumped to by the 64x64 function
; The IDCT64_SHIFT_TRANSPOSE macro defined below is also expanded in the
; 64x32 function.
; ---------------------------------------------------------------------------
8032cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
8033    test               eobd, eobd
8034    jnz .normal
        ; DC-only: r6d = dc*181; eob (r3d) is zero, so the OR gives r3d = 16,
        ; which .dconly_loop uses as its row counter.
8035    imul                r6d, [cq], 181
8036    mov                [cq], eobd ; 0
8037    or                  r3d, 16
8038.dconly:
        ; r6d = (r6d + 640) >> 10
8039    add                 r6d, 640
8040    sar                 r6d, 10
8041.dconly2:
        ; r6d = (r6d*181 + 2176) >> 12, then broadcast as words with the
        ; dconly_10bpc bias added (signed saturation).
8042    vpbroadcastd         m5, [dconly_10bpc]
8043    imul                r6d, 181
8044    add                 r6d, 2176
8045    sar                 r6d, 12
8046    movd                xm0, r6d
8047    paddsw              xm0, xm5
8048    vpbroadcastw         m0, xm0
8049.dconly_loop:
        ; Per row: add the biased DC to 4*32 bytes of dst with signed
        ; saturation, then remove the bias with an unsigned-saturating
        ; subtract.
        ; NOTE(review): this add/subtract pair presumably clamps the result
        ; to the 10-bit pixel range; the value of dconly_10bpc is defined
        ; outside this view - confirm.
8050    paddsw               m1, m0, [dstq+32*0]
8051    paddsw               m2, m0, [dstq+32*1]
8052    paddsw               m3, m0, [dstq+32*2]
8053    paddsw               m4, m0, [dstq+32*3]
8054    REPX    {psubusw x, m5}, m1, m2, m3, m4
8055    mova        [dstq+32*0], m1
8056    mova        [dstq+32*1], m2
8057    mova        [dstq+32*2], m3
8058    mova        [dstq+32*3], m4
8059    add                dstq, strideq
8060    dec                 r3d
8061    jg .dconly_loop
8062    RET
8063.normal:
8064    PROLOGUE              0, 8, 16, 32*96, dst, stride, c, eob
8065%undef cmp
        ; Constants kept live across .main: m11 = pd_2048, m12/m13 = 18-bit
        ; clip bounds, m14 = pd_2896.
8066    vpbroadcastd        m11, [pd_2048]
8067    vpbroadcastd        m12, [clip_18b_min]
8068    vpbroadcastd        m13, [clip_18b_max]
8069    vpbroadcastd        m14, [pd_2896]
8070    lea                  r6, [rsp+32*4]
8071    call .main
8072    call .shift_transpose
8073    cmp                eobd, 36
8074    jl .fast
8075    call .main
8076    call .shift_transpose
8077    jmp .pass2
8078.fast:
        ; Second .main call skipped: zero four groups of eight 32-byte slots
        ; starting at r6-32*4.
8079    pxor                 m0, m0
8080    mov                 r3d, 4
8081.fast_loop:
8082    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
8083    add                  r6, 32*8
8084    dec                 r3d
8085    jg .fast_loop
8086.pass2:
        ; r7 walks from r6-32*64 up to r4 = r6-32*32 in steps of 32*8, i.e.
        ; four iterations. r5 keeps the dst column base between iterations.
8087    lea                  r7, [r6-32*64]
8088    lea                  r4, [r6-32*32]
8089    lea                  r6, [pw_5+128]
8090    mov                  r5, dstq
8091.pass2_loop:
        ; Inputs 0-7 come from the group at r7 and inputs 8-15 from the
        ; group 32*32 bytes further on.
8092    mova                 m0, [r7-32*4]
8093    mova                 m1, [r7-32*3]
8094    mova                 m2, [r7-32*2]
8095    mova                 m3, [r7-32*1]
8096    mova                 m4, [r7+32*0]
8097    mova                 m5, [r7+32*1]
8098    mova                 m6, [r7+32*2]
8099    mova                 m7, [r7+32*3]
8100    add                  r7, 32*32
8101    mova                 m8, [r7-32*4]
8102    mova                 m9, [r7-32*3]
8103    mova                m10, [r7-32*2]
8104    mova                m11, [r7-32*1]
8105    mova                m12, [r7+32*0]
8106    mova                m13, [r7+32*1]
8107    mova                m14, [r7+32*2]
8108    mova                m15, [r7+32*3]
        ; Net advance of r7 per iteration is 32*32 - 32*24 = 32*8.
8109    sub                  r7, 32*24
        ; m15 is handed to the 8bpc idct16 through [rsp]; m1 is reloaded from
        ; [rsp+32*1] afterwards.
        ; NOTE(review): the stack-slot convention belongs to
        ; idct_16x16_internal_8bpc.main - confirm there.
8110    mova              [rsp], m15
8111    call m(idct_16x16_internal_8bpc).main
8112    mova                 m1, [rsp+32*1]
8113    call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
        ; Next column strip: 32 bytes to the right of the previous base.
8114    add                  r5, 32
8115    mov                dstq, r5
8116    cmp                  r7, r4
8117    jl .pass2_loop
8118    RET
8119ALIGN function_align
8120.main:
        ; Processes one 32-byte-wide column group of the coefficient buffer
        ; (rows at a 64-byte pitch). r5 points at the idct64 multiplier
        ; table consumed by main_part1.
8121    lea                  r5, [idct64_mul_16bpc]
        ; Odd rows, four per main_part1 call.
8122    mova                 m0, [cq+64* 1]
8123    mova                 m1, [cq+64*31]
8124    mova                 m2, [cq+64*17]
8125    mova                 m3, [cq+64*15]
8126    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8127    mova                 m0, [cq+64* 7]
8128    mova                 m1, [cq+64*25]
8129    mova                 m2, [cq+64*23]
8130    mova                 m3, [cq+64* 9]
8131    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8132    mova                 m0, [cq+64* 5]
8133    mova                 m1, [cq+64*27]
8134    mova                 m2, [cq+64*21]
8135    mova                 m3, [cq+64*11]
8136    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8137    mova                 m0, [cq+64* 3]
8138    mova                 m1, [cq+64*29]
8139    mova                 m2, [cq+64*19]
8140    mova                 m3, [cq+64*13]
8141    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8142    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; Rows 2,14,18,30 and 6,10,22,26 -> idct32 odd half (fast variants).
8143    mova                 m0, [cq+64* 2]
8144    mova                 m1, [cq+64*14]
8145    mova                 m2, [cq+64*18]
8146    mova                 m3, [cq+64*30]
8147    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
8148    mova                 m0, [cq+64* 6]
8149    mova                 m1, [cq+64*10]
8150    mova                 m2, [cq+64*22]
8151    mova                 m3, [cq+64*26]
8152    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
        ; Rows 4,12,20,28 -> idct16 odd half (fast variant).
8153    mova                 m0, [cq+64* 4]
8154    mova                 m1, [cq+64*12]
8155    mova                 m2, [cq+64*20]
8156    mova                 m3, [cq+64*28]
8157    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
        ; Rows 0,8,16,24 stay in m0-m3 for the idct8 call in .main_end2.
8158    mova                 m0, [cq+64* 0]
8159    mova                 m1, [cq+64* 8]
8160    mova                 m2, [cq+64*16]
8161    mova                 m3, [cq+64*24]
        ; Clear the rows just consumed: r7 runs 64*30, 64*26, ..., 64*2 and
        ; each pass stores zero to four rows, covering rows 31 down to 0.
8162    pxor                m15, m15
8163    mov                 r7d, 64*30
8164.main_zero_loop:
8165    mova       [cq+r7-64*2], m15
8166    mova       [cq+r7-64*1], m15
8167    mova       [cq+r7+64*0], m15
8168    mova       [cq+r7+64*1], m15
8169    sub                 r7d, 64*4
8170    jg .main_zero_loop
8171.main_end:
        ; m15 = rounding constant that .main_end_loop adds before the last
        ; butterfly; it pairs with the shift of 2 in .shift_transpose.
8172    psrld               m15, m11, 10 ; pd_2
8173.main_end2:
        ; Entry for callers that preset m15 themselves. Steps cq to the next
        ; column group, zeroes m4-m7 (only m0-m3 carry input) and runs the
        ; idct8 and idct16 even-half helpers.
8174    add                  cq, 32
8175    pxor                 m4, m4
8176    REPX       {mova x, m4}, m5, m6, m7
8177    call m(idct_8x8_internal_10bpc).main
8178    add                  r6, 32*8
8179    call m(idct_8x16_internal_10bpc).main_evenhalf
        ; Spill m1-m7 in descending address order; m0 stays in its register
        ; for the first loop iteration, which is why the loop is entered at
        ; .main_end_loop_start.
8180    mova          [r6+32*2], m1
8181    mova          [r6+32*1], m2
8182    mova          [r6+32*0], m3
8183    mova          [r6-32*1], m4
8184    mova          [r6-32*2], m5
8185    mova          [r6-32*3], m6
8186    mova          [r6-32*4], m7
8187    jmp .main_end_loop_start
8188.main_end_loop:
8189    mova                 m0, [r6+32* 3] ; idct8  0  + n
8190.main_end_loop_start:
        ; r5 ascends and r6 descends by 32 bytes per iteration; the loop
        ; stops once they meet.
8191    mova                 m1, [r5+32* 4] ; idct16 15 - n
8192    mova                 m2, [r5-32*12] ; idct32 16 + n
8193    mova                 m3, [r6-32*13] ; idct32 31 - n
8194    mova                 m4, [r6-32*29] ; idct64 63 - n
8195    mova                 m5, [r5-32*28] ; idct64 48 + n
8196    mova                 m6, [r6-32*45] ; idct64 47 - n
8197    mova                 m7, [r5-32*44] ; idct64 32 + n
8198    paddd                m8, m0, m1     ; idct16 out0  + n
8199    psubd                m0, m1         ; idct16 out15 - n
8200    REPX    {pmaxsd x, m12}, m8, m0
8201    REPX    {pminsd x, m13}, m8, m0
8202    paddd                m1, m8, m3     ; idct32 out0  + n
8203    psubd                m8, m3         ; idct32 out31 - n
8204    paddd                m3, m0, m2     ; idct32 out15 - n
8205    psubd                m0, m2         ; idct32 out16 + n
8206    REPX    {pmaxsd x, m12}, m1, m8, m3, m0
8207    REPX    {pminsd x, m13}, m1, m3, m8, m0
        ; Fold the rounding constant in once, ahead of the final add/sub, so
        ; both outputs of each pair carry it.
8208    REPX    {paddd  x, m15}, m1, m3, m0, m8
8209    paddd                m2, m1, m4     ; idct64 out0  + n (unshifted)
8210    psubd                m1, m4         ; idct64 out63 - n (unshifted)
8211    paddd                m4, m3, m5     ; idct64 out15 - n (unshifted)
8212    psubd                m3, m5         ; idct64 out48 + n (unshifted)
8213    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
8214    psubd                m0, m6         ; idct64 out47 - n (unshifted)
8215    paddd                m6, m8, m7     ; idct64 out31 - n (unshifted)
8216    psubd                m8, m7         ; idct64 out32 + n (unshifted)
        ; Results overwrite the slots the inputs were loaded from.
8217    mova         [r5-32*44], m2
8218    mova         [r6+32* 3], m1
8219    mova         [r6-32*45], m4
8220    mova         [r5+32* 4], m3
8221    mova         [r5-32*28], m5
8222    mova         [r6-32*13], m0
8223    mova         [r6-32*29], m6
8224    mova         [r5-32*12], m8
8225    add                  r5, 32
8226    sub                  r6, 32
8227    cmp                  r5, r6
8228    jl .main_end_loop
8229    ret
8230.shift_transpose:
        ; IDCT64_SHIFT_TRANSPOSE shift
        ; Rewinds r6 by 32*48, then per iteration reads 16 dword slots at r6
        ; (slot k paired with slot k+8), shifts them right arithmetically by
        ; `shift`, packs each pair to words with signed saturation, calls
        ; transpose3 and writes eight slots at r5. r6 advances by 32*16 and
        ; r5 by 32*8 until r5 reaches r4; r6 is left equal to r4.
        ; NOTE(review): r4 is not set in this macro; it is presumably left
        ; by an earlier helper - confirm.
8231%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
8232    sub                  r6, 32*48
8233    mov                  r5, r6
8234%%loop:
8235    mova                 m0, [r6-32* 4]
8236    mova                 m4, [r6+32* 4]
8237    mova                 m1, [r6-32* 3]
8238    mova                 m5, [r6+32* 5]
8239    mova                 m2, [r6-32* 2]
8240    mova                 m6, [r6+32* 6]
8241    mova                 m3, [r6-32* 1]
8242    mova                 m7, [r6+32* 7]
8243    REPX      {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
8244    packssdw             m0, m4
8245    packssdw             m1, m5
8246    packssdw             m2, m6
8247    packssdw             m3, m7
8248    mova                 m4, [r6+32* 0]
8249    mova                 m6, [r6+32* 8]
8250    mova                 m5, [r6+32* 1]
8251    mova                 m7, [r6+32* 9]
8252    REPX      {psrad x, %1}, m4, m6, m5, m7
8253    packssdw             m4, m6
8254    packssdw             m5, m7
8255    mova                 m6, [r6+32* 2]
8256    mova                 m8, [r6+32*10]
8257    mova                 m7, [r6+32* 3]
8258    mova                 m9, [r6+32*11]
8259    REPX      {psrad x, %1}, m6, m8, m7, m9
8260    packssdw             m6, m8
8261    packssdw             m7, m9
8262    call m(idct_16x8_internal_10bpc).transpose3
8263    mova          [r5-32*4], m0
8264    mova          [r5-32*3], m1
8265    mova          [r5-32*2], m2
8266    mova          [r5-32*1], m3
8267    mova          [r5+32*0], m4
8268    mova          [r5+32*1], m5
8269    mova          [r5+32*2], m6
8270    mova          [r5+32*3], m7
8271    add                  r6, 32*16
8272    add                  r5, 32*8
8273    cmp                  r5, r4
8274    jl %%loop
8275    mov                  r6, r4
8276%endmacro
        ; The 64x16 variant shifts by 2, matching the pd_2 rounding constant
        ; set at .main_end.
8277    IDCT64_SHIFT_TRANSPOSE 2
8278    ret
8279
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x32_10bpc(dst, stride, c, eob)
;
; eob == 0 branches to .dconly, which scales the coefficient at [cq] and
; tail-jumps into the 64x16 function's .dconly2 with r3d = 32.
; Otherwise PROLOGUE re-declares the function with 8 GPRs, 16 vector
; registers and 32*163 bytes of stack. .main is then called one to four
; times (eob thresholds 36, 136, 300), .fast zero-fills scratch from r6 up to
; rsp+32*135, and .pass2 runs the 8bpc idct32 odd-half and idct16 helpers
; followed by the 16x32 10bpc pass2_end helper.
; ---------------------------------------------------------------------------
8280cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
8281    test               eobd, eobd
8282    jz .dconly
8283    PROLOGUE              0, 8, 16, 32*163, dst, stride, c, eob
8284%undef cmp
        ; Constants kept live across .main: m11 = pd_2048, m12/m13 = 18-bit
        ; clip bounds, m14 = pd_2896 (the rect2 multiplier used in .main).
8285    vpbroadcastd        m11, [pd_2048]
8286    vpbroadcastd        m12, [clip_18b_min]
8287    vpbroadcastd        m13, [clip_18b_max]
8288    vpbroadcastd        m14, [pd_2896]
8289    lea                  r6, [rsp+32*7]
        ; One .main call per eob tier.
8290    call .main
8291    cmp                eobd, 36
8292    jl .fast
8293    call .main
8294    cmp                eobd, 136
8295    jl .fast
8296    call .main
8297    cmp                eobd, 300
8298    jl .fast
8299    call .main
8300    jmp .pass2
8301.dconly:
        ; r6d = ((((dc*181 + 128) >> 8) * 181) + 384) >> 9
        ; eob (r3d) is zero on this path, so the OR leaves r3d = 32, the row
        ; count used by the 64x16 .dconly_loop.
8302    imul                r6d, [cq], 181
8303    mov                [cq], eobd ; 0
8304    or                  r3d, 32
8305    add                 r6d, 128
8306    sar                 r6d, 8
8307    imul                r6d, 181
8308    add                 r6d, 384
8309    sar                 r6d, 9
8310    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
8311.fast:
        ; Zero eight 32-byte slots per iteration, [r6-32*4] .. [r6+32*3],
        ; until r6 reaches rsp+32*135.
8312    pxor                 m0, m0
8313    lea                  r4, [rsp+32*135]
8314.fast_loop:
8315    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
8316    add                  r6, 32*8
8317    cmp                  r6, r4
8318    jl .fast_loop
8319.pass2:
        ; r7 = read cursor (r6-32*32), r5 = r6+32*8, r3 = stride*3 and
        ; r2 = dst + stride*19. r2 held the coefficient pointer until here
        ; and is repurposed now that pass 1 is finished.
        ; NOTE(review): r2 and r5 are presumably inputs to pass2_end; r4 (the
        ; loop bound) is whatever .fast or the last .main call left - confirm.
8320    lea                  r7, [r6-32*32]
8321    lea                  r5, [r6+32*8]
8322    lea                  r6, [pw_5+128]
8323    imul                 r2, strideq, 19
8324    lea                  r3, [strideq*3]
8325    add                  r2, dstq
8326.pass2_loop:
        ; Odd-offset slots feed the 8bpc idct32 odd half.
8327    mova                 m0, [r7-32*99]
8328    mova                 m1, [r7-32*97]
8329    mova                 m2, [r7-32*95]
8330    mova                 m3, [r7-32*93]
8331    mova                 m4, [r7-32*67]
8332    mova                 m5, [r7-32*65]
8333    mova                 m6, [r7-32*63]
8334    mova                 m7, [r7-32*61]
8335    mova                 m8, [r7-32*35]
8336    mova                 m9, [r7-32*33]
8337    mova                m10, [r7-32*31]
8338    mova                m11, [r7-32*29]
8339    mova                m12, [r7-32* 3]
8340    mova                m13, [r7-32* 1]
8341    mova                m14, [r7+32* 1]
8342    mova                m15, [r7+32* 3]
8343    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
        ; Even-offset slots feed the 8bpc idct16; m15 is passed via [rsp].
8344    mova                 m0, [r7-32*100]
8345    mova                 m1, [r7-32*98]
8346    mova                 m2, [r7-32*96]
8347    mova                 m3, [r7-32*94]
8348    mova                 m4, [r7-32*68]
8349    mova                 m5, [r7-32*66]
8350    mova                 m6, [r7-32*64]
8351    mova                 m7, [r7-32*62]
8352    mova                 m8, [r7-32*36]
8353    mova                 m9, [r7-32*34]
8354    mova                m10, [r7-32*32]
8355    mova                m11, [r7-32*30]
8356    mova                m12, [r7-32* 4]
8357    mova                m13, [r7-32* 2]
8358    mova                m14, [r7+32* 0]
8359    mova                m15, [r7+32* 2]
8360    add                  r7, 32*8
8361    mova              [rsp], m15
8362    call m(idct_16x16_internal_8bpc).main
8363    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
        ; Rewind dst by stride*3, then move both dst and r2 32 bytes right.
        ; NOTE(review): the rewind amounts depend on how pass2_end leaves
        ; dstq and r2 - confirm in that helper.
8364    sub                dstq, r3
8365    lea                  r2, [r2+r3+32]
8366    add                dstq, 32
8367    cmp                  r7, r4
8368    jl .pass2_loop
8369    RET
8370ALIGN function_align
8371.main:
        ; Processes one 32-byte-wide column group of the coefficient buffer
        ; (rows at a 128-byte pitch). Every input row is multiplied by
        ; pd_2896 (m14) and handed to a *_rect2 helper entry point.
        ; NOTE(review): the matching round/shift for the odd rows is
        ; presumably done inside the *_rect2 helpers - confirm.
8372    lea                  r5, [idct64_mul_16bpc]
        ; Odd rows, four per main_part1_rect2 call.
8373    pmulld               m0, m14, [cq+128* 1]
8374    pmulld               m1, m14, [cq+128*31]
8375    pmulld               m2, m14, [cq+128*17]
8376    pmulld               m3, m14, [cq+128*15]
8377    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
8378    pmulld               m0, m14, [cq+128* 7]
8379    pmulld               m1, m14, [cq+128*25]
8380    pmulld               m2, m14, [cq+128*23]
8381    pmulld               m3, m14, [cq+128* 9]
8382    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
8383    pmulld               m0, m14, [cq+128* 5]
8384    pmulld               m1, m14, [cq+128*27]
8385    pmulld               m2, m14, [cq+128*21]
8386    pmulld               m3, m14, [cq+128*11]
8387    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
8388    pmulld               m0, m14, [cq+128* 3]
8389    pmulld               m1, m14, [cq+128*29]
8390    pmulld               m2, m14, [cq+128*19]
8391    pmulld               m3, m14, [cq+128*13]
8392    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
8393    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; Rows 2,14,18,30 and 6,10,22,26 -> idct32 odd half (fast variants).
8394    pmulld               m0, m14, [cq+128* 2]
8395    pmulld               m1, m14, [cq+128*14]
8396    pmulld               m2, m14, [cq+128*18]
8397    pmulld               m3, m14, [cq+128*30]
8398    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
8399    pmulld               m0, m14, [cq+128* 6]
8400    pmulld               m1, m14, [cq+128*10]
8401    pmulld               m2, m14, [cq+128*22]
8402    pmulld               m3, m14, [cq+128*26]
8403    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
        ; Rows 4,12,20,28 -> idct16 odd half (fast variant).
8404    pmulld               m0, m14, [cq+128* 4]
8405    pmulld               m1, m14, [cq+128*12]
8406    pmulld               m2, m14, [cq+128*20]
8407    pmulld               m3, m14, [cq+128*28]
8408    call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
        ; Rows 0,8,16,24 stay in m0-m3; their scaling is finished after the
        ; zero loop below.
8409    pmulld               m0, m14, [cq+128* 0]
8410    pmulld               m1, m14, [cq+128* 8]
8411    pmulld               m2, m14, [cq+128*16]
8412    pmulld               m3, m14, [cq+128*24]
        ; Clear the rows just consumed: r7 runs 128*29, 128*25, ..., 128*1
        ; and each pass stores zero to four rows, covering rows 31 down to 0.
8413    pxor                m15, m15
8414    mov                 r7d, 128*29
8415.main_zero_loop:
8416    mova      [cq+r7-128*1], m15
8417    mova      [cq+r7+128*0], m15
8418    mova      [cq+r7+128*1], m15
8419    mova      [cq+r7+128*2], m15
8420    sub                 r7d, 128*4
8421    jg .main_zero_loop
        ; m15 = rounding constant for the 64x16 .main_end2 code; it pairs
        ; with the shift of 1 below. Then finish the rect2 scaling of m0-m3:
        ; (x*2896 + 2048) >> 12.
8422    psrld               m15, m11, 11 ; pd_1
8423    REPX     {paddd x, m11}, m0, m1, m2, m3
8424    REPX     {psrad x, 12 }, m0, m1, m2, m3
8425    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
        ; Same shift/pack/transpose step as the 64x16 function, but with a
        ; shift of 1 (the macro is defined in that function).
8426    IDCT64_SHIFT_TRANSPOSE 1
8427    ret
8428
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x64_10bpc(dst, stride, c, eob)
;
; eob == 0 branches to .dconly, which tail-jumps into the 64x16 function's
; .dconly with r6d = dc*181 and r3d = 64.
; Otherwise PROLOGUE re-declares the function with 11 GPRs, 16 vector
; registers and 32*195 bytes of stack. .main is then called one to four
; times (eob thresholds 36, 136, 300), .fast zero-fills scratch from r6 up to
; rsp+32*135, and .pass2 runs the 8bpc idct16 / idct32 odd-half / idct64
; helpers plus the 10bpc main_part2_pass2 helper over that scratch.
; ---------------------------------------------------------------------------
8429cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
8430    test               eobd, eobd
8431    jz .dconly
8432    PROLOGUE              0, 11, 16, 32*195, dst, stride, c, eob
8433%undef cmp
        ; Constants kept live across .main: m11 = pd_2048, m12/m13 = 18-bit
        ; clip bounds, m14 = pd_2896.
8434    vpbroadcastd        m11, [pd_2048]
8435    vpbroadcastd        m12, [clip_18b_min]
8436    vpbroadcastd        m13, [clip_18b_max]
8437    vpbroadcastd        m14, [pd_2896]
8438    lea                  r6, [rsp+32*7]
        ; One .main call per eob tier.
8439    call .main
8440    cmp                eobd, 36
8441    jl .fast
8442    call .main
8443    cmp                eobd, 136
8444    jl .fast
8445    call .main
8446    cmp                eobd, 300
8447    jl .fast
8448    call .main
8449    jmp .pass2
8450.dconly:
        ; eob (r3d) is zero on this path, so the OR leaves r3d = 64, the row
        ; count used by the 64x16 .dconly_loop.
8451    imul                r6d, [cq], 181
8452    mov                [cq], eobd ; 0
8453    or                  r3d, 64
8454    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
8455.fast:
        ; Zero eight 32-byte slots per iteration, [r6-32*4] .. [r6+32*3],
        ; until r6 reaches rsp+32*135.
8456    pxor                 m0, m0
8457    lea                  r4, [rsp+32*135]
8458.fast_loop:
8459    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
8460    add                  r6, 32*8
8461    cmp                  r6, r4
8462    jl .fast_loop
8463.pass2:
        ; r10 = read cursor (r6-32*32); r8/r9/r3/r7 = stride*4 / *5 / *6 / *7.
        ; NOTE(review): r4 is used as a store base below without being set
        ; here; it is whatever .fast or the last .main call left - confirm.
8464    lea                 r10, [r6-32*32]
8465    lea                  r6, [pw_5+128]
8466    lea                  r8, [strideq*4]
8467    lea                  r9, [strideq*5]
8468    lea                  r3, [r9+strideq*1] ; stride*6
8469    lea                  r7, [r9+strideq*2] ; stride*7
8470.pass2_loop:
        ; Stage 1: inputs 0,4,...,28 go to the 8bpc idct16 with m8-m14 and
        ; the [rsp] slot zeroed; m1 is reloaded from [rsp+32*1] afterwards.
        ; NOTE(review): the stack-slot convention belongs to
        ; idct_16x16_internal_8bpc.main - confirm there.
8471    mova                 m0, [r10-32*100] ; in0
8472    mova                 m1, [r10-32*96]  ; in4
8473    mova                 m2, [r10-32*68]  ; in8
8474    mova                 m3, [r10-32*64]  ; in12
8475    mova                 m4, [r10-32*36]  ; in16
8476    mova                 m5, [r10-32*32]  ; in20
8477    mova                 m6, [r10-32* 4]  ; in24
8478    mova                 m7, [r10+32* 0]  ; in28
8479    pxor                 m8, m8
8480    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
8481    mova              [rsp], m8
8482    call m(idct_16x16_internal_8bpc).main
8483    mova                 m1, [rsp+32*1]
        ; Park the 16 results in two groups of eight slots at r4.
8484    mova          [r4-32*4], m0
8485    mova          [r4-32*3], m1
8486    mova          [r4-32*2], m2
8487    mova          [r4-32*1], m3
8488    mova          [r4+32*0], m4
8489    mova          [r4+32*1], m5
8490    mova          [r4+32*2], m6
8491    mova          [r4+32*3], m7
8492    add                  r4, 32*8
8493    mova          [r4-32*4], m8
8494    mova          [r4-32*3], m9
8495    mova          [r4-32*2], m10
8496    mova          [r4-32*1], m11
8497    mova          [r4+32*0], m12
8498    mova          [r4+32*1], m13
8499    mova          [r4+32*2], m14
8500    mova          [r4+32*3], m15
        ; Stage 2: inputs 2,6,...,30 go to the 8bpc idct32 odd half (fast
        ; variant); r4/r5 are set up as that helper's scratch pointers.
8501    mova                 m0, [r10-32*98] ; in2
8502    mova                 m1, [r10-32*94] ; in6
8503    mova                 m2, [r10-32*66] ; in10
8504    mova                 m3, [r10-32*62] ; in14
8505    mova                 m4, [r10-32*34] ; in18
8506    mova                 m5, [r10-32*30] ; in22
8507    mova                 m6, [r10-32* 2] ; in26
8508    mova                 m7, [r10+32* 2] ; in30
8509    lea                  r5, [r4+32*16]
8510    add                  r4, 32*8
8511    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
        ; Stage 3: the odd inputs go to the 8bpc idct64 main_part1 in two
        ; batches; r6 walks the idct64_mul table by 8 bytes between batches.
8512    mova                 m0, [r10-32*99] ; in1
8513    mova                 m1, [r10+32* 3] ; in31
8514    mova                 m2, [r10-32*35] ; in17
8515    mova                 m3, [r10-32*61] ; in15
8516    mova                 m4, [r10-32*67] ; in9
8517    mova                 m5, [r10-32*29] ; in23
8518    mova                 m6, [r10-32* 3] ; in25
8519    mova                 m7, [r10-32*93] ; in7
8520    lea                  r6, [idct64_mul - 8]
8521    add                  r4, 32*16
8522    add                  r5, 32*32
8523    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
8524    mova                 m0, [r10-32*95] ; in5
8525    mova                 m1, [r10-32* 1] ; in27
8526    mova                 m2, [r10-32*31] ; in21
8527    mova                 m3, [r10-32*65] ; in11
8528    mova                 m4, [r10-32*63] ; in13
8529    mova                 m5, [r10-32*33] ; in19
8530    mova                 m6, [r10+32* 1] ; in29
8531    mova                 m7, [r10-32*97] ; in3
8532    add                  r6, 8
8533    add                  r4, 32*8
8534    sub                  r5, 32*8
8535    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
8536    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
        ; Advance the read cursor by eight slots, rewind r4 by 32*44 for the
        ; next iteration's stores, and move dst 32 bytes right after
        ; rewinding it by stride*4. The loop runs while r10 < r4.
        ; NOTE(review): the rewind amounts depend on how main_part2_pass2
        ; leaves dstq and r4 - confirm in that helper.
8537    add                 r10, 32*8
8538    sub                dstq, r8
8539    sub                  r4, 32*44
8540    add                dstq, 32
8541    cmp                 r10, r4
8542    jl .pass2_loop
8543    RET
8544ALIGN function_align
8545.main:
        ; Processes one 32-byte-wide column group of the coefficient buffer
        ; (rows at a 128-byte pitch). r5 points at the idct64 multiplier
        ; table consumed by main_part1.
8546    lea                  r5, [idct64_mul_16bpc]
        ; Odd rows, four per main_part1 call.
8547    mova                 m0, [cq+128* 1]
8548    mova                 m1, [cq+128*31]
8549    mova                 m2, [cq+128*17]
8550    mova                 m3, [cq+128*15]
8551    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8552    mova                 m0, [cq+128* 7]
8553    mova                 m1, [cq+128*25]
8554    mova                 m2, [cq+128*23]
8555    mova                 m3, [cq+128* 9]
8556    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8557    mova                 m0, [cq+128* 5]
8558    mova                 m1, [cq+128*27]
8559    mova                 m2, [cq+128*21]
8560    mova                 m3, [cq+128*11]
8561    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8562    mova                 m0, [cq+128* 3]
8563    mova                 m1, [cq+128*29]
8564    mova                 m2, [cq+128*19]
8565    mova                 m3, [cq+128*13]
8566    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
8567    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
        ; Rows 2,14,18,30 and 6,10,22,26 -> idct32 odd half (fast variants).
8568    mova                 m0, [cq+128* 2]
8569    mova                 m1, [cq+128*14]
8570    mova                 m2, [cq+128*18]
8571    mova                 m3, [cq+128*30]
8572    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
8573    mova                 m0, [cq+128* 6]
8574    mova                 m1, [cq+128*10]
8575    mova                 m2, [cq+128*22]
8576    mova                 m3, [cq+128*26]
8577    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
        ; Rows 4,12,20,28 -> idct16 odd half (fast variant).
8578    mova                 m0, [cq+128* 4]
8579    mova                 m1, [cq+128*12]
8580    mova                 m2, [cq+128*20]
8581    mova                 m3, [cq+128*28]
8582    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
        ; Rows 0,8,16,24 stay in m0-m3 for the idct8 call inside .main_end.
8583    mova                 m0, [cq+128* 0]
8584    mova                 m1, [cq+128* 8]
8585    mova                 m2, [cq+128*16]
8586    mova                 m3, [cq+128*24]
        ; Clear the rows just consumed: r7 runs 128*29, 128*25, ..., 128*1
        ; and each pass stores zero to four rows, covering rows 31 down to 0.
8587    pxor                m15, m15
8588    mov                 r7d, 128*29
8589.main_zero_loop:
8590    mova      [cq+r7-128*1], m15
8591    mova      [cq+r7+128*0], m15
8592    mova      [cq+r7+128*1], m15
8593    mova      [cq+r7+128*2], m15
8594    sub                 r7d, 128*4
8595    jg .main_zero_loop
        ; Reuse the 64x16 tail: .main_end sets the pd_2 rounding constant and
        ; combines the stages; .shift_transpose is entered with a jmp, so its
        ; ret returns directly to the caller of .main.
8596    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
8597    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
8598
8599%endif ; ARCH_X86_64
8600