xref: /aosp_15_r20/external/libdav1d/src/x86/itx16_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1; Copyright © 2021, VideoLAN and dav1d authors
2; Copyright © 2021, Two Orioles, LLC
3; Copyright © 2017-2021, The rav1e contributors
4; Copyright © 2020, Nathan Egge
5; Copyright © 2021, Matthias Dressel
6; All rights reserved.
7;
8; Redistribution and use in source and binary forms, with or without
9; modification, are permitted provided that the following conditions are met:
10;
11; 1. Redistributions of source code must retain the above copyright notice, this
12;    list of conditions and the following disclaimer.
13;
14; 2. Redistributions in binary form must reproduce the above copyright notice,
15;    this list of conditions and the following disclaimer in the documentation
16;    and/or other materials provided with the distribution.
17;
18; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29%include "config.asm"
30%include "ext/x86/x86inc.asm"
31
32SECTION_RODATA
; COEF value[, flag]
; Defines pd_<value> as <value> replicated into four dwords. When a second
; argument is passed (only the argument count is checked, not its value),
; pd_m<value> holding -<value> is defined as well.
33%macro COEF 1-2
34pd_%1: times 4 dd %1
35%if %0 == 2
36pd_m%1: times 4 dd -%1
37%endif
38%endmacro
39
40COEF  201
41COEF  401
42COEF  601, 1
43COEF  799
44COEF  995
45COEF 1189, 1
46COEF 1380, 1
47COEF 1567
48COEF 1751
49COEF 1931
50COEF 2106, 1
51COEF 2276, 1
52COEF 2440
53COEF 2598, 1
54COEF 2751, 1
55COEF 2896
56COEF 3035
57COEF 3166
58COEF 3290
59COEF 3406
60COEF 3513
61COEF 3612
62COEF 3703
63COEF 3784
64COEF 3857
65COEF 3920
66COEF 3973
67COEF 4017
68COEF 4052
69COEF 4076
70COEF 4091
71
; Naming used by most constants below: pd_N is N replicated into four dwords,
; pw_N is N replicated into eight words, an "m" in front of a number marks a
; negated value, and pw_A_B is the word pair A, B repeated four times.
72deint_shuf:  db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
73
74%if ARCH_X86_32
75pd_1:            times 4 dd     1
76%endif
77pd_2:            times 4 dd     2
78pw_5:            times 8 dw     5
79pd_1321:         times 4 dd  1321
80pd_2482:         times 4 dd  2482
81pd_m3344:        times 4 dd -3344
82pd_2048:         times 4 dd  2048
; four words of +2048 followed by four words of -2048 (and the reverse below)
83pw_4x2048_4xm2048: times 4 dw 2048
84                   times 4 dw -2048
85pw_4xm2048_4x2048: times 4 dw -2048
86                   times 4 dw 2048
87pw_2048:         times 8 dw  2048
88pw_m2048:        times 8 dw  -2048
89pd_3803:         times 4 dd  3803
90pw_4096:         times 8 dw  4096
91pd_5793:         times 4 dd  5793
92pd_6144:         times 4 dd  6144
93pw_8192:         times 8 dw  8192
94pd_10240:        times 4 dd 10240
95pd_11586:        times 4 dd 11586
96pw_1697x8:       times 8 dw  1697*8
97pw_2896x8:       times 8 dw  2896*8
98pw_1697x16:      times 8 dw  1697*16
99pw_16384:        times 8 dw 16384
; upper clamp applied to output pixels by the 10 bpc store paths in this file
100pixel_10bpc_max: times 8 dw  0x03ff
101
102pw_1567_3784:    times 4 dw  1567,  3784
103pw_m3784_1567:   times 4 dw -3784,  1567
104pw_2896_2896:    times 4 dw  2896,  2896
105pw_m2896_2896:   times 4 dw -2896,  2896
106
107clip_18b_min: times 4 dd -0x20000
108clip_18b_max: times 4 dd  0x1ffff
109
110idct64_mul_16bpc:
111dd 4095,  101, 2967, -2824,  3745, 1660, 3822, -1474,   401,  4076,   799,  4017
112dd -700, 4036, 2359,  3349, -2191, 3461,  897,  3996, -2598, -3166, -4017,  -799
113dd 4065,  501, 3229, -2520,  3564, 2019, 3948, -1092,  1931,  3612,  3406,  2276
114dd -301, 4085, 2675,  3102, -1842, 3659, 1285,  3889, -1189, -3920, -2276, -3406
115
116cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
117cextern iadst_4x4_internal_8bpc_ssse3.main
118cextern idct_4x8_internal_8bpc_ssse3.main
119cextern iadst_4x8_internal_8bpc_ssse3.main
120cextern idct_16x4_internal_8bpc_ssse3.main
121cextern iadst_16x4_internal_8bpc_ssse3.main
122cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
123cextern idct_8x4_internal_8bpc_ssse3.main
124cextern iadst_8x4_internal_8bpc_ssse3.main
125cextern idct_8x8_internal_8bpc_ssse3.main
126cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
127cextern iadst_8x8_internal_8bpc_ssse3.main
128cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
129cextern idct_16x8_internal_8bpc_ssse3.main
130cextern iadst_16x8_internal_8bpc_ssse3.main
131cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
132cextern idct_8x32_internal_8bpc_ssse3.main
133cextern idct_8x32_internal_8bpc_ssse3.main_fast
134cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
135cextern idct_16x64_internal_8bpc_ssse3.main
136cextern idct_16x64_internal_8bpc_ssse3.main_fast
137
; Lookup tables. The tbl_4x16_* tables are passed to the first-pass routine in
; r5 (INV_TXFM_4X16_FN hands tbl_4x16_<suffix> to INV_TXFM_FN as a non-numeric
; eob_offset), where idct_4x16_internal_16bpc compares eob against the entries
; from last to first.
; NOTE(review): the users of the remaining tables are outside this view;
; presumably the *_2d/_h/_v tables are consumed the same way - confirm.
138tbl_4x16_2d: db 0, 13, 29, 45
139tbl_4x16_h: db 0, 16, 32, 48
140tbl_4x16_v: db 0, 4, 8, 12
141
142tbl_8x16_2d: db 0, 14, 30, 46
143tbl_8x16_v: db 0, 4, 8, 12
144tbl_8x16_h: db 0, 32, 64, 96
145
146tbl_16x16_2d: db 0, 10, 36, 78
147tbl_16x16_v: db 0, 4, 8, 12
148tbl_16x16_h: db 0, 64, 128, 192
149
150tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
151
152tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
153
154tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
155tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
156
157tbl_Nx32_odd_offset: db 2*16, 2*23
158                     db 2*20, 2*19
159                     db 2*18, 2*21
160                     db 2*22, 2*17
161                     db 2*30, 2*25
162                     db 2*26, 2*29
163                     db 2*28, 2*27
164                     db 2*24, 2*31
165
166tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
167                 db 2* 8, 2*40, 2*23, 2*38
168                 db 2* 1, 2*36, 2*20, 2*42
169                 db 2* 9, 2*44, 2*19, 2*34
170                 db 2* 2, 2*60, 2*18, 2*50
171                 db 2*10, 2*52, 2*21, 2*58
172                 db 2* 3, 2*56, 2*22, 2*54
173                 db 2*11, 2*48, 2*17, 2*62
174
175SECTION .text
176
; Symbol helpers: m_suffix(x, sfx) expands to the mangled name built from
; private_prefix, "_", x and sfx; m(x) does the same using the current SUFFIX.
177%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
178%define m(x) m_suffix(x, SUFFIX)
179
180; This refers to the first function in itx_sse i.e. the start of the text section
181; which is needed as a base pointer for constants.
182%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
183
; o(x) wraps the address of a constant or label. On x86-64 the symbol is used
; as-is; on x86-32 it is formed relative to r6, which INV_TXFM_FN loads with $$
; so that no absolute address is needed.
184%if ARCH_X86_64
185%define o(x) x
186%else
187%define o(x) r6-$$+x ; PIC
188%endif
189
; IWHT4_1D: one 1-D 4-point inverse WHT step on packed dwords.
; In:  m0-m3 = in0-in3
; Out: m0 = out0, m5 = out1, m2 = out2, m4 = out3
; m1 and m3 are only read, so they still hold in1/in3 afterwards.
190%macro IWHT4_1D 0
191    ; m0 = in0,  m1 = in1,  m2 = in2,  m3 = in3
192    paddd                m0, m1      ; in0 += in1
193    psubd                m4, m2, m3  ; tmp0 = in2 - in3
194    psubd                m5, m0, m4  ; tmp1 = (in0 - tmp0) >> 1
195    psrad                m5, 1
196    psubd                m2, m5, m1  ; in2 = tmp1 - in1
197    psubd                m5, m3      ; in1 = tmp1 - in3
198    psubd                m0, m5      ; in0 -= in1
199    paddd                m4, m2      ; in3 = tmp0 + in2
200    ; m0 = out0,  m1 = in1,  m2 = out2,  m3 = in3
201    ; m4 = out3,  m5 = out1
202%endmacro
203
; inv_txfm_add_wht_wht_4x4_16bpc(dst, stride, c, eob, bdmax)
; Loads the four 16-byte dword coefficient vectors at cq, shifts them right by
; 2, runs IWHT4_1D, transposes the 4x4 dword block, runs IWHT4_1D again and
; packs the result to words. The coefficient buffer is zeroed, the residual is
; added to the 4x4 block at dst with signed saturation, clamped to [0, bdmax]
; and stored.
204INIT_XMM sse2
205cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
206    mova                 m0, [cq+16*0]
207    mova                 m1, [cq+16*1]
208    mova                 m2, [cq+16*2]
209    mova                 m3, [cq+16*3]
210    REPX       {psrad x, 2}, m0, m1, m2, m3
211    IWHT4_1D
    ; 4x4 dword transpose of out0-out3 (m0, m5, m2, m4) into m0, m1, m2, m3
212    punpckldq            m1, m0, m5
213    punpckhdq            m3, m0, m5
214    punpckldq            m5, m2, m4
215    punpckhdq            m2, m4
216    punpcklqdq           m0, m1, m5
217    punpckhqdq           m1, m5
218    punpcklqdq           m4, m3, m2
219    punpckhqdq           m3, m2
220    mova                 m2, m4
221    IWHT4_1D
222    packssdw             m0, m4 ; low: out0,  high: out3
223    packssdw             m2, m5 ; low: out2,  high: out1
224    pxor                 m4, m4
225    mova          [cq+16*0], m4
226    mova          [cq+16*1], m4
227    mova          [cq+16*2], m4
228    mova          [cq+16*3], m4
    ; load the destination rows in the same low/high pairing as m0 and m2:
    ; m1 = row0 | row3, m3 = row2 | row1 (r2 = dst + 2*stride)
229    lea                  r2, [dstq+strideq*2]
230    movq                 m1, [dstq+strideq*0]
231    movhps               m1, [r2  +strideq*1]
232    movq                 m3, [r2  +strideq*0]
233    movhps               m3, [dstq+strideq*1]
234    movd                 m5, bdmaxm
235    pshuflw              m5, m5, q0000  ; broadcast
236    punpcklqdq           m5, m5         ; broadcast
237    paddsw               m0, m1
238    paddsw               m2, m3
239    pmaxsw               m0, m4
240    pmaxsw               m2, m4
241    pminsw               m0, m5
242    pminsw               m2, m5
243    movhps [r2  +strideq*1], m0 ; write out3
244    movhps [dstq+strideq*1], m2 ; write out1
245    movq   [r2  +strideq*0], m2 ; write out2
246    movq   [dstq+strideq*0], m0 ; write out0
247    RET
248
249; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
250; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
251; flags: 2 = inv_dst1, 4 = inv_dst2
252; skip round/shift if rnd is not a number
; All register arguments are given as register numbers (the macro prepends
; "m"). A coefficient smaller than 32 is also taken as a register number
; holding the multiplier; otherwise it names the memory constant pd_<coef>.
; When coef1 and coef2 are textually identical only one multiply per source is
; emitted. tmp[1-3] are clobbered as needed.
253%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
254; %1 dst/src[1]
255; %2 dst/src[2]
256; %3 tmp[1]
257; %4 tmp[2]
258; %5 tmp[3]
259; %6 rnd
260; %7 coef[1]
261; %8 coef[2]
262; %9 flags
263%ifnidn %7,%8   ; optimize when coef1 == coef2
264%if %8 < 32
265    pmulld              m%4, m%1, m%8
266    pmulld              m%3, m%2, m%8
267%else
268    mova                m%3, [o(pd_%8)]
269    pmulld              m%4, m%1, m%3
270    pmulld              m%3, m%2
271%endif
272%endif
273%if %7 < 32
274    pmulld              m%1, m%7
275    pmulld              m%2, m%7
276%else
277    mova                m%5, [o(pd_%7)]
278    pmulld              m%1, m%5
279    pmulld              m%2, m%5
280%endif
281%if %9 & 4  ; invert dst2
282    paddd               m%4, m%2
283    psubd               m%2, m%6, m%4
284%else
285%ifnum %6
286%ifnidn %7,%8
287    paddd               m%4, m%6
288%else
289    paddd               m%1, m%6
290%endif
291%endif
292%ifnidn %7,%8
293    paddd               m%2, m%4
294%else
    ; coef1 == coef2: keep src2*coef in tmp1 for the dst1 subtraction below
295    mova                m%3, m%2
296    paddd               m%2, m%1
297%endif
298%endif
299%if %9 & 2  ; invert dst1
300    psubd               m%3, m%1
301    paddd               m%1, m%3, m%6
302%else
303%ifnum %6
304%ifnidn %7,%8
305    paddd               m%1, m%6
306%endif
307%endif
308    psubd               m%1, m%3
309%endif
310%ifnum %6
311    psrad               m%2, 12
312    psrad               m%1, 12
313%endif
314%endmacro
315
; INV_TXFM_FN type1, type2, eob_offset, size[, mmsize/stack]
; Emits the public entry point inv_txfm_add_<type1>_<type2>_<size>_16bpc.
; The entry point loads tx2q with the address of the second-pass routine
; i<type2>_<size>_internal_16bpc.pass2, applies eob_offset and transfers
; control to the first-pass routine i<type1>_<size>_internal_16bpc (%%p1).
; eob_offset: a non-zero number is added to eobd; anything that is not a
; number is treated as a label whose address is loaded into r5.
; On x86-32, r6 is first loaded with $$ as the base used by o().
; For dct_dct, eob == 0 bypasses the first-pass routine and continues with the
; code that follows the macro invocation.
316%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
317cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
318    %define %%p1 m(i%1_%4_internal_16bpc)
319%if ARCH_X86_32
320    LEA                  r6, $$
321%endif
322%if has_epilogue
    ; an epilogue is required, so the first pass is called and RET runs after it
323%ifidn %1_%2, dct_dct
324    test               eobd, eobd
325    jz %%end
326%endif
327    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
328%ifnum %3
329%if %3
330    add                eobd, %3
331%endif
332%else
333    lea                  r5, [o(%3)]
334%endif
335    call %%p1
336    RET
337%%end:
338%else
339    ; Jump to the 1st txfm function if we're not taking the fast path, which
340    ; in turn performs an indirect jump to the 2nd txfm function.
341    lea                tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
342%ifnum %3
343%if %3
344    add                eobd, %3
345%endif
346%else
347    lea                  r5, [o(%3)]
348%endif
349%ifidn %1_%2, dct_dct
350    test               eobd, eobd
351    jnz %%p1
352%else
353    ; jump to the 1st txfm function unless it's located directly after this
354    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
355ALIGN function_align
356%%end:
357%endif
358%endif
359%endmacro
360
; INV_TXFM_4X4_FN type1, type2
; Emits the 4x4 entry point through INV_TXFM_FN. For dct_dct it also emits the
; DC-only path that INV_TXFM_FN reaches when eob == 0:
;   r5d = dc * 181, r3d = number of rows
;   .dconly:  r5d = (r5d + 128) >> 8
;   .dconly2: r5d = r5d * 2896 + 34816; the upper 16 bits are broadcast to all
;             words and added to two 4-pixel rows per loop iteration, with the
;             result limited to [0, pixel_10bpc_max].
; .dconly and .dconly2 are also jump targets for the 4x8 and 4x16 macros, which
; arrive with their own r3d and r5d.
361%macro INV_TXFM_4X4_FN 2 ; type1, type2
362    INV_TXFM_FN          %1, %2, 0, 4x4
363%ifidn %1_%2, dct_dct
364    imul                r5d, [cq], 181
365    mov                [cq], eobd ; 0
366    mov                 r3d, 4
367.dconly:
368    add                 r5d, 128
369    sar                 r5d, 8
370.dconly2:
371    imul                r5d, 2896
372    mova                 m2, [o(pixel_10bpc_max)]
373    add                 r5d, 34816
374    movd                 m0, r5d
375    pshuflw              m0, m0, q1111 ; broadcast word 1, i.e. bits 16-31 of r5d
376    pxor                 m3, m3
377    punpcklqdq           m0, m0
378.dconly_loop:
379    movq                 m1, [dstq+strideq*0]
380    movhps               m1, [dstq+strideq*1]
381    paddw                m1, m0
382    pminsw               m1, m2
383    pmaxsw               m1, m3
384    movq   [dstq+strideq*0], m1
385    movhps [dstq+strideq*1], m1
386    lea                dstq, [dstq+strideq*2]
387    sub                 r3d, 2
388    jg .dconly_loop
389    RET
390%endif
391%endmacro
392
; IDCT4_1D src1, src2, src3, src4, tmp1, tmp2, tmp3, rnd
; 4-point inverse DCT on packed dwords; every argument is a register number.
; rnd is forwarded to ITX_MULSUB_2D as its rounding register.
; Results: src1 = out0, src2 = out1, tmp1 = out2, src3 = out3.
; src4, tmp2 and tmp3 are clobbered.
393%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
394    ; butterfly rotation
395    ITX_MULSUB_2D        %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1  %3 out0
396    ITX_MULSUB_2D        %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2  %4 out3
397    ; Hadamard rotation
398    psubd               m%5, m%1, m%2
399    paddd               m%2, m%1
400    paddd               m%1, m%3, m%4
401    psubd               m%3, m%4
402    ; %1 (src1) = out0
403    ; %2 (src2) = out1
404    ; %3 (src3) = out3
405    ; %5 (tmp1) = out2
406%endmacro
407
408INIT_XMM sse4
409
; Entry points whose first pass is the 4x4 DCT below.
410INV_TXFM_4X4_FN dct, dct
411INV_TXFM_4X4_FN dct, identity
412INV_TXFM_4X4_FN dct, adst
413INV_TXFM_4X4_FN dct, flipadst
414
; idct_4x4_internal_16bpc
; First pass: loads the four 16-byte coefficient vectors at cq, runs IDCT4_1D
; with m5 = pd_2048 as rounding term, packs the results to words, transposes
; them and jumps to the second-pass routine whose address is in tx2q.
; .pass1_main: the bare IDCT4_1D on m0-m3 (results in m0, m1, m4, m2); also
;              called by idct_4x8_internal_16bpc.
; .pass2:      second pass on the packed words in m0/m1 (m5 = pd_2048 on
;              entry). Zeroes the four coefficient vectors, adds the result to
;              the 4x4 block at dst and limits it to [0, pixel_10bpc_max].
415cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
416    mova                 m0, [cq+16*0]
417    mova                 m1, [cq+16*1]
418    mova                 m2, [cq+16*2]
419    mova                 m3, [cq+16*3]
420    mova                 m5, [o(pd_2048)]
421    call .pass1_main
422    packssdw             m0, m1     ; out0 out1
423    packssdw             m4, m2     ; out2 out3
424    ; transpose
425    punpckhwd            m2, m0, m4
426    punpcklwd            m0, m4
427    punpckhwd            m1, m0, m2
428    punpcklwd            m0, m2
429    ; m0 = out0 out1
430    ; m1 = out2 out3
431    ; m5 = pd_2048
432    jmp                tx2q
433.pass1_main:
434    IDCT4_1D              0, 1, 2, 3, 4, 6, 7, 5
435    ret
436.pass2:
437    ; m0 = in0 in1
438    ; m1 = in2 in3
439    ; m5 = pd_2048
440    punpckhwd            m2, m1, m0
441    punpcklwd            m1, m0
442    pmaddwd              m4, m2, [o(pw_m3784_1567)]
443    pmaddwd              m2, [o(pw_1567_3784)]
444    pmaddwd              m0, m1, [o(pw_m2896_2896)]
445    pmaddwd              m1, [o(pw_2896_2896)]
446    REPX      {paddd x, m5}, m4, m2, m0, m1
447    packssdw             m5, m5     ; pw_2048
448    REPX      {psrad x, 12}, m4, m2, m0, m1
449    packssdw             m2, m4     ; t3 t2
450    packssdw             m1, m0     ; t0 t1
451    paddsw               m0, m1, m2 ; out0 out1
452    psubsw               m1, m2     ; out3 out2
453    pmulhrsw             m0, m5
454    pmulhrsw             m1, m5
455    movq                 m2, [dstq+strideq*0]
456    movhps               m2, [dstq+strideq*1]
457    lea                  r5, [dstq+strideq*2]
    ; m1 holds out3 in its low half and out2 in its high half, so rows 3 and 2
    ; are loaded (and stored below) in that order
458    movq                 m3, [r5  +strideq*1]
459    movhps               m3, [r5  +strideq*0]
460    mova                 m5, [o(pixel_10bpc_max)]
461    pxor                 m4, m4
462    mova          [cq+16*0], m4
463    mova          [cq+16*1], m4
464    mova          [cq+16*2], m4
465    mova          [cq+16*3], m4
466    paddw                m0, m2
467    paddw                m1, m3
468    pmaxsw               m0, m4
469    pmaxsw               m1, m4
470    pminsw               m0, m5
471    pminsw               m1, m5
472    movq   [dstq+strideq*0], m0
473    movhps [dstq+strideq*1], m0
474    movhps [r5  +strideq*0], m1
475    movq   [r5  +strideq*1], m1
476    RET
477
; Entry points whose first pass is the 4x4 ADST below.
478INV_TXFM_4X4_FN adst, dct
479INV_TXFM_4X4_FN adst, adst
480INV_TXFM_4X4_FN adst, flipadst
481INV_TXFM_4X4_FN adst, identity
482
; iadst_4x4_internal_16bpc
; First pass: calls .main, packs the four dword results to words, transposes
; them and jumps to the second-pass routine in tx2q.
; .pass2: runs the 8 bpc SSSE3 .main routine on the packed words in m0/m1 (on
;         x86-32, r5 is first pointed at itx8_start), then falls into .end.
; .end:   scales m0/m1 with pmulhrsw by pw_2048, zeroes the four coefficient
;         vectors, adds the result to the 4x4 block at dst and limits it to
;         [0, pixel_10bpc_max].
; .main:  loads the inputs from cq and falls into .main2.
; .main2: 4-point ADST on dwords.
;         In:  m5 = T[0], [r3] = T[1], m1 = T[2], m3 = T[3]
;         Out: m0, m2, m1, m4 = results (rounded, >> 12), m5 = pd_2048
;         Clobbers m3, m6, m7.
483cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
484    call .main
485    packssdw             m0, m2            ; out0 out1
486    packssdw             m1, m4            ; out2 out3
487    ; transpose
488    punpckhwd            m2, m0, m1
489    punpcklwd            m0, m1
490    punpckhwd            m1, m0, m2
491    punpcklwd            m0, m2
492    ; m0 = out0 out1
493    ; m1 = out2 out3
494    ; m5 = pd_2048
495    jmp                tx2q
496.pass2:
497    ; m0 = in0 in1
498    ; m1 = in2 in3
499%if ARCH_X86_32
500    lea                  r5, [o(itx8_start)]
501%endif
502    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
503.end:
504    mova                 m4, [o(pw_2048)]
505    movq                 m2, [dstq+strideq*0]
506    movhps               m2, [dstq+strideq*1]
507    lea                  r5, [dstq+strideq*2]
508    movq                 m3, [r5  +strideq*0]
509    movhps               m3, [r5  +strideq*1]
510    mova                 m5, [o(pixel_10bpc_max)]
511    pmulhrsw             m0, m4
512    pmulhrsw             m1, m4
513    pxor                 m4, m4
514    mova          [cq+16*0], m4
515    mova          [cq+16*1], m4
516    mova          [cq+16*2], m4
517    mova          [cq+16*3], m4
518    paddw                m0, m2
519    paddw                m1, m3
520    pmaxsw               m0, m4
521    pmaxsw               m1, m4
522    pminsw               m0, m5
523    pminsw               m1, m5
524    movq   [dstq+strideq*0], m0
525    movhps [dstq+strideq*1], m0
526    movq   [r5  +strideq*0], m1
527    movhps [r5  +strideq*1], m1
528    RET
529ALIGN function_align
530.main:
531    mova                 m1, [cq+16*2]
532    mova                 m3, [cq+16*3]
533    mova                 m5, [cq+16*0]
534    lea                  r3, [cq+16*1]
535.main2:
536    mova                 m0, [o(pd_1321)]  ; SINPI_1_9
537    mova                 m2, [o(pd_2482)]  ; SINPI_2_9
538    mova                 m6, [o(pd_3803)]  ; SINPI_4_9
539    pmulld               m4, m0, m1        ; s[4] = SINPI_1_9 * T[2]
540    pmulld               m7, m3, m6        ; s[6] = SINPI_4_9 * T[3]
541    pmulld               m6, m1            ; s[3] = SINPI_4_9 * T[2]
542    pmulld               m0, m5            ; s[0] = SINPI_1_9 * T[0]
543    psubd                m1, m3            ; T[2] - T[3]
544    pmulld               m3, m2            ; s[5] = SINPI_2_9 * T[3]
545    pmulld               m2, m5            ; s[1] = SINPI_2_9 * T[0]
546    paddd                m0, m6            ; s[0] += s[3]
547    paddd                m0, m3            ; s[0] += s[5]
548    mova                 m3, [o(pd_m3344)] ; -SINPI_3_9
549    psubd                m2, m4            ; s[1] -= s[4]
550    psubd                m2, m7            ; s[1] -= s[6]
551    psubd                m1, m5            ; -b7 = (T[2] -T[3]) - T[0]
552    pmulld               m1, m3            ; s[2]  = -SINPI_3_9 * -b7
553    pmulld               m3, [r3]          ; -s[3] = -SINPI_3_9 * T[1]
554    mova                 m5, [o(pd_2048)]
555    REPX      {paddd x, m5}, m0, m1        ; {s[0], s[2]} + 2048
556    paddd                m4, m0, m2        ; x[3]  = s[0] + s[1]
557    psubd                m2, m3            ; x[1]  = s[1] + s[3]
558    psubd                m0, m3            ; x[0]  = s[0] + s[3]
559    paddd                m4, m3            ; x[3] -= s[3]
560    paddd                m2, m5            ; x[1] + 2048
561    REPX      {psrad x, 12}, m0, m2, m1, m4
562    ret
563
564
; Entry points whose first pass is the 4x4 flipped ADST below.
565INV_TXFM_4X4_FN flipadst, dct
566INV_TXFM_4X4_FN flipadst, adst
567INV_TXFM_4X4_FN flipadst, flipadst
568INV_TXFM_4X4_FN flipadst, identity
569
; iflipadst_4x4_internal_16bpc
; First pass: reuses iadst_4x4_internal_16bpc.main; the flip is done by the
; transpose, which interleaves with the operands in the opposite order to the
; one used in iadst_4x4_internal_16bpc.
; .pass2: runs the 8 bpc SSSE3 ADST .main routine, scales with pmulhrsw by
;         pw_2048, zeroes the four coefficient vectors and adds the result to
;         the 4x4 block at dst with the row order reversed (m0 goes to rows 3
;         and 2, m1 to rows 1 and 0), limited to [0, pixel_10bpc_max].
570cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
571    call m(iadst_4x4_internal_16bpc).main
572    packssdw             m0, m2            ; out0 out1
573    packssdw             m1, m4            ; out2 out3
574    ; transpose
575    punpcklwd            m2, m1, m0
576    punpckhwd            m1, m0
577    punpcklwd            m0, m1, m2
578    punpckhwd            m1, m2
579    ; m0 = out0 out1
580    ; m1 = out2 out3
581    ; m5 = pd_2048
582    jmp                tx2q
583.pass2:
584    ; m0 = in0 in1
585    ; m1 = in2 in3
586%if ARCH_X86_32
587    lea                 r5, [o(itx8_start)]
588%endif
589    call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
590    mova                 m4, [o(pw_2048)]
    ; m3 = row1 | row0 and m2 = row3 | row2, matching the flipped output order
591    movq                 m3, [dstq+strideq*1]
592    movhps               m3, [dstq+strideq*0]
593    lea                  r5, [dstq+strideq*2]
594    movq                 m2, [r5  +strideq*1]
595    movhps               m2, [r5  +strideq*0]
596    mova                 m5, [o(pixel_10bpc_max)]
597    pmulhrsw             m0, m4
598    pmulhrsw             m1, m4
599    pxor                 m4, m4
600    mova          [cq+16*0], m4
601    mova          [cq+16*1], m4
602    mova          [cq+16*2], m4
603    mova          [cq+16*3], m4
604    paddw                m0, m2
605    paddw                m1, m3
606    pmaxsw               m0, m4
607    pmaxsw               m1, m4
608    pminsw               m0, m5
609    pminsw               m1, m5
610    movhps [dstq+strideq*0], m1
611    movq   [dstq+strideq*1], m1
612    movhps [r5  +strideq*0], m0
613    movq   [r5  +strideq*1], m0
614    RET
615
; Entry points whose first pass is the 4x4 identity transform below.
616INV_TXFM_4X4_FN identity, dct
617INV_TXFM_4X4_FN identity, adst
618INV_TXFM_4X4_FN identity, flipadst
619INV_TXFM_4X4_FN identity, identity
620
; iidentity_4x4_internal_16bpc
; First pass: every coefficient becomes (c * 5793 + 2048) >> 12; the results
; are packed to words, transposed and handed to the routine in tx2q with
; m5 = pd_2048.
; .pass2: x += pmulhrsw(x, pw_1697x8) with signed saturation, then pmulhrsw by
;         pw_2048 (obtained by packing m5). Zeroes the four coefficient
;         vectors, adds the result to the 4x4 block at dst and limits it to
;         [0, pixel_10bpc_max].
621cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
622    mova                 m3, [o(pd_5793)]
623    pmulld               m0, m3, [cq+16*0]
624    pmulld               m1, m3, [cq+16*1]
625    pmulld               m2, m3, [cq+16*2]
626    pmulld               m3,     [cq+16*3]
627    mova                 m5, [o(pd_2048)]
628    REPX      {paddd x, m5}, m0, m1, m2, m3
629    REPX      {psrad x, 12}, m0, m1, m2, m3
630    packssdw             m0, m1
631    packssdw             m2, m3
632    ; transpose
633    punpckhwd            m3, m0, m2
634    punpcklwd            m0, m2
635    punpckhwd            m1, m0, m3
636    punpcklwd            m0, m3
637    ; m0 = out0 out1
638    ; m1 = out2 out3
639    ; m5 = pd_2048
640    jmp                tx2q
641.pass2:
642    ; m0 = in0 in1
643    ; m1 = in2 in3
644    ; m5 = pd_2048
645    mova                 m4, [o(pw_1697x8)]
646    movq                 m2, [dstq+strideq*0]
647    movhps               m2, [dstq+strideq*1]
648    lea                  r5, [dstq+strideq*2]
649    pmulhrsw             m3, m4, m0
650    pmulhrsw             m4, m1
651    paddsw               m0, m3
652    paddsw               m1, m4
653    movq                 m3, [r5  +strideq*0]
654    movhps               m3, [r5  +strideq*1]
655    mova                 m4, [o(pixel_10bpc_max)]
656    packssdw             m5, m5 ; pw_2048
657    pmulhrsw             m0, m5
658    pmulhrsw             m1, m5
659    pxor                 m5, m5
660    mova          [cq+16*0], m5
661    mova          [cq+16*1], m5
662    mova          [cq+16*2], m5
663    mova          [cq+16*3], m5
664    paddw                m0, m2
665    paddw                m1, m3
666    pmaxsw               m0, m5
667    pmaxsw               m1, m5
668    pminsw               m0, m4
669    pminsw               m1, m4
670    movq   [dstq+strideq*0], m0
671    movhps [dstq+strideq*1], m0
672    movq   [r5  +strideq*0], m1
673    movhps [r5  +strideq*1], m1
674    RET
675
; INV_TXFM_4X8_FN type1, type2[, eob_offset = 0]
; Emits the 4x8 entry point through INV_TXFM_FN. For dct_dct it also emits the
; DC-only path reached when eob == 0: the DC coefficient is multiplied by 181,
; rounded with +128 and shifted right by 8, multiplied by 181 again, and then
; handed to the 4x4 .dconly code with r3d = 8 rows.
676%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
677    INV_TXFM_FN          %1, %2, %3, 4x8
678%ifidn %1_%2, dct_dct
679    imul                r5d, [cq], 181
680    mov                [cq], eobd ; 0
681    mov                 r3d, 8
682    add                 r5d, 128
683    sar                 r5d, 8
684    imul                r5d, 181
685    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
686%endif
687%endmacro
688
; Entry points whose first pass is the 4x8 DCT below.
689INV_TXFM_4X8_FN dct, dct
690INV_TXFM_4X8_FN dct, identity, 9
691INV_TXFM_4X8_FN dct, adst
692INV_TXFM_4X8_FN dct, flipadst
693
; idct_4x8_internal_16bpc
; First pass: r5 = 16 when eob >= 13, else 0. The loop body scales four input
; vectors at [cq+32*i+r5] by 2896 (rounded, >> 12) and runs the 4-point DCT
; from idct_4x4_internal_16bpc.pass1_main. With r5 = 16 the packed result is
; parked at [cq+32*0+16] / [cq+32*1+16] and the loop runs once more with
; r5 = 0; with r5 = 0 from the start only that single iteration runs.
; Both halves are then transposed into m0-m3 and control goes to tx2q.
; .pass2: runs the 8 bpc SSSE3 4x8 DCT .main routine on the packed words,
;         loads pw_2048 into m4 and falls into .end.
; .end:   shared tail, also jumped to by the other 4x8 second passes.
;         In: m0-m3 = packed rows, m4 = word multiplier for pmulhrsw.
;         Zeroes eight 16-byte coefficient vectors, adds the result to the 4x8
;         block at dst and limits it to [0, pixel_10bpc_max].
694cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
695%undef cmp
696    mova                 m5, [o(pd_2048)]
697%if ARCH_X86_64
698    xor                 r5d, r5d
699    cmp                eobd, 13
700    setge               r5b
701%else
    ; same result as the setge sequence: r5d = 1 - (eob < 13)
702    mov                 r5d, 1
703    cmp                eobd, 13
704    sbb                 r5d, 0
705%endif
706    shl                 r5d, 4
707.loop_pass1:
708    mova                 m3, [o(pd_2896)]
709    pmulld               m0, m3, [cq+32*0+r5]
710    pmulld               m1, m3, [cq+32*1+r5]
711    pmulld               m2, m3, [cq+32*2+r5]
712    pmulld               m3, [cq+32*3+r5]
713    REPX      {paddd x, m5}, m0, m1, m2, m3
714    REPX      {psrad x, 12}, m0, m1, m2, m3
715    call m(idct_4x4_internal_16bpc).pass1_main
716    packssdw             m0, m1     ; out0 out1
717    packssdw             m4, m2     ; out2 out3
718    test                r5d, r5d
719    jz .end_pass1
720    mova       [cq+32*0+16], m0
721    mova       [cq+32*1+16], m4
722    xor                 r5d, r5d
723    jmp .loop_pass1
724.end_pass1:
725    punpckhwd            m2, m0, m4
726    punpcklwd            m0, m4
727    punpckhwd            m1, m0, m2
728    punpcklwd            m0, m2
729    mova                 m2, [cq+32*0+16]
730    mova                 m6, [cq+32*1+16]
731    punpckhwd            m4, m2, m6
732    punpcklwd            m2, m6
733    punpckhwd            m3, m2, m4
734    punpcklwd            m2, m4
735    ; m0-3 = packed & transposed output
736    jmp                tx2q
737.pass2:
738%if ARCH_X86_32
739    lea                  r5, [o(itx8_start)]
740%endif
741    call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
742    ; m0-3 is now out0/1,3/2,4/5,7/6
743    mova                 m4, [o(pw_2048)]
    ; swap the 64-bit halves of m1 and m3 so every register is in row order
744    shufps               m1, m1, q1032
745    shufps               m3, m3, q1032
746.end:
747    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
748    pxor                 m4, m4
749    REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
750    mova                 m7, [o(pixel_10bpc_max)]
751    lea                  r2, [strideq*3]
752    movq                 m5, [dstq+strideq*0]
753    movq                 m6, [dstq+strideq*2]
754    movhps               m5, [dstq+strideq*1]
755    movhps               m6, [dstq+r2]
756    lea                  r4, [dstq+strideq*4]
757    paddw                m0, m5
758    paddw                m1, m6
759    movq                 m5, [r4+strideq*0]
760    movq                 m6, [r4+strideq*2]
761    movhps               m5, [r4+strideq*1]
762    movhps               m6, [r4+r2]
763    paddw                m2, m5
764    paddw                m3, m6
765    REPX     {pminsw x, m7}, m0, m1, m2, m3
766    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
767    movq   [dstq+strideq*0], m0
768    movhps [dstq+strideq*1], m0
769    movq   [dstq+strideq*2], m1
770    movhps [dstq+r2       ], m1
771    movq   [r4  +strideq*0], m2
772    movhps [r4  +strideq*1], m2
773    movq   [r4  +strideq*2], m3
774    movhps [r4  +r2       ], m3
775    RET
776
; Entry points whose first pass is the 4x8 ADST below.
777INV_TXFM_4X8_FN adst, dct
778INV_TXFM_4X8_FN adst, adst
779INV_TXFM_4X8_FN adst, flipadst
780INV_TXFM_4X8_FN adst, identity, 9
781
; iadst_4x8_internal_16bpc
; First pass: calls .pass1_main, transposes the half left in m0/m1 together
; with the half at [cq+32*2+16] / [cq+32*3+16] into m0-m3 and jumps to tx2q.
; .pass1_main: r5 = 16 when eob >= 13, else 0. The loop body scales four input
;              vectors at [cq+32*i+r5] by 2896 (rounded, >> 12), spills the
;              second one to [r3] = [cq+32*1+16] as the memory operand that
;              iadst_4x4_internal_16bpc.main2 expects, and runs that routine.
;              With r5 = 16 the packed result is parked at [cq+32*2+16] /
;              [cq+32*3+16] and the loop runs once more with r5 = 0.
;              Also called by iflipadst_4x8_internal_16bpc.
; .pass2:      swaps the 64-bit halves of m0 and m1, runs the 8 bpc SSSE3 4x8
;              ADST .main routine and finishes in idct_4x8_internal_16bpc.end
;              with m4 = pw_4x2048_4xm2048 as multiplier.
782cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
783    call .pass1_main
784    punpckhwd            m2, m0, m1
785    punpcklwd            m0, m1
786    punpckhwd            m1, m0, m2
787    punpcklwd            m0, m2
788    mova                 m2, [cq+32*2+16]
789    mova                 m6, [cq+32*3+16]
790    punpckhwd            m4, m2, m6
791    punpcklwd            m2, m6
792    punpckhwd            m3, m2, m4
793    punpcklwd            m2, m4
794    ; m0-3 = packed & transposed output
795    jmp                tx2q
796.pass1_main:
797%undef cmp
798%if ARCH_X86_64
799    xor                 r5d, r5d
800    cmp                eobd, 13
801    setge               r5b
802%else
803    mov                 r5d, 1
804    cmp                eobd, 13
805    sbb                 r5d, 0
806%endif
807    shl                 r5d, 4
808    lea                  r3, [cq+32*1+16]
809.loop_pass1:
810    mova                 m0, [o(pd_2048)]
811    mova                 m3, [o(pd_2896)]
812    pmulld               m5, m3, [cq+32*0+r5]
813    pmulld               m2, m3, [cq+32*1+r5]
814    pmulld               m1, m3, [cq+32*2+r5]
815    pmulld               m3, [cq+32*3+r5]
816    REPX      {paddd x, m0}, m5, m2, m1, m3
817    REPX      {psrad x, 12}, m5, m2, m1, m3
818    mova               [r3], m2
819    call m(iadst_4x4_internal_16bpc).main2
820    packssdw             m0, m2            ; out0 out1
821    packssdw             m1, m4            ; out2 out3
822    test                r5d, r5d
823    jz .end_pass1
824    mova       [cq+32*2+16], m0
825    mova       [cq+32*3+16], m1
826    xor                 r5d, r5d
827    jmp .loop_pass1
828.end_pass1:
829    ret
830.pass2:
831    shufps               m0, m0, q1032
832    shufps               m1, m1, q1032
833%if ARCH_X86_32
834    lea                  r5, [o(itx8_start)]
835%endif
836    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
837    mova                 m4, [o(pw_4x2048_4xm2048)]
838    jmp m(idct_4x8_internal_16bpc).end
839
; Entry points whose first pass is the 4x8 flipped ADST below.
840INV_TXFM_4X8_FN flipadst, dct
841INV_TXFM_4X8_FN flipadst, adst
842INV_TXFM_4X8_FN flipadst, flipadst
843INV_TXFM_4X8_FN flipadst, identity, 9
844
; iflipadst_4x8_internal_16bpc
; First pass: reuses iadst_4x8_internal_16bpc.pass1_main; the flip is done by
; the transposes, which interleave with the operands in the opposite order to
; the one used in iadst_4x8_internal_16bpc.
; .pass2: swaps the 64-bit halves of m0 and m1, runs the 8 bpc SSSE3 4x8 ADST
;         .main routine, reverses the register order (m0<->m3, m1<->m2) while
;         swapping the 64-bit halves of each, and finishes in
;         idct_4x8_internal_16bpc.end with m4 = pw_4xm2048_4x2048.
845cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
846    call m(iadst_4x8_internal_16bpc).pass1_main
847    punpcklwd            m2, m1, m0
848    punpckhwd            m1, m0
849    punpcklwd            m0, m1, m2
850    punpckhwd            m1, m2
851    mova                 m6, [cq+32*2+16]
852    mova                 m2, [cq+32*3+16]
853    punpcklwd            m4, m2, m6
854    punpckhwd            m2, m6
855    punpckhwd            m3, m2, m4
856    punpcklwd            m2, m4
857    ; m0-3 = packed & transposed output
858    jmp                tx2q
859.pass2:
860    shufps               m0, m0, q1032
861    shufps               m1, m1, q1032
862%if ARCH_X86_32
863    lea                  r5, [o(itx8_start)]
864%endif
865    call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
866    mova                 m4, m0
867    mova                 m5, m1
868    pshufd               m0, m3, q1032
869    pshufd               m1, m2, q1032
870    pshufd               m2, m5, q1032
871    pshufd               m3, m4, q1032
872    mova                 m4, [o(pw_4xm2048_4x2048)]
873    jmp m(idct_4x8_internal_16bpc).end
874
; Entry points whose first pass is the 4x8 identity transform below.
875INV_TXFM_4X8_FN identity, dct
876INV_TXFM_4X8_FN identity, adst
877INV_TXFM_4X8_FN identity, flipadst
878INV_TXFM_4X8_FN identity, identity, 3
879
; iidentity_4x8_internal_16bpc
; First pass: r5 = 16 when eob >= 16, else 0. The loop body scales four input
; vectors at [cq+32*i+r5] by 2896 and then by 5793 (each rounded, >> 12) and
; packs them to words. With r5 = 16 the packed result is kept in
; [cq+32*0+16] and m7 and the loop runs once more with r5 = 0.
; Both halves are then transposed into m0-m3 and control goes to tx2q.
; .pass2: finishes in idct_4x8_internal_16bpc.end with m4 = pw_4096.
880cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
881%undef cmp
882    mova                 m5, [o(pd_2048)]
883    mova                 m4, [o(pd_2896)]
884    mova                 m6, [o(pd_5793)]
885    ; clear m7 in case we skip the bottom square
886    pxor                 m7, m7
887%if ARCH_X86_64
888    xor                 r5d, r5d
889    cmp                eobd, 16
890    setge               r5b
891%else
892    mov                 r5d, 1
893    cmp                eobd, 16
894    sbb                 r5d, 0
895%endif
896    shl                 r5d, 4
897.loop_pass1:
898    pmulld               m0, m4, [cq+32*0+r5]
899    pmulld               m1, m4, [cq+32*1+r5]
900    pmulld               m2, m4, [cq+32*2+r5]
901    pmulld               m3, m4, [cq+32*3+r5]
902    REPX      {paddd x, m5}, m0, m1, m2, m3
903    REPX      {psrad x, 12}, m0, m1, m2, m3
904    REPX     {pmulld x, m6}, m0, m1, m2, m3
905    REPX      {paddd x, m5}, m0, m1, m2, m3
906    REPX      {psrad x, 12}, m0, m1, m2, m3
907    packssdw             m0, m1
908    packssdw             m2, m3
909    test                r5d, r5d
910    jz .end_pass1
911    mova       [cq+32*0+16], m0
912    mova                 m7, m2
913    xor                 r5d, r5d
914    jmp .loop_pass1
915.end_pass1:
916    punpckhwd            m4, m0, m2
917    punpcklwd            m0, m2
918    punpckhwd            m1, m0, m4
919    punpcklwd            m0, m4
920    mova                 m2, [cq+32*0+16]
921    punpckhwd            m4, m2, m7
922    punpcklwd            m2, m7
923    punpckhwd            m3, m2, m4
924    punpcklwd            m2, m4
925    ; m0-3 = packed & transposed output
926    jmp                tx2q
927.pass2:
928    mova                 m4, [o(pw_4096)]
929    jmp m(idct_4x8_internal_16bpc).end
930
; Wrapper around INV_TXFM_FN for the 4x16 size.
;   %1, %2 = first-/second-pass transform type
;   %3     = suffix pasted into the table name tbl_4x16_%3 (defaults to 2d)
; For the dct_dct combination it additionally emits a DC-only path:
;   r5d = ([cq]*181 + 384) >> 9, [cq] is overwritten with eobd (0 according to
;   the existing comment), r3d = 16, then control transfers to the 4x4 dct_dct
;   .dconly2 label.
; NOTE(review): r3d = 16 is presumably the row count expected by .dconly2, which
; is outside this part of the file - confirm there.
931%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
932    INV_TXFM_FN          %1, %2, tbl_4x16_%3, 4x16
933%ifidn %1_%2, dct_dct
934    imul                r5d, [cq], 181
935    mov                [cq], eobd ; 0
936    mov                 r3d, 16
937    add                 r5d, 384
938    sar                 r5d, 9
939    jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
940%endif
941%endmacro
942
; DCT first-pass 4x16 inverse transform (16 bpc). The .end_pass1 and .end
; labels defined here are also jumped to by the adst/flipadst/identity 4x16
; functions later in the file.
943INV_TXFM_4X16_FN dct, dct
944INV_TXFM_4X16_FN dct, identity, v
945INV_TXFM_4X16_FN dct, adst
946INV_TXFM_4X16_FN dct, flipadst
947
948cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
949%undef cmp
       ; on x86-32 r6 is needed as a counter below, so park its current value
       ; (the pic pointer, per the restore comment further down) in r5m
950%if ARCH_X86_32
951    mov                 r5m, r6d
952%endif
       ; r5 points at a byte table on entry (presumably the tbl_4x16_* table
       ; named by INV_TXFM_4X16_FN - confirm). Walk r6 = 3, 2, ... downwards
       ; while eob < table[r6] (signed byte compare).
953    mov                 r6d, 4
954.zero_loop:
955    dec                 r6d
956    cmp                eobb, byte [r5+r6]
957    jl .zero_loop
       ; r5 = r6*16: byte offset of the first 16-byte column group to process
958    mov                 r5d, r6d
959    shl                 r5d, 4
960%if ARCH_X86_32
961    ; restore pic-ptr
962    mov                  r6, r5m
963%endif
964    mova                 m5, [o(pd_2048)]
965.loop_pass1:
       ; one 4x4 group per iteration, rows are 64 bytes apart in cq
966    mova                 m0, [cq+64*0+r5]
967    mova                 m1, [cq+64*1+r5]
968    mova                 m2, [cq+64*2+r5]
969    mova                 m3, [cq+64*3+r5]
970    call m(idct_4x4_internal_16bpc).pass1_main
       ; m3 = all ones (-1), so psubd adds 1: x = (x + 1) >> 1
971    pcmpeqd              m3, m3
972    REPX      {psubd x, m3}, m0, m1, m4, m2
973    REPX       {psrad x, 1}, m0, m1, m4, m2
974    packssdw             m0, m1     ; out0 out1
975    packssdw             m4, m2     ; out2 out3
       ; word-interleave (transpose) into m0/m1
976    punpckhwd            m2, m0, m4
977    punpcklwd            m0, m4
978    punpckhwd            m1, m0, m2
979    punpcklwd            m0, m2
       ; the r5 == 0 group is processed last and stays in m0/m1; every other
       ; group is written back over its own input and r5 steps down by 16
980    test                r5d, r5d
981    jz .end_pass1
982    mova       [cq+64*0+r5], m0
983    mova       [cq+64*1+r5], m1
984    sub                 r5d, 16
985    jmp .loop_pass1
986.end_pass1:
       ; fetch the remaining three groups. Groups above the starting offset
       ; were never rewritten by the loop, so these loads return the raw input
       ; there (presumably zero given the eob check - confirm).
987    mova                 m2, [cq+64*0+16]
988    mova                 m3, [cq+64*1+16]
989    mova                 m4, [cq+64*0+32]
990    mova                 m5, [cq+64*1+32]
991    mova                 m6, [cq+64*0+48]
992    mova                 m7, [cq+64*1+48]
993    ; m0-7 = packed & transposed output
994    jmp                tx2q
995.pass2:
       ; the second pass works on packed words and reuses the 8 bpc SSSE3 code
996%if ARCH_X86_32
997    lea                  r5, [o(itx8_start)]
998%endif
999    call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
1000    ; m0-6 is out0-13 [with odd registers having inversed output]
1001    ; [coeffq+16*7] has out15/14
        ; pmulhrsw by 2048 computes (x*2048 + 0x4000) >> 15, i.e. (x + 8) >> 4
1002    mova                 m7, [o(pw_2048)]
1003    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1004    pmulhrsw             m7, [cq+16*7]
        ; swap the 64-bit halves of the odd registers (see the comment above)
1005    REPX {shufps x, x, q1032}, m1, m3, m5, m7
        ; m4-7 are parked in cq+16*0..3 for the second iteration of .loop
1006    mova          [cq+16*0], m4
1007    mova          [cq+16*1], m5
1008    mova          [cq+16*2], m6
1009    mova          [cq+16*3], m7
1010.end:
        ; zero coefficient slots 4-15; slots 0-3 still hold parked output and
        ; are zeroed inside the loop after being reloaded
1011    pxor                 m4, m4
1012    REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1013    mova                 m7, [o(pixel_10bpc_max)]
        ; two iterations of 8 rows each; every register covers two rows of
        ; four 16-bit pixels (movq = low half, movhps = high half)
1014    mov                 r5d, 2
1015    lea                  r3, [strideq*3]
1016.loop:
1017    movq                 m5, [dstq+strideq*0]
1018    movq                 m6, [dstq+strideq*2]
1019    movhps               m5, [dstq+strideq*1]
1020    movhps               m6, [dstq+r3]
1021    lea                  r4, [dstq+strideq*4]
1022    paddw                m0, m5
1023    paddw                m1, m6
1024    movq                 m5, [r4+strideq*0]
1025    movq                 m6, [r4+strideq*2]
1026    movhps               m5, [r4+strideq*1]
1027    movhps               m6, [r4+r3]
1028    paddw                m2, m5
1029    paddw                m3, m6
        ; clamp to [0, pixel_10bpc_max] (m4 is zero here)
1030    REPX     {pminsw x, m7}, m0, m1, m2, m3
1031    REPX     {pmaxsw x, m4}, m0, m1, m2, m3
1032    movq   [dstq+strideq*0], m0
1033    movhps [dstq+strideq*1], m0
1034    movq   [dstq+strideq*2], m1
1035    movhps [dstq+r3       ], m1
1036    movq   [r4  +strideq*0], m2
1037    movhps [r4  +strideq*1], m2
1038    movq   [r4  +strideq*2], m3
1039    movhps [r4  +r3       ], m3
1040    dec                 r5d
1041    jz .end2
        ; second half: advance 8 rows, reload the parked output from
        ; cq+0..3*16 and clear those slots as well
1042    lea                dstq, [dstq+strideq*8]
1043    mova                 m0, [cq+0*16]
1044    mova                 m1, [cq+1*16]
1045    mova                 m2, [cq+2*16]
1046    mova                 m3, [cq+3*16]
1047    REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
1048    jmp .loop
1049.end2:
1050    RET
1051
; ADST first-pass 4x16 inverse transform (16 bpc). Pass 1 mirrors the loop of
; idct_4x16_internal_16bpc but calls the 4x4 ADST kernel; its exit and the
; final write-out reuse idct_4x16's .end_pass1 and .end.
1052INV_TXFM_4X16_FN adst, dct
1053INV_TXFM_4X16_FN adst, adst
1054INV_TXFM_4X16_FN adst, flipadst
1055INV_TXFM_4X16_FN adst, identity, v
1056
1057cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1058%undef cmp
        ; r6 is used as a counter below; on x86-32 its value is parked in r5m
1059%if ARCH_X86_32
1060    mov                 r5m, r6d
1061%endif
        ; walk r6 = 3, 2, ... downwards while eob < table[r6], where r5 holds
        ; the table address on entry (signed byte compare)
1062    mov                 r6d, 4
1063.zero_loop:
1064    dec                 r6d
1065    cmp                eobb, byte [r6+r5]
1066    jl .zero_loop
        ; r5 = r6*16: byte offset of the first column group to process
1067    mov                 r5d, r6d
1068    shl                 r5d, 4
1069%if ARCH_X86_32
1070    ; restore pic-ptr
1071    mov                  r6, r5m
1072%endif
1073.loop_pass1:
        ; rows 0, 2, 3 go in m5, m1, m3; row 1 is passed by address in r3
        ; (register contract of iadst_4x4's .main2, which is not visible here)
1074    mova                 m5, [cq+64*0+r5]
1075    lea                  r3, [cq+64*1+r5]
1076    mova                 m1, [cq+64*2+r5]
1077    mova                 m3, [cq+64*3+r5]
1078    call m(iadst_4x4_internal_16bpc).main2
        ; m3 = all ones (-1), so psubd adds 1: x = (x + 1) >> 1
1079    pcmpeqd              m3, m3
1080    REPX      {psubd x, m3}, m0, m2, m1, m4
1081    REPX       {psrad x, 1}, m0, m2, m1, m4
1082    packssdw             m0, m2            ; out0 out1
1083    packssdw             m1, m4            ; out2 out3
        ; word-interleave (transpose) into m0/m1
1084    punpckhwd            m2, m0, m1
1085    punpcklwd            m0, m1
1086    punpckhwd            m1, m0, m2
1087    punpcklwd            m0, m2
        ; the r5 == 0 group stays in m0/m1 and exits through idct_4x16's
        ; .end_pass1; other groups are written back and r5 steps down by 16
1088    test                r5d, r5d
1089    jz m(idct_4x16_internal_16bpc).end_pass1
1090    mova       [cq+64*0+r5], m0
1091    mova       [cq+64*1+r5], m1
1092    sub                 r5d, 16
1093    jmp .loop_pass1
1094.pass2:
        ; the second pass works on packed words and reuses the 8 bpc SSSE3 code
1095%if ARCH_X86_32
1096    lea                  r5, [o(itx8_start)]
1097%endif
1098    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
1099    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
1100    ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
1101    ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
        ; pmulhrsw by +2048 / -2048 per 64-bit half rounds the values and
        ; flips the sign of the halves listed as negated above
1102    mova                 m1, [o(pw_4x2048_4xm2048)]
1103    REPX   {pmulhrsw x, m1}, m7, m2, m0
1104    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
1105    pmulhrsw             m1, [cq+16*7]
1106    REPX   {pmulhrsw x, m6}, m5, m4, m3
1107    pmulhrsw             m6, [cq+16*6]
1108    ; m7/5/2/4 = out4/11,5/10,6/9,7/8
1109    ; m0/3/6/1 = out0/15,3/12,1/14,2/13
1110    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
        ; high halves (out8..out15, in that order) go to cq+0..7*8
1111    movhps         [cq+0*8], m4
1112    movhps         [cq+1*8], m2
1113    movhps         [cq+2*8], m5
1114    movhps         [cq+3*8], m7
1115    movhps         [cq+4*8], m3
1116    movhps         [cq+5*8], m1
1117    movhps         [cq+6*8], m6
1118    movhps         [cq+7*8], m0
        ; low halves are paired up: m0 = out0/1, m1 = out2/3, m2 = out4/5,
        ; m3 = out6/7
1119    punpcklqdq           m0, m6
1120    punpcklqdq           m1, m3
1121    punpcklqdq           m3, m2, m4
1122    punpcklqdq           m2, m7, m5
1123    jmp m(idct_4x16_internal_16bpc).end
1124
; Flipped-ADST first-pass 4x16 inverse transform (16 bpc). Same structure as
; iadst_4x16_internal_16bpc; the flip is realised by the operand order of the
; pass-1 transpose and by which 64-bit halves pass 2 keeps in registers.
1125INV_TXFM_4X16_FN flipadst, dct
1126INV_TXFM_4X16_FN flipadst, adst
1127INV_TXFM_4X16_FN flipadst, flipadst
1128INV_TXFM_4X16_FN flipadst, identity, v
1129
1130cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1131%undef cmp
        ; r6 is used as a counter below; on x86-32 its value is parked in r5m
1132%if ARCH_X86_32
1133    mov                 r5m, r6d
1134%endif
        ; walk r6 = 3, 2, ... downwards while eob < table[r6], where r5 holds
        ; the table address on entry (signed byte compare)
1135    mov                 r6d, 4
1136.zero_loop:
1137    dec                 r6d
1138    cmp                eobb, byte [r5+r6]
1139    jl .zero_loop
        ; r5 = r6*16: byte offset of the first column group to process
1140    mov                 r5d, r6d
1141    shl                 r5d, 4
1142%if ARCH_X86_32
1143    ; restore pic-ptr
1144    mov                  r6, r5m
1145%endif
1146.loop_pass1:
        ; rows 0, 2, 3 go in m5, m1, m3; row 1 is passed by address in r3
        ; (register contract of iadst_4x4's .main2, which is not visible here)
1147    mova                 m5, [cq+64*0+r5]
1148    lea                  r3, [cq+64*1+r5]
1149    mova                 m1, [cq+64*2+r5]
1150    mova                 m3, [cq+64*3+r5]
1151    call m(iadst_4x4_internal_16bpc).main2
        ; m3 = all ones (-1), so psubd adds 1: x = (x + 1) >> 1
1152    pcmpeqd              m3, m3
1153    REPX      {psubd x, m3}, m0, m2, m1, m4
1154    REPX       {psrad x, 1}, m0, m2, m1, m4
1155    packssdw             m0, m2            ; out3 out2
1156    packssdw             m1, m4            ; out1 out0
        ; transpose with swapped operands so that m0/m1 end up in out0..out3
        ; order despite the reversed packing above
1157    punpcklwd            m2, m1, m0
1158    punpckhwd            m1, m0
1159    punpcklwd            m0, m1, m2
1160    punpckhwd            m1, m2
        ; the r5 == 0 group stays in m0/m1 and exits through idct_4x16's
        ; .end_pass1; other groups are written back and r5 steps down by 16
1161    test                r5d, r5d
1162    jz m(idct_4x16_internal_16bpc).end_pass1
1163    mova       [cq+64*0+r5], m0
1164    mova       [cq+64*1+r5], m1
1165    sub                 r5d, 16
1166    jmp .loop_pass1
1167.pass2:
        ; the second pass works on packed words and reuses the 8 bpc SSSE3 code
1168%if ARCH_X86_32
1169    lea                  r5, [o(itx8_start)]
1170%endif
1171    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
1172    call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
1173    ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
1174    ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
        ; pmulhrsw by +2048 / -2048 per 64-bit half rounds the values and
        ; flips the sign of the halves listed as negated above
1175    mova                 m1, [o(pw_4x2048_4xm2048)]
1176    REPX   {pmulhrsw x, m1}, m7, m2, m0
1177    pshufd               m6, m1, q1032  ; 4x-2048,4x2048
1178    pmulhrsw             m1, [cq+16*7]
1179    REPX   {pmulhrsw x, m6}, m5, m4, m3
1180    pmulhrsw             m6, [cq+16*6]
1181    ; m7/5/2/4 = out11/4,10/5,9/6,8/7
1182    ; m0/3/6/1 = out15/0,12/3,14/1,13/2
1183    ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15
        ; here the low halves hold out8..out15; they go to cq+0..7*8 in order
1184    movq           [cq+0*8], m4
1185    movq           [cq+1*8], m2
1186    movq           [cq+2*8], m5
1187    movq           [cq+3*8], m7
1188    movq           [cq+4*8], m3
1189    movq           [cq+5*8], m1
1190    movq           [cq+6*8], m6
1191    movq           [cq+7*8], m0
        ; high halves are paired up: m0 = out0/1, m1 = out2/3, m2 = out4/5,
        ; m3 = out6/7
1192    punpckhqdq           m0, m6
1193    punpckhqdq           m1, m3
1194    punpckhqdq           m3, m2, m4
1195    punpckhqdq           m2, m7, m5
1196    jmp m(idct_4x16_internal_16bpc).end
1197
; Identity first-pass 4x16 inverse transform (16 bpc). Pass 1 is a pure
; scaling loop; pass 2 scales the packed words again and then reuses
; idct_4x16's .end to add the result to dst.
1198INV_TXFM_4X16_FN identity, dct, h
1199INV_TXFM_4X16_FN identity, adst, h
1200INV_TXFM_4X16_FN identity, flipadst, h
1201INV_TXFM_4X16_FN identity, identity
1202
1203cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1204%undef cmp
        ; r6 is used as a counter below; on x86-32 its value is parked in r5m
1205%if ARCH_X86_32
1206    mov                 r5m, r6d
1207%endif
        ; walk r6 = 3, 2, ... downwards while eob < table[r6], where r5 holds
        ; the table address on entry (signed byte compare)
1208    mov                 r6d, 4
1209.zero_loop:
1210    dec                 r6d
1211    cmp                eobb, byte [r5+r6]
1212    jl .zero_loop
        ; r5 = r6*16: byte offset of the first column group to process
1213    mov                 r5d, r6d
1214    shl                 r5d, 4
1215%if ARCH_X86_32
1216    ; restore pic-ptr
1217    mov                  r6, r5m
1218%endif
1219    mova                 m5, [o(pd_6144)]
1220    mova                 m4, [o(pd_5793)]
1221.loop_pass1:
        ; x = (x*5793 + 6144) >> 13
1222    pmulld               m0, m4, [cq+64*0+r5]
1223    pmulld               m1, m4, [cq+64*1+r5]
1224    pmulld               m2, m4, [cq+64*2+r5]
1225    pmulld               m3, m4, [cq+64*3+r5]
1226    REPX      {paddd x, m5}, m0, m1, m2, m3
1227    REPX      {psrad x, 13}, m0, m1, m2, m3
        ; saturating pack to words, then word-interleave (transpose) into m0/m1
1228    packssdw             m0, m1
1229    packssdw             m2, m3
1230    punpckhwd            m3, m0, m2
1231    punpcklwd            m0, m2
1232    punpckhwd            m1, m0, m3
1233    punpcklwd            m0, m3
        ; the r5 == 0 group stays in m0/m1 and exits through idct_4x16's
        ; .end_pass1; other groups are written back and r5 steps down by 16
1234    test                r5d, r5d
1235    jz m(idct_4x16_internal_16bpc).end_pass1
1236    mova       [cq+64*0+r5], m0
1237    mova       [cq+64*1+r5], m1
1238    sub                 r5d, 16
1239    jmp .loop_pass1
1240.pass2:
        ; For every input register v this computes
        ;   v = pmulhrsw(2*v + pmulhrsw(v, pw_1697x16), pw_2048)
        ; with saturating word adds. m0, m1, m2 and m7 are spilled to
        ; cq+16*4..7 first so they can serve as constants/temporaries.
1241    mova          [cq+16*4], m0
1242    mova          [cq+16*5], m1
1243    mova          [cq+16*6], m2
1244    mova          [cq+16*7], m7
1245    mova                 m0, [o(pw_1697x16)]
1246    mova                 m7, [o(pw_2048)]
        ; inputs m4, m5 -> cq+16*0, cq+16*1
1247    pmulhrsw             m1, m0, m4
1248    pmulhrsw             m2, m0, m5
1249    REPX      {paddsw x, x}, m4, m5
1250    paddsw               m4, m1
1251    paddsw               m5, m2
1252    REPX   {pmulhrsw x, m7}, m4, m5
1253    mova          [cq+16*0], m4
1254    mova          [cq+16*1], m5
        ; inputs m6 and the spilled m7 (reloaded into m4) -> cq+16*2, cq+16*3
1255    mova                 m4, [cq+16*7]
1256    pmulhrsw             m1, m0, m6
1257    pmulhrsw             m2, m0, m4
1258    REPX      {paddsw x, x}, m6, m4
1259    paddsw               m6, m1
1260    paddsw               m4, m2
1261    REPX   {pmulhrsw x, m7}, m6, m4
1262    mova          [cq+16*2], m6
1263    mova          [cq+16*3], m4
        ; inputs m0-3 (m0 reloaded into m4, m1/m2 into themselves) stay in
        ; registers m0-3 for the shared write-out
1264    mova                 m4, [cq+16*4]
1265    mova                 m1, [cq+16*5]
1266    mova                 m2, [cq+16*6]
1267    pmulhrsw             m5, m0, m2
1268    pmulhrsw             m6, m0, m3
1269    REPX      {paddsw x, x}, m2, m3
1270    paddsw               m2, m5
1271    paddsw               m3, m6
1272    pmulhrsw             m6, m0, m1
        ; m0 (the constant) is consumed last: it becomes pmulhrsw(const, v0)
1273    pmulhrsw             m0, m4
1274    REPX      {paddsw x, x}, m1, m4
1275    paddsw               m1, m6
1276    paddsw               m0, m4
1277    REPX   {pmulhrsw x, m7}, m2, m3, m1, m0
1278    jmp m(idct_4x16_internal_16bpc).end
1279
; Wrapper around INV_TXFM_FN for the 8x4 size.
;   %1, %2 = first-/second-pass transform type
; NOTE(review): the trailing INV_TXFM_FN arguments (15 on x86-64; 8 and 0-4*16
; on x86-32) are presumably the xmm register count and a stack reservation -
; confirm against INV_TXFM_FN, which is outside this part of the file.
; For the dct_dct combination a self-contained DC-only path is emitted that
; adds one constant to a 4-row block of eight 16-bit pixels and returns.
1280%macro INV_TXFM_8X4_FN 2 ; type1, type2
1281%if ARCH_X86_64
1282    INV_TXFM_FN          %1, %2, 0, 8x4, 15
1283%else
1284    INV_TXFM_FN          %1, %2, 0, 8x4, 8, 0-4*16
1285%endif
1286%ifidn %1_%2, dct_dct
        ; dc = ((([cq]*181 + 128) >> 8) * 181 + 128) >> 8
1287    imul                r5d, [cq], 181
1288    mov                [cq], eobd ; 0
1289    add                 r5d, 128
1290    sar                 r5d, 8
1291    imul                r5d, 181
1292    add                 r5d, 128
1293    sar                 r5d, 8
        ; dc*2896 + 34816; the pshuflw below replicates word 1 of the dword,
        ; i.e. bits 16-31 of that sum, and punpcklqdq copies it to all 8 words
1294    imul                r5d, 2896
1295    add                 r5d, 34816
1296    movd                 m0, r5d
1297    pshuflw              m0, m0, q1111
1298    punpcklqdq           m0, m0
1299    mova                 m6, [o(pixel_10bpc_max)]
1300    pxor                 m5, m5
1301    lea                  r2, [strideq*3]
1302    mova                 m1, [dstq+strideq*0]
1303    mova                 m2, [dstq+strideq*1]
1304    mova                 m3, [dstq+strideq*2]
1305    mova                 m4, [dstq+r2]
        ; add the constant and clamp to [0, pixel_10bpc_max]
1306    REPX      {paddw x, m0}, m1, m2, m3, m4
1307    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
1308    REPX     {pminsw x, m6}, m1, m2, m3, m4
1309    mova   [dstq+strideq*0], m1
1310    mova   [dstq+strideq*1], m2
1311    mova   [dstq+strideq*2], m3
1312    mova   [dstq+r2       ], m4
1313    RET
1314%endif
1315%endmacro
1316
; DCT first-pass 8x4 inverse transform (16 bpc). This function also provides
; helpers used by the other 8x4 types later in the file: .pass1_entry (generic
; first-pass driver that calls the routine whose address is in r5) and .end
; (round, add to dst, clear coefficients).
1317INV_TXFM_8X4_FN dct, dct
1318INV_TXFM_8X4_FN dct, identity
1319INV_TXFM_8X4_FN dct, adst
1320INV_TXFM_8X4_FN dct, flipadst
1321
1322cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1323    lea                  r5, [o(.main)]
1324.pass1_entry:
        ; x86-32: r3 points at stack scratch used for register spills.
        ; x86-64: the constants live in m11-m14 instead.
1325%if ARCH_X86_32
1326    lea                  r3, [rsp+gprsize]
1327%else
1328    mova                m11, [o(pd_2048)]
1329    mova                m12, [o(clip_18b_min)]
1330    mova                m13, [o(clip_18b_max)]
1331    mova                m14, [o(pd_2896)]
1332%endif
1333    mova                 m0, [cq+0*16]
1334    mova                 m1, [cq+1*16]
1335    mova                 m2, [cq+2*16]
1336    mova                 m3, [cq+3*16]
1337    mova                 m4, [cq+4*16]
1338    mova                 m5, [cq+5*16]
1339    mova                 m6, [cq+6*16]
1340    mova                 m7, [cq+7*16]
        ; scale the inputs, run the type-specific first pass (address in r5),
        ; then transpose the packed result
1341    call .rect2_mul
1342    call                 r5
1343    call .transpose4x8packed
1344    ; m0-3 = packed & transposed output
1345    jmp                tx2q
        ; Word transpose. Inputs are the packed registers m0, m2, m4, m6;
        ; the result is returned in m0-3 and m4, m6 are clobbered.
1346.transpose4x8packed:
1347    ; transpose
1348    punpcklwd            m1, m2, m6
1349    punpckhwd            m2, m6
1350    punpckhwd            m6, m0, m4
1351    punpcklwd            m0, m4
1352
1353    punpckhwd            m3, m0, m1
1354    punpcklwd            m0, m1
1355    punpckhwd            m4, m6, m2
1356    punpcklwd            m6, m2
1357
1358    punpcklwd            m2, m3, m4
1359    punpckhwd            m3, m4
1360    punpckhwd            m1, m0, m6
1361    punpcklwd            m0, m6
1362    ret
        ; DCT first pass: compute, combine into out0-7, saturating-pack pairs
        ; into m0, m2, m4, m6
1363.main:
1364    call .main_pass1
1365    call .round
1366    packssdw             m0, m1
1367    packssdw             m2, m3
1368    packssdw             m4, m5
1369    packssdw             m6, m7
1370    ret
        ; m0-7: x = (x*2896 + 2048) >> 12. The x86-32 variant has no spare
        ; register for the constant, so m7 is bounced through [r3].
1371.rect2_mul:
1372%if ARCH_X86_64
1373    REPX    {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
1374    REPX    {paddd  x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
1375%else
1376    mova               [r3], m7
1377    mova                 m7, [o(pd_2896)]
1378    REPX     {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
1379    pmulld               m7, [r3]
1380    mova               [r3], m7
1381    mova                 m7, [o(pd_2048)]
1382    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
1383    paddd                m7, [r3]
1384%endif
1385    REPX      {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
1386    ret
1387%if ARCH_X86_64
        ; Variant that reads only m0-3: m5, m7, m6 are derived from m3, m1, m2
        ; by single multiplies and m4 is zeroed, then it joins the common tail.
        ; Not referenced within this part of the file.
1388.main_pass1_fast:
1389    pmulld               m5, m3, [o(pd_m2276)]
1390    pmulld               m3, [o(pd_3406)]
1391    pmulld               m7, m1, [o(pd_4017)]
1392    pmulld               m1, [o(pd_799)]
1393    pmulld               m6, m2, [o(pd_3784)]
1394    pmulld               m2, [o(pd_1567)]
1395    pmulld               m0, m14
1396    pxor                 m4, m4
1397    jmp .main_pass1_fast2
        ; Full first pass (x86-64). Expects m11 = pd_2048, m12/m13 = clip
        ; bounds, m14 = pd_2896 as loaded in .pass1_entry; m8-m10 are scratch.
        ; NOTE(review): ITX_MULSUB_2D is defined elsewhere; the `_` argument
        ; presumably leaves the products unrounded, matching the explicit
        ; paddd/psrad at .main_pass1_fast2 - confirm at the macro.
1398.main_pass1:
1399    ITX_MULSUB_2D         5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
1400    ITX_MULSUB_2D         1, 7, 8, 9, 10, _,  799, 4017 ; t4a t7a
1401    ITX_MULSUB_2D         2, 6, 8, 9, 10, _, 1567, 3784 ; t2  t3
1402    REPX    {pmulld x, m14}, m0, m4
1403.main_pass1_fast2:
1404    REPX     {paddd x, m11}, m1, m2, m3, m5, m6, m7
1405    REPX     {psrad x, 12 }, m1, m2, m3, m5, m6, m7
1406    paddd                m8, m1, m5 ; t4
1407    psubd                m1, m5     ; t5a
1408    paddd                m9, m7, m3 ; t7
1409    psubd                m7, m3     ; t6a
        ; clamp the intermediates to [clip_18b_min, clip_18b_max]
1410    REPX    {pmaxsd x, m12}, m1, m8, m7, m9
1411    REPX    {pminsd x, m13}, m1, m8, m7, m9
1412    REPX    {pmulld x, m14}, m7, m1
        ; the rounding term is added once to m0 and m7 before the add/sub
        ; pairs, so all four results below carry it into the >> 12
1413    paddd                m0, m11
1414    paddd                m7, m11
1415    psubd                m5, m0, m4
1416    paddd                m0, m4
1417    psubd                m4, m7, m1
1418    paddd                m7, m1
1419    REPX    {psrad  x, 12 }, m5, m0, m4, m7
1420    psubd                m3, m0, m6 ; dct4 out3
1421    paddd                m0, m6     ; dct4 out0
1422    paddd                m6, m5, m2 ; dct4 out1
1423    psubd                m5, m2     ; dct4 out2
1424    REPX    {pmaxsd x, m12}, m0, m6, m5, m3
1425    REPX    {pminsd x, m13}, m0, m6, m5, m3
1426    ret
        ; Final add/sub stage (x86-64): combines the dct4 outputs in
        ; m0/m6/m5/m3 with m9/m7/m4/m8 into out0-7 in m0-7. There is no ret
        ; here; execution continues at the ret after the %endif below.
1427.round:
1428    paddd                m1, m6, m7 ; out1
1429    psubd                m6, m7     ; out6
1430    psubd                m7, m0, m9 ; out7
1431    paddd                m0, m9     ; out0
1432    paddd                m2, m5, m4 ; out2
1433    psubd                m5, m4     ; out5
1434    psubd                m4, m3, m8 ; out4
1435    paddd                m3, m8     ; out3
1436%else
        ; x86-32 versions of the same routines: only m0-7 exist, so t4 and t7
        ; are kept in the scratch slots [r3+3*16] and [r3+1*16] for .round.
1437.main_pass1_fast:
1438    pmulld               m5, m3, [o(pd_m2276)]
1439    pmulld               m3, [o(pd_3406)]
1440    pmulld               m7, m1, [o(pd_4017)]
1441    pmulld               m1, [o(pd_799)]
1442    pmulld               m6, m2, [o(pd_3784)]
1443    pmulld               m2, [o(pd_1567)]
1444    mova                 m4, [o(pd_2048)]
1445    mova          [r3+0*16], m2
1446    REPX      {paddd x, m4}, m5, m3, m7, m1
1447    REPX      {psrad x, 12}, m5, m3, m7, m1
1448    paddd                m2, m1, m5 ; t4
1449    psubd                m1, m5     ; t5a
1450    pmulld               m5, m0, [o(pd_2896)]
        ; m0 takes over pd_2048, as the common tail expects
1451    mova                 m0, m4
1452    paddd                m4, m7, m3 ; t7
1453    psubd                m7, m3     ; t6a
1454    mova                 m3, [o(clip_18b_min)]
1455    REPX    {pmaxsd x, m3 }, m1, m2, m7, m4
1456    mova                 m3, [o(clip_18b_max)]
1457    REPX    {pminsd x, m3 }, m1, m2, m7, m4
1458    mova          [r3+3*16], m2
1459    mova          [r3+1*16], m4
1460    pxor                 m4, m4
1461    mova                 m2, [r3+0*16]
1462    mova                 m3, [o(pd_2896)]
1463    jmp .main_pass1_fast2
1464.main_pass1:
        ; spill the even inputs so m0, m2, m4, m6 can act as constant/scratch
1465    mova          [r3+0*16], m0
1466    mova          [r3+1*16], m2
1467    mova          [r3+2*16], m4
1468    mova          [r3+3*16], m6
1469    mova                 m0, [o(pd_2048)]
1470    ITX_MULSUB_2D         5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
1471    ITX_MULSUB_2D         1, 7, 2, 4, 6, 0,  799, 4017 ; t4a t7a
1472    paddd                m2, m1, m5 ; t4
1473    psubd                m1, m5     ; t5a
1474    paddd                m4, m7, m3 ; t7
1475    psubd                m7, m3     ; t6a
1476    mova                 m6, [o(clip_18b_min)]
1477    REPX    {pmaxsd x, m6 }, m1, m2, m7, m4
1478    mova                 m6, [o(clip_18b_max)]
1479    REPX    {pminsd x, m6 }, m1, m2, m7, m4
        ; swap t4/t7 out to scratch while reloading inputs 6 and 2
1480    mova                 m6, [r3+3*16]
1481    mova          [r3+3*16], m2
1482    mova                 m2, [r3+1*16]
1483    mova          [r3+1*16], m4
1484
1485    ITX_MULSUB_2D         2, 6, 4, 3, 5, _, 1567, 3784 ; t2  t3
1486    mova                 m3, [o(pd_2896)]
1487    mova                 m5, [r3+0*16]
1488    mova                 m4, [r3+2*16]
1489    REPX    {pmulld x, m3 }, m5, m4
        ; common tail; expects m0 = pd_2048, m3 = pd_2896, m5/m4 = scaled
        ; inputs 0/4, m2/m6 = unrounded t2/t3, m7/m1 = t6a/t5a
1490.main_pass1_fast2:
1491    REPX    {paddd  x, m0 }, m2, m6
1492    REPX    {psrad  x, 12 }, m2, m6
1493    REPX    {pmulld x, m3 }, m7, m1
1494    paddd                m7, m0
1495    paddd                m0, m5
1496
1497    psubd                m5, m0, m4
1498    paddd                m0, m4
1499    psubd                m4, m7, m1
1500    paddd                m7, m1
1501    REPX    {psrad  x, 12 }, m5, m0, m4, m7
1502    psubd                m3, m0, m6 ; dct4 out3
1503    paddd                m0, m6     ; dct4 out0
1504    paddd                m6, m5, m2 ; dct4 out1
1505    psubd                m5, m2     ; dct4 out2
1506
1507    mova                 m1, [o(clip_18b_min)]
1508    REPX    {pmaxsd x, m1 }, m0, m6, m5, m3
1509    mova                 m1, [o(clip_18b_max)]
1510    REPX    {pminsd x, m1 }, m0, m6, m5, m3
1511    ret
        ; Final add/sub stage (x86-32): t7 and t4 come from [r3+1*16] and
        ; [r3+3*16]; out6 is parked in [r3+0*16] while m6 is used as a temp.
1512.round:
1513    paddd                m1, m6, m7 ; out1
1514    psubd                m6, m7     ; out6
1515    mova          [r3+0*16], m6
1516    mova                 m6, [r3+1*16]
1517    psubd                m7, m0, m6 ; out7
1518    paddd                m0, m6     ; out0
1519    paddd                m2, m5, m4 ; out2
1520    psubd                m5, m4     ; out5
1521    mova                 m6, [r3+3*16]
1522    psubd                m4, m3, m6 ; out4
1523    paddd                m3, m6     ; out3
1524    mova                 m6, [r3+0*16]
1525%endif
        ; shared return of both .round variants
1526    ret
1527
1528.pass2:
        ; the second pass works on packed words and reuses the 8 bpc SSSE3 code
1529%if ARCH_X86_32
1530    lea                  r5, [o(itx8_start)]
1531%endif
1532    call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
        ; Shared tail: round m0-3, add to four rows of dst, then clear the
        ; eight 16-byte coefficient slots with m6 (zeroed by the helper).
1533.end:
1534    lea                  r3, [strideq*3]
1535    call .round2_and_write_8x4
1536    REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
1537    RET
        ; Three entry points that fall through into each other:
        ;   .round2_and_write_8x4 loads m6 = 0, m5 = pixel_10bpc_max,
        ;                         m4 = pw_2048
        ;   .round1_and_write_8x4 pmulhrsw m0-3 by the caller-supplied m4
        ;   .write_8x4            add dst rows, clamp to [m6, m5], store
        ; r3 must hold strideq*3.
1538.round2_and_write_8x4:
1539    pxor                 m6, m6
1540    mova                 m5, [o(pixel_10bpc_max)]
1541    mova                 m4, [o(pw_2048)]
1542.round1_and_write_8x4:
1543    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1544.write_8x4:
1545    paddw                m0, [dstq+strideq*0]
1546    paddw                m1, [dstq+strideq*1]
1547    paddw                m2, [dstq+strideq*2]
1548    paddw                m3, [dstq+r3]
1549    REPX     {pminsw x, m5}, m0, m1, m2, m3
1550    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
1551    mova   [dstq+strideq*0], m0
1552    mova   [dstq+strideq*1], m1
1553    mova   [dstq+strideq*2], m2
1554    mova   [dstq+r3       ], m3
1555    ret
1556
; ADST first-pass 8x4 inverse transform (16 bpc). Pass 1 is driven by
; idct_8x4's .pass1_entry with r5 pointing at .main below; .main_pass1 and
; .round are also called by iflipadst_8x4_internal_16bpc.
1557INV_TXFM_8X4_FN adst, dct
1558INV_TXFM_8X4_FN adst, adst
1559INV_TXFM_8X4_FN adst, flipadst
1560INV_TXFM_8X4_FN adst, identity
1561
1562cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1563    lea                  r5, [o(.main)]
1564    jmp m(idct_8x4_internal_16bpc).pass1_entry
        ; compute, fix signs/rounding, saturating-pack pairs into m0, m2, m4, m6
1565.main:
1566    call .main_pass1
1567    call .round
1568    packssdw             m0, m1
1569    packssdw             m2, m3
1570    packssdw             m4, m5
1571    packssdw             m6, m7
1572    ret
        ; On return: m0 = out0, m1 = -out1, m7 = -out7, and m2-m5 hold the
        ; four 2896-scaled sums/differences listed at the end. out6 is in m6 on
        ; x86-64; on x86-32 it is in [r3+3*16] and .round reloads it.
1573.main_pass1:
1574%if ARCH_X86_64
        ; expects m11 = pd_2048, m12/m13 = clip bounds, m14 = pd_2896 as loaded
        ; by idct_8x4's .pass1_entry; m8-m10 are scratch
1575    ITX_MULSUB_2D         7, 0, 8, 9, 10, 11,  401, 4076 ; t1a, t0a
1576    ITX_MULSUB_2D         1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
1577    ITX_MULSUB_2D         5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
1578    ITX_MULSUB_2D         3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
1579    psubd                m8, m2, m6 ; t6
1580    paddd                m2, m6     ; t2
1581    psubd                m6, m0, m4 ; t4
1582    paddd                m0, m4     ; t0
1583    psubd                m4, m5, m1 ; t7
1584    paddd                m5, m1     ; t3
1585    psubd                m1, m7, m3 ; t5
1586    paddd                m7, m3     ; t1
        ; clamp to [clip_18b_min, clip_18b_max]
1587    REPX    {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
1588    REPX    {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
        ; NOTE(review): the final argument 10 of the second call is far too
        ; small to be a coefficient; presumably ITX_MULSUB_2D treats it as the
        ; number of a register still holding a constant from the previous
        ; call - confirm at the macro definition.
1589    ITX_MULSUB_2D         6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
1590    ITX_MULSUB_2D         4, 8, 3, 9, 10, 11, 3784, 10   ; t6a, t7a
1591    psubd                m9, m6, m8 ;  t7
1592    paddd                m6, m8     ;  out6
        ; m8 is loaded here but overwritten by .round before any visible use
1593    mova                 m8, [o(pd_2896)]
1594    psubd                m3, m7, m5 ;  t3
1595    paddd                m7, m5     ; -out7
1596    psubd                m5, m0, m2 ;  t2
1597    paddd                m0, m2     ;  out0
1598    psubd                m2, m1, m4 ;  t6
1599    paddd                m1, m4     ; -out1
1600    REPX    {pmaxsd x, m12}, m5, m3, m2, m9
1601    REPX    {pminsd x, m13}, m5, m3, m2, m9
1602    REPX    {pmulld x, m14}, m5, m3, m2, m9
1603    psubd               m4, m5, m3 ; (t2 - t3) * 2896
1604    paddd               m3, m5     ; (t2 + t3) * 2896
1605    psubd               m5, m2, m9 ; (t6 - t7) * 2896
1606    paddd               m2, m9     ; (t6 + t7) * 2896
1607    ret
        ; x86-64 .round has no ret of its own; it continues at the ret after
        ; the %endif below.
1608.round:
1609
1610    ; m0=out0,m1=-out1,m6=out6,m7=-out7
1611
        ; m8 = all ones. m1 and m7 get ~x followed by -(-1), i.e. a full
        ; negation; m3 and m5 are only complemented (~x) before the rounding
        ; add and shift that is applied to m2-m5.
1612    pcmpeqd              m8, m8
1613    REPX     {pxor  x, m8 }, m1, m7, m3, m5
1614    REPX     {psubd x, m8 }, m1, m7
1615    REPX     {paddd x, m11}, m2, m3, m4, m5
1616    REPX     {psrad x, 12 }, m2, m3, m4, m5
1617%else
        ; x86-32: same computation with eight registers; values that do not
        ; fit are rotated through the four scratch slots at [r3+0..3*16].
1618    mova          [r3+0*16], m2
1619    mova          [r3+1*16], m3
1620    mova          [r3+2*16], m4
1621    mova          [r3+3*16], m5
1622    mova                 m5, [o(pd_2048)]
1623
1624    ITX_MULSUB_2D         7, 0, 2, 3, 4, 5,  401, 4076 ; t1a, t0a
1625    ITX_MULSUB_2D         1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
        ; swap: reload inputs 2, 3, 4, 5 (the latter into m1) while parking
        ; t0a, t7a, t6a, t1a in the scratch slots
1626    mova                 m2, [r3+0*16]
1627    mova                 m3, [r3+1*16]
1628    mova                 m4, [r3+2*16]
1629    mova          [r3+0*16], m0
1630    mova          [r3+1*16], m1
1631    mova          [r3+2*16], m6
1632    mova                 m1, [r3+3*16]
1633    mova          [r3+3*16], m7
1634    ITX_MULSUB_2D         1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
1635    ITX_MULSUB_2D         3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
1636    mova                 m0, [r3+0*16]
1637    mova                 m6, [r3+2*16]
1638    psubd                m7, m2, m6 ; t6
1639    paddd                m2, m6     ; t2
1640    psubd                m6, m0, m4 ; t4
1641    paddd                m0, m4     ; t0
1642    mova          [r3+0*16], m7
1643    mova                 m5, [r3+1*16]
1644    mova                 m7, [r3+3*16]
1645    psubd                m4, m1, m5 ; t7
1646    paddd                m5, m1     ; t3
1647    psubd                m1, m7, m3 ; t5
1648    paddd                m7, m3     ; t1
        ; clamp all eight values; t6 sits in [r3+0*16] and t1 is bounced
        ; through [r3+1*16], so the min/max constants double as destinations
        ; (m3 ends up holding clamped t6, m7 clamped t1)
1649    mova                 m3, [o(clip_18b_min)]
1650    REPX    {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
1651    mova          [r3+1*16], m7
1652    mova                 m7, [o(clip_18b_max)]
1653    pmaxsd               m3, [r3+0*16]
1654    REPX    {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
1655    pminsd               m7, [r3+1*16]
1656    mova          [r3+0*16], m0
1657    mova          [r3+1*16], m2
1658    mova          [r3+2*16], m5
1659    mova          [r3+3*16], m7
1660    mova                 m0, [o(pd_2048)]
        ; NOTE(review): as on x86-64, the final argument 7 presumably names a
        ; register holding a constant from the previous call - confirm at the
        ; ITX_MULSUB_2D definition.
1661    ITX_MULSUB_2D         6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
1662    ITX_MULSUB_2D         4, 3, 2, 5, 7, 0, 3784, 7    ; t6a, t7a
1663    mova                 m5, [r3+2*16]
1664    mova                 m7, [r3+3*16]
1665    psubd                m2, m6, m3 ;  t7
1666    paddd                m6, m3     ;  out6
        ; out6 waits in [r3+3*16] until .round reloads it
1667    mova          [r3+3*16], m6
1668    mova                 m0, [r3+0*16]
1669    mova                 m6, [r3+1*16]
1670    psubd                m3, m7, m5 ;  t3
1671    paddd                m7, m5     ; -out7
1672    psubd                m5, m0, m6 ;  t2
1673    paddd                m0, m6     ;  out0
1674    psubd                m6, m1, m4 ;  t6
1675    paddd                m1, m4     ; -out1
1676    mova                 m4, [o(clip_18b_min)]
1677    REPX    {pmaxsd x, m4 }, m5, m3, m6, m2
1678    mova                 m4, [o(clip_18b_max)]
1679    REPX    {pminsd x, m4 }, m5, m3, m6, m2
1680    mova                 m4, [o(pd_2896)]
1681    REPX    {pmulld x, m4 }, m5, m3, m6, m2
1682    psubd               m4, m5, m3 ; (t2 - t3) * 2896
1683    paddd               m3, m5     ; (t2 + t3) * 2896
1684    psubd               m5, m6, m2 ; (t6 - t7) * 2896
1685    paddd               m2, m6     ; (t6 + t7) * 2896
1686    ret
1687.round:
        ; out0 is parked in [r3+2*16] so m0 can hold the all-ones mask; m6 is
        ; free here because out6 lives in [r3+3*16]
1688    mova          [r3+2*16], m0
1689
1690    pcmpeqd              m0, m0
1691    mova                 m6, [o(pd_2048)]
        ; same sign handling and rounding as the x86-64 branch above
1692    REPX     {pxor  x, m0 }, m1, m7, m3, m5
1693    REPX     {psubd x, m0 }, m1, m7
1694    REPX     {paddd x, m6 }, m2, m3, m4, m5
1695    REPX     {psrad x, 12 }, m2, m3, m4, m5
1696
1697    mova                 m6, [r3+3*16]
1698    mova                 m0, [r3+2*16]
1699%endif
        ; shared return of both .round variants
1700    ret
1701
1702.pass2:
        ; the second pass works on packed words and reuses the 8 bpc SSSE3
        ; code, then the shared write-out of idct_8x4
1703%if ARCH_X86_32
1704    lea                  r5, [o(itx8_start)]
1705%endif
1706    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
1707    jmp m(idct_8x4_internal_16bpc).end
1708
; Flipped-ADST first-pass 8x4 inverse transform (16 bpc). It reuses the ADST
; kernels; pass 1 flips by packing the outputs in reverse order, pass 2 flips
; by writing the rows bottom-up.
1709INV_TXFM_8X4_FN flipadst, dct
1710INV_TXFM_8X4_FN flipadst, adst
1711INV_TXFM_8X4_FN flipadst, flipadst
1712INV_TXFM_8X4_FN flipadst, identity
1713
1714cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1715    lea                  r5, [o(.main)]
1716    jmp m(idct_8x4_internal_16bpc).pass1_entry
1717.main:
1718    call m(iadst_8x4_internal_16bpc).main_pass1
1719    call m(iadst_8x4_internal_16bpc).round
        ; pack in reverse order (7/6, 5/4, 3/2, 1/0) ...
1720    packssdw             m7, m6
1721    packssdw             m5, m4
1722    packssdw             m3, m2
1723    packssdw             m1, m0
        ; ... then move the pairs to m0, m2, m4, m6, the registers the
        ; transpose in idct_8x4's .pass1_entry reads
1724    mova                 m0, m7
1725    mova                 m2, m5
1726    mova                 m4, m3
1727    mova                 m6, m1
1728    ret
1729.pass2:
1730%if ARCH_X86_32
1731    lea                  r5, [o(itx8_start)]
1732%endif
1733    call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
        ; point dst at row 3 and negate the stride so the shared write-out
        ; stores the rows in reverse; .end recomputes r3 from the new stride
1734    lea                  r3, [strideq*3]
1735    add                dstq, r3
1736    neg             strideq
1737    jmp m(idct_8x4_internal_16bpc).end
1738
; Identity first-pass 8x4 inverse transform (16 bpc). Pass 1 is driven by
; idct_8x4's .pass1_entry, which applies .rect2_mul before calling .main.
1739INV_TXFM_8X4_FN identity, dct
1740INV_TXFM_8X4_FN identity, adst
1741INV_TXFM_8X4_FN identity, flipadst
1742INV_TXFM_8X4_FN identity, identity
1743
1744cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1745    lea                  r5, [o(.main)]
1746    jmp m(idct_8x4_internal_16bpc).pass1_entry
1747.main:
        ; double every value, then saturating-pack pairs into m0, m2, m4, m6
1748    REPX       {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
1749    packssdw             m0, m1
1750    packssdw             m2, m3
1751    packssdw             m4, m5
1752    packssdw             m6, m7
1753    ret
1754.pass2:
        ; x += pmulhrsw(x, pw_1697x8) for m0-3 (saturating add); m7 holds the
        ; constant and is consumed last as the temporary for m3
1755    mova                 m7, [o(pw_1697x8)]
1756    pmulhrsw             m4, m7, m0
1757    pmulhrsw             m5, m7, m1
1758    pmulhrsw             m6, m7, m2
1759    pmulhrsw             m7, m3
1760    paddsw               m0, m4
1761    paddsw               m1, m5
1762    paddsw               m2, m6
1763    paddsw               m3, m7
1764    jmp m(idct_8x4_internal_16bpc).end
1765
; INV_TXFM_8X8_FN type1, type2 [, eob_offset = 0]
; Instantiates one 8x8 inverse-transform entry point through INV_TXFM_FN
; (defined outside this excerpt), with per-architecture register-count and
; stack-size arguments. For the dct_dct instantiation it also emits the
; DC-only path below, which adds one value to every pixel of the block.
; The .end label is re-used by the 8x16 dct_dct entry point, which jumps
; here with r3d = 4.
1766%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
1767%if ARCH_X86_64
1768    INV_TXFM_FN          %1, %2, %3, 8x8, 15, 0-3*16
1769%else
1770    INV_TXFM_FN          %1, %2, %3, 8x8, 8, 0-5*16
1771%endif
1772%ifidn %1_%2, dct_dct
    ; r5d = coefficient 0 * 181, then overwrite that coefficient with eobd
    ; (the original note "0" indicates eob is zero on this path)
1773    imul                r5d, [cq], 181
1774    mov                [cq], eobd ; 0
    ; r3d = loop count; each iteration below covers 4 rows
1775    mov                 r3d, 2
1776.end:
    ; r5d = (r5d + 384) >> 9
1777    add                 r5d, 384
1778    sar                 r5d, 9
1779.end2:
    ; r5d = r5d * 2896 + 34816; the pshuflw below keeps bits 16-31 of it
1780    imul                r5d, 2896
1781    add                 r5d, 34816
1782    movd                 m0, r5d
    ; broadcast word 1 of m0 (the high half of r5d) to all eight words
1783    pshuflw              m0, m0, q1111
1784    punpcklqdq           m0, m0
    ; m6 = upper clamp, m5 = 0 (lower clamp), r2 = stride * 3
1785    mova                 m6, [o(pixel_10bpc_max)]
1786    pxor                 m5, m5
1787    lea                  r2, [strideq*3]
1788.loop:
    ; load four 16-byte rows, add the DC value, clamp to [0, max], store
1789    mova                 m1, [dstq+strideq*0]
1790    mova                 m2, [dstq+strideq*1]
1791    mova                 m3, [dstq+strideq*2]
1792    mova                 m4, [dstq+r2]
1793    REPX      {paddw x, m0}, m1, m2, m3, m4
1794    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
1795    REPX     {pminsw x, m6}, m1, m2, m3, m4
1796    mova   [dstq+strideq*0], m1
1797    mova   [dstq+strideq*1], m2
1798    mova   [dstq+strideq*2], m3
1799    mova   [dstq+r2       ], m4
1800    lea                dstq, [dstq+strideq*4]
1801    dec                 r3d
1802    jg .loop
1803    RET
1804%endif
1805%endmacro
1806
; 8x8 entry points whose first-pass transform is dct. The third argument is
; the macro's eob_offset parameter (forwarded to INV_TXFM_FN).
1807INV_TXFM_8X8_FN dct, dct
1808INV_TXFM_8X8_FN dct, identity, 6
1809INV_TXFM_8X8_FN dct, adst
1810INV_TXFM_8X8_FN dct, flipadst
1811
; idct_8x8_internal_16bpc
;   entry/.pass1_full:   shared pass-1 driver; the adst and flipadst 8x8
;                        functions jump to .pass1_full with their own kernel
;                        address in t0
;   .pass1_main:         dct pass-1 kernel, called through t0
;   .pack_and_transpose: pack dwords to words, then transpose (shared tail)
;   .pass2/.zero:        second pass, output, and clearing of the coefficients
;   .roundN_and_write_8x8/.write_8x8: shared rounding and pixel output helpers
1812cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1813%if ARCH_X86_32
    ; t0 aliases r1 on x86-32, so r1 is parked on the stack here and
    ; reloaded at .end_pass1
1814    DECLARE_REG_TMP 1
1815    mov [rsp+4*16+1*gprsize], r1
1816%else
1817    DECLARE_REG_TMP 6
1818%endif
1819    lea                  t0, [o(.pass1_main)]
1820
1821.pass1_full:
1822%if ARCH_X86_64
    ; constants kept in registers for the pass-1 kernels
1823    mova                m11, [o(pd_2048)]
1824    mova                m12, [o(clip_18b_min)]
1825    mova                m13, [o(clip_18b_max)]
1826    mova                m14, [o(pd_2896)]
1827%endif
1828%undef cmp
    ; r5d = (eob >= 10) ? 16 : 0 -- byte offset of the first 16-byte half to
    ; process (both architecture variants compute the same value)
1829%if ARCH_X86_64
1830    xor                 r5d, r5d
1831    cmp                eobd, 10
1832    setge               r5b
1833%else
1834    mov                 r5d, 1
1835    cmp                eobd, 10
1836    sbb                 r5d, 0
1837%endif
1838    shl                 r5d, 4
1839%if ARCH_X86_32
    ; r3 = base of the stack scratch area used by the kernels
1840    lea                  r3, [rsp+gprsize]
1841%endif
1842.loop_pass1:
    ; load one 16-byte half (4 dwords) from each of the eight 32-byte rows
1843    mova                 m0, [cq+0*32+r5]
1844    mova                 m1, [cq+1*32+r5]
1845    mova                 m2, [cq+2*32+r5]
1846    mova                 m3, [cq+3*32+r5]
1847    mova                 m4, [cq+4*32+r5]
1848    mova                 m5, [cq+5*32+r5]
1849    mova                 m6, [cq+6*32+r5]
1850    mova                 m7, [cq+7*32+r5]
1851    call                 t0
1852
    ; r5 == 0 means the half at offset 0 was just processed: done
1853    test                r5d, r5d
1854    jz .end_pass1
1855
    ; otherwise park the results of the +16 half in the coefficient buffer
    ; and go round once more for the half at offset 0
1856    mova       [cq+0*32+16], m0
1857    mova       [cq+1*32+16], m1
1858    mova       [cq+2*32+16], m2
1859    mova       [cq+3*32+16], m3
1860
1861    sub                 r5d, 16
1862    jmp .loop_pass1
1863.end_pass1:
    ; m0-m3 hold the offset-0 results; m4-m7 are read back from the +16
    ; slots. When eob < 10 those slots were never written by the loop, so
    ; the raw buffer contents are loaded (presumably zero coefficients --
    ; TODO confirm)
1864    mova                 m4, [cq+0*32+16]
1865    mova                 m5, [cq+1*32+16]
1866    mova                 m6, [cq+2*32+16]
1867    mova                 m7, [cq+3*32+16]
1868%if ARCH_X86_32
1869    mov                  r1, [rsp+4*16+1*gprsize]
1870%endif
    ; continue with the second pass selected by the entry point
1871    jmp                tx2q
1872.pass1_main:
1873    call m(idct_8x4_internal_16bpc).main_pass1
    ; m1 = all-ones (-1); subtracting it adds 1 to m0/m6/m5/m3 ahead of
    ; .round and the final >> 1
1874    pcmpeqd              m1, m1
1875    REPX      {psubd x, m1}, m0, m6, m5, m3
1876    call m(idct_8x4_internal_16bpc).round
1877    REPX      {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
1878.pack_and_transpose:
    ; pack dword pairs to saturated words, then tail-call the transpose
1879    packssdw             m2, m3
1880    packssdw             m6, m7
1881    packssdw             m0, m1
1882    packssdw             m4, m5
1883    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
1884
1885.pass2:
1886%if ARCH_X86_32
    ; r5 = itx8_start, presumably the base the o() macro of the 8bpc code
    ; addresses its constants from -- TODO confirm against the 8bpc source
1887    lea                  r5, [o(itx8_start)]
1888%endif
1889    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
1890    lea                  r3, [strideq*3]
1891%if ARCH_X86_64
    ; m10 = pixel maximum, m9 = zero, as .write_8x8 expects
1892    mova                m10, [o(pixel_10bpc_max)]
1893    pxor                 m9, m9
1894%endif
1895    call .round3_and_write_8x8
1896.zero:
    ; clear 16 * 16 bytes of the coefficient buffer, then return
1897%if ARCH_X86_64
1898%define mzero m9
1899%else
1900%define mzero m7
1901    pxor                 m7, m7
1902%endif
1903    REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1904%undef mzero
1905    RET
1906
1907    ; round before writing: pmulhrsw by pw_2048 is a rounded right-shift by 4
    ; (.round1/.round4 use whatever multiplier the caller put in m8/m7; the
    ; identity 8x8 function passes pw_4096)
1908    ; data in m0-7
1909    ; on x86-64, the multiplier for .round1/.round2 (normally pw_2048) is in m8
1910    ; .round1 is for m0-7
1911    ; .round2 is for m0-6 & [rsp+gprsize*2]
1912    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
1913    ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7
1914%if ARCH_X86_32
1915.round1_and_write_8x8:
    ; x86-32: m7 is needed for the constant, so its data goes to the stack
1916    mova    [rsp+gprsize*2], m7
1917.round2_and_write_8x8:
1918%endif
1919.round3_and_write_8x8:
1920    mova                 m7, [o(pw_2048)]
1921%if ARCH_X86_32
1922.round4_and_write_8x8:
1923%endif
    ; m7 holds the multiplier; the eighth data row comes from the stack slot
1924    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1925    pmulhrsw             m7, [rsp+gprsize*2]
1926%if ARCH_X86_64
1927    jmp .write_8x8
1928.round2_and_write_8x8:
1929    mova                 m7, [rsp+gprsize*2]
1930.round1_and_write_8x8:
1931    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1932%endif
1933
1934    ; m0-7 have to-be-written data [pre-rounded]
1935    ; on x86-64, m9-10 contain a zero/pixel_max
1936    ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
1937    ; r0,1,3 contain dstq/strideq/stride3q
1938    ; r5 is a scratch register
1939.write_8x8:
    ; r5 = address of row 4; add the eight destination rows to the residual
1940    lea                  r5, [dstq+strideq*4]
1941    paddw                m0, [dstq+strideq*0]
1942    paddw                m1, [dstq+strideq*1]
1943    paddw                m2, [dstq+strideq*2]
1944    paddw                m3, [dstq+r3]
1945    paddw                m4, [r5  +strideq*0]
1946    paddw                m5, [r5  +strideq*1]
1947    paddw                m6, [r5  +strideq*2]
1948    paddw                m7, [r5  +r3]
    ; clamp every row to [0, pixel_10bpc_max]
1949%if ARCH_X86_64
1950    REPX    {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
1951    REPX    {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
1952%else
    ; only m0-m7 are used here: row 7 is spilled while m7 carries each
    ; clamp constant in turn, and is clamped from memory
1953    mova    [rsp+gprsize*2], m7
1954    pxor                 m7, m7
1955    REPX     {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1956    pmaxsw               m7, [rsp+gprsize*2]
1957    mova    [rsp+gprsize*2], m7
1958    mova                 m7, [o(pixel_10bpc_max)]
1959    REPX     {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
1960    pminsw               m7, [rsp+gprsize*2]
1961%endif
1962    mova   [dstq+strideq*0], m0
1963    mova   [dstq+strideq*1], m1
1964    mova   [dstq+strideq*2], m2
1965    mova   [dstq+r3       ], m3
1966    mova   [r5  +strideq*0], m4
1967    mova   [r5  +strideq*1], m5
1968    mova   [r5  +strideq*2], m6
1969    mova   [r5  +r3       ], m7
1970    ret
1971
; 8x8 entry points whose first-pass transform is adst. The third argument is
; the macro's eob_offset parameter (forwarded to INV_TXFM_FN).
1972INV_TXFM_8X8_FN adst, dct
1973INV_TXFM_8X8_FN adst, adst
1974INV_TXFM_8X8_FN adst, flipadst
1975INV_TXFM_8X8_FN adst, identity, 6
1976
; iadst_8x8_internal_16bpc
;   entry:       sets t0 to .pass1_main and joins the shared idct 8x8 driver
;   .pass1_main: adst pass-1 kernel (8x4 adst math + .round + pack/transpose)
;   .round:      per-register rounding/negation after the pass-1 math; also
;                called by the flipadst 8x8 kernel
;   .pass2:      8bpc SSSE3 adst kernel, rounding with alternating sign, output
1977cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
1978%if ARCH_X86_32
    ; t0 aliases r1 on x86-32 (DECLARE_REG_TMP 1 in the idct 8x8 function),
    ; so r1 is saved; the shared driver reloads it before jumping to tx2q
1979    mov [rsp+4*16+1*gprsize], r1
1980%endif
1981    lea                  t0, [o(.pass1_main)]
1982    jmp m(idct_8x8_internal_16bpc).pass1_full
1983.pass1_main:
1984    call m(iadst_8x4_internal_16bpc).main_pass1
1985    call .round
1986    jmp m(idct_8x8_internal_16bpc).pack_and_transpose
1987.round:
    ; Both variants apply the same per-register operations:
    ;   m0, m6:  (x + 1) >> 1
    ;   m1, m7:  (~x >> 1) + 1
    ;   m3, m5:  (~x + pd_6144) >> 13
    ;   m2, m4:  ( x + pd_6144) >> 13
1988%if ARCH_X86_64
1989    pcmpeqd              m8, m8         ; -1
1990    REPX     {psubd x, m8 }, m0, m6
1991    REPX     {pxor  x, m8 }, m1, m7, m3, m5
1992    REPX     {psrad x, 1  }, m0, m1, m6, m7
1993    REPX     {psubd x, m8 }, m1, m7
1994    mova                 m8, [o(pd_6144)]
1995    REPX     {paddd x, m8 }, m2, m3, m4, m5
1996    REPX     {psrad x, 13 }, m2, m3, m4, m5
1997%else
    ; x86-32: m0 is spilled so it can hold the -1 constant, and m6 is used
    ; for pd_6144; the value that belongs in m6 is read from [r3+3*16]
    ; below (presumably stored there by main_pass1 -- TODO confirm)
1998    mova          [r3+2*16], m0
1999
2000    pcmpeqd              m0, m0         ; -1
2001    mova                 m6, [o(pd_6144)]
2002    REPX     {pxor  x, m0 }, m1, m7, m3, m5
2003    REPX     {psrad x, 1  }, m1, m7
2004    REPX     {psubd x, m0 }, m1, m7
2005    REPX     {paddd x, m6 }, m2, m3, m4, m5
2006    REPX     {psrad x, 13 }, m2, m3, m4, m5
2007
2008    mova                 m0, [r3+2*16]
2009    psrld                m6, 12         ; +1
2010    paddd                m0, m6
2011    paddd                m6, [r3+3*16]
2012    REPX     {psrad x, 1  }, m0, m6
2013%endif
2014    ret
2015
2016.pass2:
2017%if ARCH_X86_32
    ; r5 = itx8_start, presumably the base the o() macro of the 8bpc code
    ; addresses its constants from -- TODO confirm against the 8bpc source
2018    lea                  r5, [o(itx8_start)]
2019%endif
2020    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
2021    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
2022    lea                  r3, [strideq*3]
2023%if ARCH_X86_64
    ; m10 = pixel maximum, m9 = zero, as .write_8x8 expects
2024    mova                m10, [o(pixel_10bpc_max)]
2025    pxor                 m9, m9
2026%endif
2027    call .round3_and_write_8x8
    ; shared tail: clear the coefficient buffer and return
2028    jmp m(idct_8x8_internal_16bpc).zero
2029
2030    ; round (pmulhrsw by +/-2048, i.e. a rounded right-shift by 4) before writing; odd registers are negated
2031    ; data in m0-7
2032    ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
2033    ; .round1 is for m0-7
2034    ; .round2 is for m0-6 & [rsp+gprsize*2]
2035    ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
2036%if ARCH_X86_64
2037.round2_and_write_8x8:
2038    mova                 m7, [rsp+gprsize*2]
2039.round1_and_write_8x8:
    ; even registers use m8, odd registers use m11 (the negated constant)
2040    REPX  {pmulhrsw x, m8 }, m0, m2, m4, m6
2041    REPX  {pmulhrsw x, m11}, m1, m3, m5, m7
2042    jmp m(idct_8x8_internal_16bpc).write_8x8
2043%else
2044.round1_and_write_8x8:
    ; x86-32: m7 is needed for the constants, so its data goes to the stack
2045    mova    [rsp+gprsize*2], m7
2046.round2_and_write_8x8:
2047%endif
2048.round3_and_write_8x8:
    ; m7 carries pw_2048 for the even registers, then pw_m2048 for the odd
    ; ones; the eighth (odd) row is multiplied straight from its stack slot
2049    mova                 m7, [o(pw_2048)]
2050    REPX   {pmulhrsw x, m7}, m0, m2, m4, m6
2051    mova                 m7, [o(pw_m2048)]
2052    REPX   {pmulhrsw x, m7}, m1, m3, m5
2053    pmulhrsw             m7, [rsp+gprsize*2]
2054    jmp m(idct_8x8_internal_16bpc).write_8x8
2055
; 8x8 entry points whose first-pass transform is flipadst. The third argument
; is the macro's eob_offset parameter (forwarded to INV_TXFM_FN).
2056INV_TXFM_8X8_FN flipadst, dct
2057INV_TXFM_8X8_FN flipadst, adst
2058INV_TXFM_8X8_FN flipadst, flipadst
2059INV_TXFM_8X8_FN flipadst, identity, 6
2060
; iflipadst_8x8_internal_16bpc
;   entry:       sets t0 to .pass1_main and joins the shared idct 8x8 driver
;   .pass1_main: adst pass-1 math with the output register order reversed
;   .pass2:      adst pass 2, written through a negated stride from the last row
2061cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2062%if ARCH_X86_32
    ; t0 aliases r1 on x86-32 (DECLARE_REG_TMP 1 in the idct 8x8 function),
    ; so r1 is saved; the shared driver reloads it before jumping to tx2q
2063    mov [rsp+4*16+1*gprsize], r1
2064%endif
2065    lea                  t0, [o(.pass1_main)]
2066    jmp m(idct_8x8_internal_16bpc).pass1_full
2067.pass1_main:
2068    call m(iadst_8x4_internal_16bpc).main_pass1
2069    call m(iadst_8x8_internal_16bpc).round
2070    ; invert registers
    ; (pack m7:m6 ... m1:m0 to words, then move them into m0/m2/m4/m6, so
    ; the transpose below sees the rows in reverse order)
2071    packssdw             m7, m6
2072    packssdw             m5, m4
2073    packssdw             m3, m2
2074    packssdw             m1, m0
2075    mova                 m0, m7
2076    mova                 m2, m5
2077    mova                 m4, m3
2078    mova                 m6, m1
2079    jmp m(idct_8x4_internal_16bpc).transpose4x8packed
2080
2081.pass2:
    ; dstq += 7*stride (8*stride - stride), stride = -stride; the adst
    ; pass 2 then writes the block starting from its last row
2082    lea                dstq, [dstq+strideq*8]
2083    sub                dstq, strideq
2084    neg             strideq
2085    jmp m(iadst_8x8_internal_16bpc).pass2
2086
; 8x8 entry points whose first-pass transform is identity.
2087INV_TXFM_8X8_FN identity, dct
2088INV_TXFM_8X8_FN identity, adst
2089INV_TXFM_8X8_FN identity, flipadst
2090INV_TXFM_8X8_FN identity, identity
2091
; iidentity_8x8_internal_16bpc
;   entry:  pass 1 only packs the 32-bit coefficients to 16-bit words and
;           hands over to the 8bpc SSSE3 code at pass1_end3 (outside this
;           file excerpt; presumably the transpose and the jump to tx2q)
;   .pass2: pmulhrsw by pw_4096 (a rounded right-shift by 3), add to the
;           destination, clamp, store, clear the coefficients
2092cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; low 16-byte half of each 32-byte row ...
2093    mova                 m0, [cq+0*32]
2094    mova                 m1, [cq+1*32]
2095    mova                 m2, [cq+2*32]
2096    mova                 m3, [cq+3*32]
2097    mova                 m4, [cq+4*32]
2098    mova                 m5, [cq+5*32]
2099    mova                 m6, [cq+6*32]
2100    mova                 m7, [cq+7*32]
    ; ... packed with the high half into eight signed-saturated words per row
2101    packssdw             m0, [cq+0*32+16]
2102    packssdw             m1, [cq+1*32+16]
2103    packssdw             m2, [cq+2*32+16]
2104    packssdw             m3, [cq+3*32+16]
2105    packssdw             m4, [cq+4*32+16]
2106    packssdw             m5, [cq+5*32+16]
2107    packssdw             m6, [cq+6*32+16]
2108    packssdw             m7, [cq+7*32+16]
    ; m6 is also stored to the stack before entering the 8bpc code
    ; (presumably where pass1_end3 expects it -- TODO confirm)
2109    mova [rsp+gprsize+16*1], m6
2110    jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
2111
2112.pass2:
2113%if ARCH_X86_32
2114    lea                  r5, [o(itx8_start)]
2115%endif
2116    lea                  r3, [strideq*3]
2117%if ARCH_X86_64
    ; m10 = pixel maximum, m9 = zero, m8 = multiplier used by .round1
2118    mova                m10, [o(pixel_10bpc_max)]
2119    pxor                 m9, m9
2120    mova                 m8, [o(pw_4096)]
2121    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
2122%else
    ; x86-32: row 7 goes to the stack slot that .round4 reads as
    ; [rsp+gprsize*2] once the call has pushed its return address, and m7
    ; carries the multiplier
2123    mova      [rsp+gprsize], m7
2124    mova                 m7, [o(pw_4096)]
2125    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
2126%endif
    ; shared tail: clear the coefficient buffer and return
2127    jmp m(idct_8x8_internal_16bpc).zero
2128
; INV_TXFM_8X16_FN type1, type2 [, eob_tbl_suffix = 2d]
; Instantiates one 8x16 inverse-transform entry point through INV_TXFM_FN
; (defined outside this excerpt), passing the table name tbl_8x16_<suffix>
; in the position where the 8x8 macro passes its eob offset. For dct_dct it
; also emits a DC-only path that re-uses the 8x8 DC-only loop.
2129%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
2130%if ARCH_X86_64
2131    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
2132%else
2133    INV_TXFM_FN          %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
2134%endif
2135%ifidn %1_%2, dct_dct
    ; r5d = ((coefficient 0 * 181 + 128) >> 8) * 181; the coefficient is
    ; overwritten with eobd (the original note "0" indicates eob is zero here)
2136    imul                r5d, [cq], 181
2137    mov                [cq], eobd ; 0
2138    add                 r5d, 128
2139    sar                 r5d, 8
2140    imul                r5d, 181
    ; four iterations of the 4-row loop = 16 rows
2141    mov                 r3d, 4
2142%if stack_size_padded > 0
2143    ; adjust to caller's stack allocation
    ; (this function reserves 16*16 / 17*16 bytes, the 8x8 one 3*16 / 5*16;
    ; the difference is 13*16 on x86-64 and 12*16 on x86-32)
2144    add                 rsp, (12+ARCH_X86_64)*16
2145%endif
2146    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
2147%endif
2148%endmacro
2149
; 8x16 entry points whose first-pass transform is dct. The third argument
; selects the eob table suffix (tbl_8x16_v here, tbl_8x16_2d by default).
2150INV_TXFM_8X16_FN dct, dct
2151INV_TXFM_8X16_FN dct, identity, v
2152INV_TXFM_8X16_FN dct, adst
2153INV_TXFM_8X16_FN dct, flipadst
2154
; On x86-64 the 8x16 functions use r7 as t0; on x86-32 the earlier
; DECLARE_REG_TMP 1 (t0 = r1) stays in effect.
2155%if ARCH_X86_64
2156DECLARE_REG_TMP 7
2157%endif
2158
; idct_8x16_internal_16bpc
;   entry/.pass1_full: shared 8x16 pass-1 driver; the adst, flipadst and
;                      identity 8x16 functions jump to .pass1_full with their
;                      own kernel address in t0
;   .pass2:            16-point second pass via the 8bpc SSSE3 kernels, then
;                      output as two 8x8 halves
2159cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2160%if WIN64
    ; t0 is r7 here; it is saved on WIN64 only (presumably because r7 is
    ; callee-saved under that ABI mapping -- see x86inc)
2161    PUSH                 r7
2162%elif ARCH_X86_32
    ; r1 doubles as t0 and r6 is used as the loop counter below, so both
    ; are parked on the stack
2163    mov [rsp+16*16+gprsize*1], r1
2164    mov [rsp+16*16+gprsize*2], r6
2165%endif
2166    lea                  t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
2167.pass1_full:
2168%if ARCH_X86_64
    ; constants kept in registers for the pass-1 kernels
2169    mova                m11, [o(pd_2048)]
2170    mova                m12, [o(clip_18b_min)]
2171    mova                m13, [o(clip_18b_max)]
2172    mova                m14, [o(pd_2896)]
2173%endif
2174%undef cmp
    ; Walk the byte table at r5 (presumably the tbl_8x16_* eob table set up
    ; by INV_TXFM_FN -- TODO confirm) downwards from index 3 until
    ; eob >= table[r6] (signed byte compare); r6 is then the index of the
    ; last 16-byte chunk that needs processing.
2175    mov                 r6d, 4
2176.zero_loop:
2177    dec                 r6d
2178    cmp                eobb, byte [r5+r6]
2179    jl .zero_loop
    ; r5d = byte offset of that chunk within each 64-byte row
2180    mov                 r5d, r6d
2181    shl                 r5d, 4
2182%if ARCH_X86_32
2183    ; restore pic-ptr
2184    mov                  r6, [rsp+16*16+2*gprsize]
2185    ; setup stack pointer
2186    lea                  r3, [rsp+gprsize]
2187%endif
2188.loop_pass1:
    ; load one 16-byte chunk (4 dwords) from each of the first eight
    ; 64-byte rows
2189    mova                 m0, [cq+0*64+r5]
2190    mova                 m1, [cq+1*64+r5]
2191    mova                 m2, [cq+2*64+r5]
2192    mova                 m3, [cq+3*64+r5]
2193    mova                 m4, [cq+4*64+r5]
2194    mova                 m5, [cq+5*64+r5]
2195    mova                 m6, [cq+6*64+r5]
2196    mova                 m7, [cq+7*64+r5]
    ; rectangular-block scaling (rect2_mul is outside this excerpt), then
    ; the pass-1 kernel selected by the caller
2197    call m(idct_8x4_internal_16bpc).rect2_mul
2198    call                 t0
2199
    ; write the four packed result registers back over the input chunk
2200    mova       [cq+0*64+r5], m0
2201    mova       [cq+1*64+r5], m1
2202    mova       [cq+2*64+r5], m2
2203    mova       [cq+3*64+r5], m3
    ; next lower chunk; stop once offset 0 has been processed
2204    sub                 r5d, 16
2205    jge .loop_pass1
2206%if WIN64
2207    POP                  r7
2208%elif ARCH_X86_32
2209    mov                  r1, [rsp+16*16+1*gprsize]
2210%endif
    ; continue with the second pass selected by the entry point
2211    jmp                tx2q
2212
2213.pass2:
2214%if ARCH_X86_32
    ; r5 = itx8_start, presumably the base the o() macro of the 8bpc code
    ; addresses its constants from -- TODO confirm against the 8bpc source
2215    lea                  r5, [o(itx8_start)]
2216%endif
2217
2218    ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
2219    ; some are still pre-loaded from the final loop iteration in pass=1
2220
    ; m0 (slot 0) is still live from pass 1; m2 (slot 8) moves to m1, and
    ; slots 1/9, 2/10, 3/11 are loaded to form the first kernel's input
2221    mova                 m1, m2
2222    mova                 m2, [cq+ 1*16]
2223    mova                 m3, [cq+ 9*16]
2224    mova                 m4, [cq+ 2*16]
2225    mova                 m5, [cq+10*16]
2226    mova                 m6, [cq+ 3*16]
2227    mova                 m7, [cq+11*16]
2228    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    ; spill the first kernel's results for the 16-point kernel below
2229    mova [rsp+gprsize+3*16], m0
2230    mova [rsp+gprsize+4*16], m1
2231    mova [rsp+gprsize+5*16], m2
2232    mova [rsp+gprsize+6*16], m3
2233    mova [rsp+gprsize+7*16], m4
2234    mova [rsp+gprsize+8*16], m5
2235    mova [rsp+gprsize+9*16], m6
2236    ; m7 is already stored in [rsp+gprsize+0*16]
    ; remaining inputs: slots 4/12, 5/13, 6/14, 7/15
2237    mova                 m0, [cq+ 4*16]
2238    mova                 m1, [cq+12*16]
2239    mova                 m2, [cq+ 5*16]
2240    mova                 m3, [cq+13*16]
2241    mova                 m4, [cq+ 6*16]
2242    mova                 m5, [cq+14*16]
2243    mova                 m6, [cq+ 7*16]
2244    mova                 m7, [cq+15*16]
2245    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
2246
2247    ; out0-7 is in rsp+gprsize+3-10*mmsize
2248    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
2249
2250%if ARCH_X86_64
    ; m8 = rounding multiplier, m10 = pixel maximum, m9 = zero; the
    ; original dstq is kept in r6
2251    mova                 m8, [o(pw_2048)]
2252    mova                m10, [o(pixel_10bpc_max)]
2253    pxor                 m9, m9
2254    mov                  r6, dstq
2255%else
    ; x86-32: the original dstq is kept on the stack
2256    mov [rsp+16*16+gprsize*1], dstq
2257%endif
    ; write out8-15 first, to rows 8-15
2258    lea                  r3, [strideq*3]
2259    lea                dstq, [dstq+strideq*8]
2260    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
    ; clear 32 * 16 bytes of the coefficient buffer
2261%if ARCH_X86_64
2262%define mzero m9
2263%else
2264%define mzero m7
2265    pxor                 m7, m7
2266%endif
2267    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2268                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
2269%undef mzero
    ; reload out0-7 from the stack and write them to rows 0-7
2270    mova                 m0, [rsp+gprsize+ 3*16]
2271    mova                 m1, [rsp+gprsize+ 4*16]
2272    mova                 m2, [rsp+gprsize+ 5*16]
2273    mova                 m3, [rsp+gprsize+ 6*16]
2274    mova                 m4, [rsp+gprsize+ 7*16]
2275    mova                 m5, [rsp+gprsize+ 8*16]
2276    mova                 m6, [rsp+gprsize+ 9*16]
2277    mova                 m7, [rsp+gprsize+10*16]
2278%if ARCH_X86_64
2279    mov                dstq, r6
2280%else
2281    mov                dstq, [rsp+16*16+gprsize*1]
2282%endif
2283    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
2284    RET
2285
; 8x16 entry points whose first-pass transform is adst. The third argument
; selects the eob table suffix (tbl_8x16_v here, tbl_8x16_2d by default).
2286INV_TXFM_8X16_FN adst, dct
2287INV_TXFM_8X16_FN adst, adst
2288INV_TXFM_8X16_FN adst, flipadst
2289INV_TXFM_8X16_FN adst, identity, v
2290
; iadst_8x16_internal_16bpc
;   entry:  sets t0 to the 8x8 adst pass-1 kernel and joins the shared 8x16
;           pass-1 driver
;   .pass2: 16-point adst via the 8bpc SSSE3 kernels, then output as two 8x8
;           halves with alternating-sign rounding; also the target of the
;           flipadst 8x16 pass 2
2291cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; same register saves as the idct 8x16 entry; the shared driver undoes
    ; them before jumping to tx2q
2292%if WIN64
2293    PUSH                 r7
2294%elif ARCH_X86_32
2295    mov [rsp+16*16+gprsize*1], r1
2296    mov [rsp+16*16+gprsize*2], r6
2297%endif
2298    lea                  t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
2299    jmp m(idct_8x16_internal_16bpc).pass1_full
2300
2301.pass2:
2302%if ARCH_X86_32
    ; r5 = itx8_start, presumably the base the o() macro of the 8bpc code
    ; addresses its constants from -- TODO confirm against the 8bpc source
2303    lea                  r5, [o(itx8_start)]
2304%endif
    ; Arrange the sixteen input rows for the 8bpc 16-point adst kernel: eight
    ; go to stack slots 3-10, eight stay in m0-m7. m0-m3 are still live from
    ; the last pass-1 iteration (coefficient slots 0/4/8/12). The slot
    ; assignment follows that kernel's input contract, which is outside this
    ; excerpt.
2305    mova                 m4, [cq+ 9*16]
2306    mova                 m5, [cq+13*16]
2307    mova [rsp+gprsize+7*16], m0
2308    mova [rsp+gprsize+8*16], m1
2309    mova [rsp+gprsize+5*16], m4
2310    mova [rsp+gprsize+6*16], m5
2311    mova                 m0, m2
2312    mova                 m1, m3
2313    mova                 m2, [cq+ 1*16]
2314    mova                 m3, [cq+ 5*16]
2315    mova                 m4, [cq+ 2*16]
2316    mova                 m5, [cq+ 6*16]
2317    mova                 m6, [cq+11*16]
2318    mova                 m7, [cq+15*16]
2319    mova [rsp+gprsize+ 3*16], m4
2320    mova [rsp+gprsize+ 4*16], m5
2321    mova [rsp+gprsize+ 9*16], m6
2322    mova [rsp+gprsize+10*16], m7
2323    mova                 m4, [cq+10*16]
2324    mova                 m5, [cq+14*16]
2325    mova                 m6, [cq+ 3*16]
2326    mova                 m7, [cq+ 7*16]
2327    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
2328    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
2329
2330%if ARCH_X86_64
    ; m11/m8 = negative/positive rounding multipliers, m10 = pixel maximum,
    ; m9 = zero; the original dstq is kept in r6
2331    mova                m11, [o(pw_m2048)]
2332    mova                 m8, [o(pw_2048)]
2333    mova                m10, [o(pixel_10bpc_max)]
2334    pxor                 m9, m9
2335    mov                  r6, dstq
2336%else
    ; x86-32: the original dstq is kept on the stack
2337    mov [rsp+16*16+gprsize*1], dstq
2338%endif
    ; first write the eight rows held in m0-m6 plus the stack slot that
    ; .round2 reads, to rows 8-15
2339    lea                  r3, [strideq*3]
2340    lea                dstq, [dstq+strideq*8]
2341    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
    ; clear 32 * 16 bytes of the coefficient buffer
2342%if ARCH_X86_64
2343%define mzero m9
2344%else
2345%define mzero m7
2346    pxor                 m7, m7
2347%endif
2348    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
2349                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
2350%undef mzero
    ; reload the other eight rows from stack slots 3-10 and write rows 0-7
2351    mova                 m0, [rsp+gprsize+ 3*16]
2352    mova                 m1, [rsp+gprsize+ 4*16]
2353    mova                 m2, [rsp+gprsize+ 5*16]
2354    mova                 m3, [rsp+gprsize+ 6*16]
2355    mova                 m4, [rsp+gprsize+ 7*16]
2356    mova                 m5, [rsp+gprsize+ 8*16]
2357    mova                 m6, [rsp+gprsize+ 9*16]
2358    mova                 m7, [rsp+gprsize+10*16]
2359%if ARCH_X86_64
2360    mov                dstq, r6
2361%else
2362    mov                dstq, [rsp+16*16+gprsize*1]
2363%endif
2364    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
2365    RET
2366
; 8x16 entry points whose first-pass transform is flipadst. The third argument
; selects the eob table suffix (tbl_8x16_v here, tbl_8x16_2d by default).
2367INV_TXFM_8X16_FN flipadst, dct
2368INV_TXFM_8X16_FN flipadst, adst
2369INV_TXFM_8X16_FN flipadst, flipadst
2370INV_TXFM_8X16_FN flipadst, identity, v
2371
; iflipadst_8x16_internal_16bpc
;   entry:  sets t0 to the 8x8 flipadst pass-1 kernel and joins the shared
;           8x16 pass-1 driver
;   .pass2: adst 8x16 pass 2, written through a negated stride from row 15
2372cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; same register saves as the idct 8x16 entry; the shared driver undoes
    ; them before jumping to tx2q
2373%if WIN64
2374    PUSH                 r7
2375%elif ARCH_X86_32
2376    mov [rsp+16*16+gprsize*1], r1
2377    mov [rsp+16*16+gprsize*2], r6
2378%endif
2379    lea                  t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
2380    jmp m(idct_8x16_internal_16bpc).pass1_full
2381
2382.pass2:
    ; r3 = stride*3, then r3 = r3*5 = stride*15; dstq now points at the
    ; last row and the stride is negated
2383    lea                  r3, [strideq*3]
2384    lea                  r3, [r3*5]
2385    add                dstq, r3
2386    neg             strideq
2387    jmp m(iadst_8x16_internal_16bpc).pass2
2388
; 8x16 entry points whose first-pass transform is identity. The third argument
; selects the eob table suffix (tbl_8x16_h here, tbl_8x16_2d by default).
2389INV_TXFM_8X16_FN identity, dct, h
2390INV_TXFM_8X16_FN identity, adst, h
2391INV_TXFM_8X16_FN identity, flipadst, h
2392INV_TXFM_8X16_FN identity, identity
2393
; iidentity_8x16_internal_16bpc
;   entry:  pass 1 uses the shared 8x16 driver with pack_and_transpose as the
;           kernel, i.e. only the driver's rect2_mul scaling, packing and
;           transposition are applied
;   .pass2: four iterations, each scaling and writing one 8x4 strip
;   .main:  identity scaling of m0-m3 (see the formula at the label)
2394cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; same register saves as the idct 8x16 entry; the shared driver undoes
    ; them before jumping to tx2q
2395%if WIN64
2396    PUSH                 r7
2397%elif ARCH_X86_32
2398    mov [rsp+16*16+gprsize*1], r1
2399    mov [rsp+16*16+gprsize*2], r6
2400%endif
2401    lea                  t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
2402    jmp m(idct_8x16_internal_16bpc).pass1_full
2403
2404.pass2:
2405%if ARCH_X86_64
    ; constants for the loop: m4 = pw_2048, m5 = pixel maximum, m6 = zero,
    ; m7 = pw_1697x16 (m4-m6 are presumably what round1_and_write_8x4
    ; expects -- that helper is outside this excerpt)
2406    mova                 m4, [o(pw_2048)]
2407    mova                 m5, [o(pixel_10bpc_max)]
2408    pxor                 m6, m6
2409    mova                 m7, [o(pw_1697x16)]
2410%endif
    ; r5d = number of 8x4 strips, r3 = stride * 3
2411    mov                 r5d, 4
2412    lea                  r3, [strideq*3]
2413.pass2_loop:
    ; m0-m3 hold the current strip (on the first iteration they are still
    ; live from pass 1)
2414    call .main
2415%if ARCH_X86_64
2416    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
2417%else
2418    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
2419%endif
    ; clear the eight 16-byte slots belonging to this strip; m6 must be zero
    ; here (on x86-32 .main overwrites m6, so presumably
    ; round2_and_write_8x4 leaves it zeroed -- TODO confirm)
2420    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
2421    dec                 r5d
2422    jle .end
    ; advance to the next strip: next 16-byte slot, four rows further down
2423    add                  cq, 16
2424    lea                dstq, [dstq+strideq*4]
2425    mova                 m0, [cq+ 0*16]
2426    mova                 m1, [cq+ 4*16]
2427    mova                 m2, [cq+ 8*16]
2428    mova                 m3, [cq+12*16]
2429    jmp .pass2_loop
2430.end:
2431    RET
2432.main:
2433    ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
2434%if ARCH_X86_32
    ; x86-32: y lives in m4-m7, with m7 doubling as the constant until the
    ; last multiply
2435    mova                 m7, [o(pw_1697x16)]
2436    pmulhrsw             m4, m7, m0
2437    pmulhrsw             m5, m7, m1
2438    pmulhrsw             m6, m7, m2
2439    pmulhrsw             m7, m3
2440%else
    ; x86-64: y lives in m8-m11; m7 keeps the constant loaded at .pass2
2441    pmulhrsw             m8, m7, m0
2442    pmulhrsw             m9, m7, m1
2443    pmulhrsw            m10, m7, m2
2444    pmulhrsw            m11, m7, m3
2445%endif
2446    REPX      {paddsw x, x}, m0, m1, m2, m3
2447%if ARCH_X86_64
2448    paddsw               m0, m8
2449    paddsw               m1, m9
2450    paddsw               m2, m10
2451    paddsw               m3, m11
2452%else
2453    paddsw               m0, m4
2454    paddsw               m1, m5
2455    paddsw               m2, m6
2456    paddsw               m3, m7
2457%endif
2458    ret
2459
; INV_TXFM_16X4_FN type1, type2
; Instantiates one 16x4 inverse-transform entry point through INV_TXFM_FN
; (defined outside this excerpt), with per-architecture register-count and
; stack-size arguments. For the dct_dct instantiation it also emits the
; DC-only path below, which adds one value to every pixel of the block,
; one 16-pixel row per loop iteration.
2460%macro INV_TXFM_16X4_FN 2 ; type1, type2
2461%if ARCH_X86_64
2462    INV_TXFM_FN          %1, %2, 0, 16x4, 16, 0-8*16
2463%else
2464    INV_TXFM_FN          %1, %2, 0, 16x4, 8, 0-12*16
2465%endif
2466%ifidn %1_%2, dct_dct
    ; r5d = coefficient 0 * 181, then overwrite that coefficient with eobd
    ; (the original note "0" indicates eob is zero on this path)
2467    imul                r5d, [cq], 181
2468    mov                [cq], eobd ; 0
    ; r3d = number of rows
2469    mov                 r3d, 4
2470.dconly:
    ; r5d = (r5d + 384) >> 9
2471    add                 r5d, 384
2472    sar                 r5d, 9
2473.dconly2:
    ; r5d = r5d * 2896 + 34816; the pshuflw below keeps bits 16-31 of it
2474    imul                r5d, 2896
2475    add                 r5d, 34816
2476    movd                 m0, r5d
    ; broadcast word 1 of m0 (the high half of r5d) to all eight words
2477    pshuflw              m0, m0, q1111
2478    punpcklqdq           m0, m0
    ; m3 = upper clamp, m4 = 0 (lower clamp)
2479    mova                 m3, [o(pixel_10bpc_max)]
2480    pxor                 m4, m4
2481.loop:
    ; one row = two 16-byte loads; add the DC value, clamp, store
2482    mova                 m1, [dstq+ 0]
2483    mova                 m2, [dstq+16]
2484    REPX     {paddw  x, m0}, m1, m2
2485    REPX     {pminsw x, m3}, m1, m2
2486    REPX     {pmaxsw x, m4}, m1, m2
2487    mova          [dstq+ 0], m1
2488    mova          [dstq+16], m2
2489    add                dstq, strideq
2490    dec                 r3d
2491    jg .loop
2492    RET
2493%endif
2494%endmacro
2495
2496INV_TXFM_16X4_FN dct, dct
2497INV_TXFM_16X4_FN dct, identity
2498INV_TXFM_16X4_FN dct, adst
2499INV_TXFM_16X4_FN dct, flipadst
2500
2501cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2502%if ARCH_X86_64
2503    mova                m11, [o(pd_2048)]
2504    mova                m12, [o(clip_18b_min)]
2505    mova                m13, [o(clip_18b_max)]
2506    mova                m14, [o(pd_2896)]
2507%endif
2508    ; setup stack pointer
2509    lea                  r3, [rsp+gprsize]
2510
2511    mova                 m0, [cq+ 1*16]
2512    mova                 m1, [cq+ 3*16]
2513    mova                 m2, [cq+ 5*16]
2514    mova                 m3, [cq+ 7*16]
2515    mova                 m4, [cq+ 9*16]
2516    mova                 m5, [cq+11*16]
2517    mova                 m6, [cq+13*16]
2518    mova                 m7, [cq+15*16]
2519    call .main_oddhalf
2520    mova                 m0, [cq+ 0*16]
2521    mova                 m1, [cq+ 2*16]
2522    mova                 m2, [cq+ 4*16]
2523    mova                 m3, [cq+ 6*16]
2524    mova                 m4, [cq+ 8*16]
2525    mova                 m5, [cq+10*16]
2526    mova                 m6, [cq+12*16]
2527    mova                 m7, [cq+14*16]
2528    call m(idct_8x4_internal_16bpc).main_pass1
2529    call m(idct_8x4_internal_16bpc).round
2530    ; t0-7 is in m0-7
2531
2532    call .round
2533
2534%if ARCH_X86_64
2535.pack_transpose:
2536    ; transpose in two parts
2537    packssdw             m0, m1
2538    packssdw             m2, m3
2539    packssdw             m4, m5
2540    packssdw             m6, m7
2541    packssdw             m8, m9
2542    packssdw            m10, m11
2543    packssdw            m12, m13
2544    packssdw            m14, m15
2545.transpose:
2546    call m(idct_8x4_internal_16bpc).transpose4x8packed
2547    call .transpose4x8packed_hi
2548%else
2549    call m(idct_8x4_internal_16bpc).transpose4x8packed
2550    mova          [r3+0*16], m0
2551    mova          [r3+1*16], m1
2552    mova          [r3+2*16], m2
2553    mova          [r3+3*16], m3
2554    mova                 m0, [r3+ 8*16]
2555    mova                 m2, [r3+ 9*16]
2556    mova                 m4, [r3+10*16]
2557    mova                 m6, [r3+11*16]
2558    call m(idct_8x4_internal_16bpc).transpose4x8packed
2559%endif
2560    jmp                tx2q
2561%if ARCH_X86_64
2562.transpose4x8packed_hi:
2563    punpcklwd            m9, m10, m14
2564    punpckhwd           m10, m14
2565    punpckhwd           m14, m8, m12
2566    punpcklwd            m8, m12
2567
2568    punpckhwd           m11, m8, m9
2569    punpcklwd            m8, m9
2570    punpckhwd           m12, m14, m10
2571    punpcklwd           m14, m10
2572
2573    punpcklwd           m10, m11, m12
2574    punpckhwd           m11, m12
2575    punpckhwd            m9, m8, m14
2576    punpcklwd            m8, m14
2577    ret
2578%endif
2579.main_oddhalf_fast: ; lower half zero
2580    pmulld               m7, m0, [o(pd_4076)]
2581    pmulld               m0, [o(pd_401)]
2582    pmulld               m6, m1, [o(pd_m1189)]
2583    pmulld               m1, [o(pd_3920)]
2584%if ARCH_X86_32
2585    mova                 m4, [o(pd_2048)]
2586    REPX      {paddd x, m4}, m1, m6
2587    REPX      {psrad x, 12}, m1, m6
2588    mova          [r3+1*16], m1
2589%endif
2590    pmulld               m5, m2, [o(pd_3612)]
2591    pmulld               m2, [o(pd_1931)]
2592%if ARCH_X86_32
2593    pmulld               m1, m3, [o(pd_m2598)]
2594%else
2595    pmulld               m4, m3, [o(pd_m2598)]
2596%endif
2597    pmulld               m3, [o(pd_3166)]
2598    jmp .main_oddhalf_fast2
2599.main_oddhalf:
2600%if ARCH_X86_64
2601    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  401, 4076 ; t8a,  t15a
2602    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
2603    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
2604    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3166, 2598 ; t9a,  t14a
2605.main_oddhalf_fast2:
2606    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
2607    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
2608    psubd                m8, m0, m4 ; t9
2609    paddd                m0, m4     ; t8
2610    psubd                m4, m6, m2 ; t10
2611    paddd                m2, m6     ; t11
2612    psubd                m6, m1, m5 ; t13
2613    paddd                m5, m1     ; t12
2614    psubd                m1, m7, m3 ; t14
2615    paddd                m7, m3     ; t15
2616    REPX    {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
2617    REPX    {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
2618    mova                m15, [o(pd_3784)]
2619    mova                m10, [o(pd_1567)]
2620    ITX_MULSUB_2D         1, 8, 3, 9, _, 11, 10, 15
2621    ITX_MULSUB_2D         6, 4, 3, 9, _, 11, 10, 15, 4
2622    psubd                m3, m1, m4 ; t10
2623    paddd                m1, m4     ; t9
2624    psubd                m4, m0, m2 ; t11a
2625    paddd                m0, m2     ; t8a
2626    psubd                m2, m8, m6 ; t13
2627    paddd                m6, m8     ; t14
2628    psubd                m8, m7, m5 ; t12a
2629    paddd                m7, m5     ; t15a
2630    REPX    {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
2631    REPX    {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
2632    REPX    {pmulld x, m14}, m2, m8, m3, m4
2633    paddd                m2, m11
2634    paddd                m8, m11
2635    paddd                m5, m2, m3 ; t13a
2636    psubd                m2, m3     ; t10a
2637    psubd                m3, m8, m4 ; t11
2638    paddd                m4, m8     ; t12
2639    REPX      {psrad x, 12}, m5, m2, m3, m4
2640    mova          [r3+0*16], m0
2641    mova          [r3+1*16], m1
2642    mova          [r3+2*16], m2
2643    mova          [r3+3*16], m3
2644    mova          [r3+4*16], m4
2645    mova          [r3+5*16], m5
2646    mova          [r3+6*16], m6
2647    mova          [r3+7*16], m7
2648%else
2649    mova          [r3+0*16], m2
2650    mova          [r3+1*16], m3
2651    mova          [r3+2*16], m4
2652    mova          [r3+3*16], m5
2653    mova                 m4, [o(pd_2048)]
2654
2655    ITX_MULSUB_2D         0, 7, 2, 3, 5, _,  401, 4076 ; t8a,  t15a
2656    ITX_MULSUB_2D         6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
2657
2658    mova                 m2, [r3+0*16]
2659    mova                 m3, [r3+1*16]
2660    mova          [r3+0*16], m0
2661    mova          [r3+1*16], m1
2662    mova                 m1, [r3+2*16]
2663    mova                 m5, [r3+3*16]
2664    mova          [r3+2*16], m6
2665    mova          [r3+3*16], m7
2666
2667    ITX_MULSUB_2D         2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
2668    ITX_MULSUB_2D         1, 3, 0, 6, 7, _, 3166, 2598 ; t9a,  t14a
2669
2670    mova                 m0, [r3+0*16]
2671    mova                 m6, [r3+2*16]
2672    mova                 m7, [r3+3*16]
2673.main_oddhalf_fast2:
2674    REPX      {paddd x, m4}, m0, m7, m2, m5, m1, m3
2675    REPX      {psrad x, 12}, m0, m7, m2, m5, m1, m3
2676    psubd                m4, m0, m1 ; t9
2677    paddd                m0, m1     ; t8
2678    mova                 m1, [r3+1*16]
2679    mova          [r3+0*16], m4
2680    psubd                m4, m6, m2 ; t10
2681    paddd                m2, m6     ; t11
2682    psubd                m6, m1, m5 ; t13
2683    paddd                m5, m1     ; t12
2684    psubd                m1, m7, m3 ; t14
2685    paddd                m7, m3     ; t15
2686    mova                 m3, [o(clip_18b_min)]
2687    REPX     {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
2688    pmaxsd               m3, [r3+0*16]
2689    mova          [r3+0*16], m3
2690    mova                 m3, [o(clip_18b_max)]
2691    REPX     {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
2692    pminsd               m3, [r3+0*16]
2693    mova          [r3+0*16], m0
2694    mova          [r3+1*16], m2
2695    mova          [r3+2*16], m5
2696    mova          [r3+3*16], m7
2697    mova                m7, [o(pd_2048)]
2698    ITX_MULSUB_2D         1, 3, 0, 2, 5, 7, 1567, 3784
2699    ITX_MULSUB_2D         6, 4, 0, 2, _, 7,    5, 3784, 4
2700    mova                 m0, [r3+0*16]
2701    mova                 m2, [r3+1*16]
2702    psubd                m5, m1, m4 ; t10
2703    mova          [r3+1*16], m5
2704    paddd                m1, m4     ; t9
2705    psubd                m4, m0, m2 ; t11a
2706    paddd                m0, m2     ; t8a
2707    mova                 m5, [r3+2*16]
2708    mova                 m7, [r3+3*16]
2709    psubd                m2, m3, m6 ; t13
2710    paddd                m6, m3     ; t14
2711    paddd                m3, m7, m5 ; t15a
2712    psubd                m7, m5     ; t12a
2713    mova          [r3+0*16], m3
2714    mova                 m3, [r3+1*16]
2715    mova                 m5, [o(clip_18b_min)]
2716    REPX     {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
2717    pmaxsd               m5, [r3+0*16]
2718    mova          [r3+0*16], m5
2719    mova                 m5, [o(clip_18b_max)]
2720    REPX     {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
2721    pminsd               m5, [r3+0*16]
2722    mova          [r3+0*16], m5
2723    mova                 m5, [o(pd_2896)]
2724    REPX     {pmulld x, m5}, m2, m7, m3, m4
2725    mova                 m5, [o(pd_2048)]
2726    REPX     {paddd  x, m5}, m2, m7
2727    paddd                m5, m2, m3 ; t13a
2728    psubd                m2, m3     ; t10a
2729    psubd                m3, m7, m4 ; t11
2730    paddd                m4, m7     ; t12
2731    REPX      {psrad x, 12}, m5, m2, m3, m4
2732    mova                 m7, [r3+0*16]
2733    mova         [r3+11*16], m0
2734    mova         [r3+10*16], m1
2735    mova          [r3+9*16], m2
2736    mova          [r3+8*16], m3
2737    mova          [r3+7*16], m4
2738    mova          [r3+6*16], m5
2739    mova          [r3+5*16], m6
2740    mova          [r3+4*16], m7
2741%endif
2742    ret
; .round - last butterfly stage of the 16-point transform plus (x + 1) >> 1.
; In:  m0-m7 hold eight inputs; their eight partner values are read from the
;      scratch area at r3 ([r3+0*16..7*16] on x86-64, [r3+4*16..11*16] on
;      x86-32).
; Out: x86-64: out0-15 as dwords in m0-m15.
;      x86-32: word-packed pairs; out8-15 in m0/m2/m4/m6 and out0-7 in
;      [r3+8*16..11*16]. [r3+0*16..2*16] are used as spill slots.
2743.round:
2744%if ARCH_X86_64
; clamp m0-m7 to [m12, m13]
; NOTE(review): m12/m13 are loaded outside this routine; presumably
; clip_18b_min/clip_18b_max as in the x86-32 branch - confirm
2745    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
2746    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
; m8 = -1 in every lane; subtracting it adds the rounding bias of 1. The bias
; is only added to m0-m7 and so ends up in both the sums and the differences.
2747    pcmpeqd              m8, m8
2748    REPX      {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
; m8-m14 are reloaded with scratch slots 1-7, overwriting the clamp bounds
2749    mova                 m8, [r3+1*16]
2750    mova                 m9, [r3+2*16]
2751    mova                m10, [r3+3*16]
2752    mova                m11, [r3+4*16]
2753    mova                m12, [r3+5*16]
2754    mova                m13, [r3+6*16]
2755    mova                m14, [r3+7*16]
; out[i] = m[i] + slot[7-i], out[15-i] = m[i] - slot[7-i]
2756    psubd               m15, m0, m14       ; out15
2757    paddd                m0, m14           ; out0
2758    psubd               m14, m1, m13       ; out14
2759    paddd                m1, m13           ; out1
2760    psubd               m13, m2, m12       ; out13
2761    paddd                m2, m12           ; out2
2762    psubd               m12, m3, m11       ; out12
2763    paddd                m3, m11           ; out3
2764    psubd               m11, m4, m10       ; out11
2765    paddd                m4, m10           ; out4
2766    psubd               m10, m5, m9        ; out10
2767    paddd                m5, m9            ; out5
2768    psubd                m9, m6, m8        ; out9
2769    paddd                m6, m8            ; out6
2770    psubd                m8, m7, [r3+0*16] ; out8
2771    paddd                m7, [r3+0*16]     ; out7
2772    REPX       {psrad x, 1}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
2773                             m8,  m9,  m10, m11, m12, m13, m14, m15
2774    ; and out0-15 is now in m0-15
2775%else
; Only eight registers: m0 is spilled so that the register holding the clamp
; constant can itself become the clamped m0 (pmaxsd const, [spilled m0]).
; The same trick is repeated with m7 for the upper bound.
2776    mova         [r3+ 0*16], m0
2777    mova                 m0, [o(clip_18b_min)]
2778    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
2779    pmaxsd               m0, [r3+ 0*16]
2780    mova         [r3+ 0*16], m7
2781    mova                 m7, [o(clip_18b_max)]
2782    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
2783    pminsd               m7, [r3+ 0*16]
; m0 = -1 per lane; psubd adds the rounding bias of 1 to m1-m7, and to the
; spilled m0 a few lines further down
2784    mova         [r3+ 0*16], m0
2785    pcmpeqd              m0, m0
2786    REPX      {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
2787    mova         [r3+ 1*16], m1
2788    mova         [r3+ 2*16], m2
2789    mova                 m1, [r3+ 0*16]
2790    psubd                m1, m0
2791    mova         [r3+ 0*16], m1
; m7/m6 against slots 11/10
2792    mova                 m1, [r3+11*16]
2793    mova                 m2, [r3+10*16]
2794    psubd                m0, m7, m1
2795    paddd                m7, m1
2796    psubd                m1, m6, m2
2797    paddd                m6, m2
2798    REPX       {psrad x, 1}, m0, m1, m6, m7
2799    packssdw             m0, m1     ; out8-9
2800    packssdw             m6, m7     ; out6-7
2801    mova         [r3+11*16], m6
; m5/m4 against slots 9/8
2802    mova                 m1, [r3+9*16]
2803    mova                 m7, [r3+8*16]
2804    psubd                m2, m5, m1
2805    paddd                m5, m1
2806    psubd                m1, m4, m7
2807    paddd                m4, m7
2808    REPX       {psrad x, 1}, m2, m1, m4, m5
2809    packssdw             m2, m1     ; out10-11
2810    packssdw             m4, m5     ; out4-5
; m3 and the spilled m2 (slot 2) against slots 7/6
2811    mova                 m1, [r3+2*16]
2812    mova         [r3+10*16], m4
2813    mova                 m6, [r3+7*16]
2814    mova                 m7, [r3+6*16]
2815    psubd                m4, m3, m6
2816    paddd                m3, m6
2817    psubd                m6, m1, m7
2818    paddd                m1, m7
2819    REPX       {psrad x, 1}, m4, m6, m1, m3
2820    packssdw             m4, m6     ; out12-13
2821    packssdw             m1, m3     ; out2-3
; the spilled m1 (slot 1) and m0 (slot 0) against slots 5/4
2822    mova                 m3, [r3+1*16]
2823    mova          [r3+9*16], m1
2824    mova                 m1, [r3+0*16]
2825    mova                 m5, [r3+5*16]
2826    mova                 m7, [r3+4*16]
2827    psubd                m6, m3, m5
2828    paddd                m3, m5
2829    psubd                m5, m1, m7
2830    paddd                m1, m7
2831    REPX       {psrad x, 1}, m6, m5, m1, m3
2832    packssdw             m6, m5     ; out14-15
2833    packssdw             m1, m3     ; out0-1
2834    mova          [r3+8*16], m1
2835%endif
2836    ret
2837
; .pass2 - second pass of the 16x4 transforms, run as two 8x4 halves.
; r4 holds the address of the 8bpc 8x4 kernel that is called indirectly.
; .pass2_loop is the shared entry point: the iadst/iflipadst .pass2 stubs
; jump here after loading their own kernel address into r4.
2838.pass2:
2839    lea                  r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
2840.pass2_loop:
2841    lea                  r3, [strideq*3]                   ; r3 = 3 * stride
2842%if ARCH_X86_32
; NOTE(review): r5 presumably serves as the constant base expected by the
; 8bpc kernel on x86-32 - confirm against the definition of itx8_start
2843    lea                  r5, [o(itx8_start)]
2844%endif
2845    call                 r4
2846    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
; store m6 over all 16*16 bytes of the coefficient buffer
; NOTE(review): presumably m6 is zero on return from round2_and_write_8x4,
; which would make this the usual coefficient clear - confirm
2847    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; fetch the inputs of the second half: kept in m8-m11 on x86-64, and in the
; first four stack slots on x86-32
2848%if ARCH_X86_64
2849    mova                 m0, m8
2850    mova                 m1, m9
2851    mova                 m2, m10
2852    mova                 m3, m11
2853%else
2854    mova                 m0, [rsp+gprsize+0*16]
2855    mova                 m1, [rsp+gprsize+1*16]
2856    mova                 m2, [rsp+gprsize+2*16]
2857    mova                 m3, [rsp+gprsize+3*16]
2858%endif
2859    add                dstq, 16                                ; advance dst by 16 bytes
2860%if ARCH_X86_32
2861    lea                  r5, [o(itx8_start)]
2862%endif
2863    call                 r4
2864    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
2865    RET
2866
; Instantiate INV_TXFM_16X4_FN with adst as the first type, once for each of
; the four second types.
; NOTE(review): the macro is defined outside this chunk; presumably it emits
; the public entry points that dispatch into iadst_16x4_internal_16bpc and
; the matching second-pass routine - confirm
2867INV_TXFM_16X4_FN adst, dct
2868INV_TXFM_16X4_FN adst, adst
2869INV_TXFM_16X4_FN adst, flipadst
2870INV_TXFM_16X4_FN adst, identity
2871
; iadst_16x4_internal_16bpc - first pass of the 16x4 inverse ADST.
; Points r3 at the stack scratch area, runs .main (which falls through into
; .round), then hands the result to the transpose code:
;   x86-64: tail-jumps to the idct_16x4 .pack_transpose label.
;   x86-32: transposes the packed registers m0/m2/m4/m6 first and parks the
;           result in stack slots 0-3, then reloads the packed rows that
;           .round left in slots 8-11, transposes those and jumps to tx2q.
2872cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
2873    ; setup stack pointer
2874    lea                  r3, [rsp+gprsize]
2875    call .main
2876%if ARCH_X86_64
2877    jmp m(idct_16x4_internal_16bpc).pack_transpose
2878%else
2879    call m(idct_8x4_internal_16bpc).transpose4x8packed
2880    mova [rsp+gprsize+0*16], m0
2881    mova [rsp+gprsize+1*16], m1
2882    mova [rsp+gprsize+2*16], m2
2883    mova [rsp+gprsize+3*16], m3
2884    mova                 m0, [rsp+gprsize+ 8*16]
2885    mova                 m2, [rsp+gprsize+ 9*16]
2886    mova                 m4, [rsp+gprsize+10*16]
2887    mova                 m6, [rsp+gprsize+11*16]
2888    call m(idct_8x4_internal_16bpc).transpose4x8packed
2889    jmp                tx2q
2890%endif
2891
; .main - 16-point inverse ADST on the 16 coefficient vectors at cq.
; Rows 2,13,6,9,10,5,14,1 go through .main_part1, which leaves its results in
; the scratch area at r3; rows 0,15,4,11,8,7,12,3 go through .main_part2,
; which combines them with those results. Execution then falls through into
; .round.
; On x86-64 the constants live in registers: m11 = pd_2048, m12/m13 = 18-bit
; clip bounds, m14 = pd_2896.
2892.main:
2893%if ARCH_X86_64
2894    mova                m11, [o(pd_2048)]
2895    mova                m12, [o(clip_18b_min)]
2896    mova                m13, [o(clip_18b_max)]
2897    mova                m14, [o(pd_2896)]
2898%endif
2899    mova                 m0, [cq+ 2*16]
2900    mova                 m1, [cq+13*16]
2901    mova                 m2, [cq+ 6*16]
2902    mova                 m3, [cq+ 9*16]
2903    mova                 m4, [cq+10*16]
2904    mova                 m5, [cq+ 5*16]
2905    mova                 m6, [cq+14*16]
2906    mova                 m7, [cq+ 1*16]
2907    call .main_part1
2908    mova                 m0, [cq+ 0*16]
2909    mova                 m1, [cq+15*16]
2910    mova                 m2, [cq+ 4*16]
2911    mova                 m3, [cq+11*16]
2912    mova                 m4, [cq+ 8*16]
2913    mova                 m5, [cq+ 7*16]
2914    mova                 m6, [cq+12*16]
2915    mova                 m7, [cq+ 3*16]
2916    call .main_part2
; .round - undo the sign of the negated outputs and apply the final rounding.
; Outputs that .main_part2 left multiplied by 2896 ("unshifted") get +6144
; and >> 13, i.e. the pending >> 12 (bias 2048) fused with (x + 1) >> 1
; (bias 4096 at that scale). Already-scaled outputs get +1 and >> 1.
; Negation is done with pxor -1 (~x = -x - 1), hence the 6143 bias and the
; extra +1 after the shift on those lanes.
; Out: x86-64: out0-15 as dwords in m0-m15.
;      x86-32: word-packed; out8-15 in m0/m2/m4/m6, out0-7 in
;      [r3+8*16..11*16].
2917.round:
2918%if ARCH_X86_64
2919    mova                m15, [o(pd_6144)]
; relies on m14 still holding pd_2896 as loaded in .main (2896 >> 11 == 1)
2920    psrld               m14, 11       ; pd_1
2921    pcmpeqd              m8, m8       ; -1
2922    psubd               m13, m15, m14 ; pd_6143
2923    REPX     {paddd x, m14}, m0, m2
2924    REPX     {paddd x, m15}, m4, m6
2925    REPX     {pxor  x, m8 }, m1, m3, m5, m7
2926    REPX     {psrad x, 1  }, m1, m3
2927    REPX     {paddd x, m15}, m5, m7
2928    REPX     {psubd x, m8 }, m1, m3
; m9-m12 hold out8, -out9, out10, -out11 (unshifted); each result moves down
; by one register so that outN lands in mN
2929    paddd                m8, m15, m9
2930    psubd                m9, m13, m10
2931    paddd               m10, m15, m11
2932    psubd               m11, m13, m12
; scratch slots 3,2,1,0 hold out12, -out13, out14, -out15
2933    paddd               m12, m14, [r3+3*16]
2934    psubd               m13, m14, [r3+2*16]
2935    psubd               m15, m14, [r3+0*16]
2936    paddd               m14, [r3+1*16]
2937    REPX      {psrad x, 1 }, m0,  m2,  m12, m13, m14, m15
2938    REPX      {psrad x, 13}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
2939%else
; m1/m3 (-out9/-out11, unshifted) are spilled to free m1 for the -1 constant
; and m3 for pd_6144
2940    mova          [r3+8*16], m1
2941    mova          [r3+9*16], m3
2942    mova                 m3, [o(pd_6144)]
2943    pcmpeqd              m1, m1
; out4-7 (m5/m7 arrive negated)
2944    REPX      {pxor  x, m1}, m5, m7
2945    REPX      {paddd x, m3}, m4, m5, m6, m7
2946    REPX      {psrad x, 13}, m4, m5, m6, m7
2947    packssdw             m4, m5
2948    packssdw             m6, m7
2949    mova         [r3+10*16], m4
2950    mova         [r3+11*16], m6
; out0-3 from slots 4-7 (slots 5/7 hold negated values)
2951    mova                 m4, [r3+4*16]
2952    mova                 m5, [r3+5*16]
2953    mova                 m6, [r3+6*16]
2954    mova                 m7, [r3+7*16]
2955    REPX      {pxor  x, m1}, m5, m7
2956    REPX      {psubd x, m1}, m4, m6
2957    REPX      {psrad x, 1 }, m4, m5, m6, m7
2958    REPX      {psubd x, m1}, m5, m7
2959    packssdw             m4, m5
2960    packssdw             m6, m7
; out8-11: reload the spilled -out9/-out11 and reuse slots 8/9 for out0-3
2961    mova                 m5, [r3+8*16]
2962    mova                 m7, [r3+9*16]
2963    mova          [r3+8*16], m4
2964    mova          [r3+9*16], m6
2965    REPX      {pxor  x, m1}, m5, m7
2966    REPX      {paddd x, m3}, m0, m5, m2, m7
2967    REPX      {psrad x, 13}, m0, m5, m2, m7
2968    packssdw             m0, m5
2969    packssdw             m2, m7
; out12-15 from slots 0-3 (slots 1/3 hold negated values)
2970    mova                 m4, [r3+0*16]
2971    mova                 m5, [r3+1*16]
2972    mova                 m6, [r3+2*16]
2973    mova                 m7, [r3+3*16]
2974    REPX      {psubd x, m1}, m4, m6
2975    REPX      {pxor  x, m1}, m5, m7
2976    REPX      {psrad x, 1 }, m4, m5, m6, m7
2977    REPX      {psubd x, m1}, m5, m7
2978    packssdw             m4, m5
2979    packssdw             m6, m7
2980%endif
2981    ret
2982
; .main_part2 - second half of the 16-point inverse ADST.
; In:  m0-m7 = coefficient rows 0,15,4,11,8,7,12,3 (loaded by .main) and the
;      eight intermediates that .main_part1 left at [r3+0*16..7*16].
;      x86-64 additionally expects m11 = pd_2048, m12/m13 = clip bounds and
;      m14 = pd_2896, as loaded by .main.
; Out: odd-numbered outputs are produced negated; out4-11 are still
;      multiplied by 2896 ("unshifted") and are scaled down in .round.
;      x86-64: m0..m3 = out0, -out1, out2, -out3; m4..m7 = out4, -out5, out6,
;              -out7 (unshifted); m9..m12 = out8, -out9, out10, -out11
;              (unshifted); [r3+0*16..3*16] = -out15, out14, -out13, out12.
;      x86-32: m0..m3 = out8, -out9, out10, -out11 (unshifted); m4..m7 =
;              out4, -out5, out6, -out7 (unshifted); [r3+0*16..3*16] =
;              out12, -out13, out14, -out15; [r3+4*16..7*16] = out0, -out1,
;              out2, -out3. [r3+8*16..11*16] are used as spill slots.
2983.main_part2:
2984%if ARCH_X86_64
2985    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  201, 4091
2986    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 1751, 3703
2987    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3035, 2751
2988    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 3857, 1380
2989    psubd                m8, m0, m4 ; t8a
2990    paddd                m0, m4     ; t0a
2991    psubd                m4, m1, m5 ; t9a
2992    paddd                m1, m5     ; t1a
2993    psubd                m5, m2, m6 ; t12a
2994    paddd                m2, m6     ; t4a
2995    psubd                m6, m3, m7 ; t13a
2996    paddd                m7, m3     ; t5a
; clamp every intermediate to [m12, m13] after each butterfly stage
2997    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
2998    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
; the multipliers are passed in registers here (m10 = pd_799, m15 = pd_4017)
2999    mova                m15, [o(pd_4017)]
3000    mova                m10, [o(pd_799)]
3001    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
3002    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
3003    psubd                m3, m0, m2 ; t4
3004    paddd                m0, m2     ; t0
3005    psubd                m2, m1, m7 ; t5
3006    paddd                m1, m7     ; t1
3007    psubd                m7, m4, m6 ; t12a
3008    paddd                m4, m6     ; t8a
3009    psubd                m6, m8, m5 ; t13a
3010    paddd                m5, m8     ; t9a
3011    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
3012    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
3013    mova                m15, [o(pd_3784)]
3014    mova                m10, [o(pd_1567)]
3015    ITX_MULSUB_2D         3, 2, 8, 9, _, 11, 10, 15
3016    ITX_MULSUB_2D         7, 6, 8, 9, _, 11, 10, 15
; merge with the intermediates that .main_part1 stored at r3
3017    mova                m10, [r3+0*16]      ;  t2
3018    mova                 m8, [r3+1*16]      ;  t3
3019    psubd                m9, m0, m10        ;  t2a
3020    paddd                m0, m10            ;  out0
3021    psubd               m10, m1, m8         ;  t3a
3022    paddd                m1, m8             ; -out15
3023    mova          [r3+0*16], m1
3024    mova                m15, [r3+3*16]      ;  t7a
3025    mova                 m1, [r3+2*16]      ;  t6a
3026    psubd                m8, m3, m15        ;  t7
3027    paddd               m15, m3             ;  out12
3028    paddd                m3, m2, m1         ; -out3
3029    psubd                m2, m1             ;  t6
3030    mova          [r3+3*16], m15
3031    mova          [r3+1*16], m2
3032    mova                 m1, [r3+7*16]      ;  t15
3033    mova                 m2, [r3+6*16]      ;  t14
3034    paddd               m15, m7, m1         ; -out13
3035    psubd                m7, m1             ;  t15a
; m11 (pd_2048) is no longer needed and is reused for t14a
3036    psubd               m11, m6, m2         ;  t14a
3037    paddd                m2, m6             ;  out2
3038    mova          [r3+2*16], m15
3039    mova                 m1, [r3+4*16]      ;  t10a
3040    mova                m15, [r3+5*16]      ;  t11a
3041    psubd                m6, m4, m1         ;  t10
3042    paddd                m1, m4             ; -out1
3043    psubd                m4, m5, m15        ;  t11
3044    paddd                m5, m15            ;  out14
3045    REPX    {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
; t6 was spilled to slot 1: clamp it straight into m12, which gives up the
; lower clip bound from here on; slot 1 is then reused for out14
3046    pmaxsd              m12, [r3+1*16]      ;  t6
3047    mova          [r3+1*16], m5
3048    REPX    {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
; final stage: multiply by 2896 (m14) now, rounding and shift follow in .round
3049    REPX    {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
3050    paddd                m5, m11, m7        ; -out5  (unshifted)
3051    psubd               m11, m7             ;  out10 (unshifted)
3052    paddd                m7, m9, m10        ; -out7  (unshifted)
3053    psubd                m9, m10            ;  out8  (unshifted)
3054    psubd               m10, m6, m4         ; -out9  (unshifted)
3055    paddd                m6, m4             ;  out6  (unshifted)
3056    paddd                m4, m12, m8        ;  out4  (unshifted)
3057    psubd               m12, m8             ; -out11 (unshifted)
3058%else
; Only eight registers: rows in m0-m3 are parked in slots 8-11 while the
; rotations of m4-m7 run, then the two groups swap places.
3059    mova          [r3+8*16], m0
3060    mova          [r3+9*16], m1
3061    mova         [r3+10*16], m2
3062    mova         [r3+11*16], m3
3063    mova                 m3, [o(pd_2048)]
3064    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3035, 2751
3065    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 3857, 1380
3066    mova                 m0, [r3+8*16]
3067    mova                 m1, [r3+9*16]
3068    mova          [r3+8*16], m4
3069    mova                 m4, [r3+10*16]
3070    mova          [r3+9*16], m5
3071    mova         [r3+10*16], m6
3072    mova                 m5, [r3+11*16]
3073    mova         [r3+11*16], m7
3074    ITX_MULSUB_2D         1, 0, 2, 6, 7, 3,  201, 4091
3075    ITX_MULSUB_2D         5, 4, 2, 6, 7, 3, 1751, 3703
3076    mova                 m2, [r3+8*16]
3077    mova                 m6, [r3+9*16]
3078    psubd                m3, m0, m2 ; t8a
3079    paddd                m0, m2     ; t0a
3080    mova          [r3+8*16], m3
3081    psubd                m2, m1, m6 ; t9a
3082    paddd                m1, m6     ; t1a
3083    mova                 m3, [r3+10*16]
3084    psubd                m6, m4, m3 ; t12a
3085    paddd                m4, m3     ; t4a
3086    mova                 m3, [r3+11*16]
3087    psubd                m7, m5, m3 ; t13a
3088    paddd                m5, m3     ; t5a
; clamp seven values in registers; the eighth (t8a in slot 8) is clamped by
; using the constant register as destination with the slot as source
3089    mova                 m3, [o(clip_18b_min)]
3090    REPX     {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
3091    pmaxsd               m3, [r3+8*16]
3092    mova          [r3+8*16], m3
3093    mova                 m3, [o(clip_18b_max)]
3094    REPX     {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
3095    pminsd               m3, [r3+8*16]
3096    mova          [r3+8*16], m3
3097    psubd                m3, m0, m4 ; t4
3098    paddd                m0, m4     ; t0
3099    psubd                m4, m1, m5 ; t5
3100    paddd                m1, m5     ; t1
3101    mova                 m5, [o(pd_2048)]
3102    mova          [r3+9*16], m1
3103    mova         [r3+10*16], m4
3104    mova         [r3+11*16], m3
3105    mova                 m3, [r3+8*16]
3106    mova          [r3+8*16], m0
3107    ITX_MULSUB_2D         3, 2, 0, 1, 4, 5,  799, 4017
; NOTE(review): the small last argument (4) appears to name register m4,
; still holding a multiplier from the previous invocation, rather than a
; literal coefficient - confirm against the ITX_MULSUB_2D definition
3108    ITX_MULSUB_2D         7, 6, 0, 1, 4, 5, 4017,    4
3109    psubd                m5, m2, m7 ; t12a
3110    paddd                m2, m7     ; t8a
3111    psubd                m7, m3, m6 ; t13a
3112    paddd                m6, m3     ; t9a
3113    mova                 m0, [r3+8*16]
3114    mova                 m1, [r3+9*16]
3115    mova                 m4, [r3+10*16]
; same clamp trick; the memory-resident value this time is t4 in slot 11
3116    mova                 m3, [o(clip_18b_min)]
3117    REPX     {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
3118    pmaxsd               m3, [r3+11*16]
3119    mova          [r3+8*16], m3
3120    mova                 m3, [o(clip_18b_max)]
3121    REPX     {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
3122    pminsd               m3, [r3+8*16]
3123    mova          [r3+8*16], m0
3124    mova          [r3+9*16], m1
3125    mova         [r3+10*16], m2
3126    mova         [r3+11*16], m6
3127    mova                 m0, [o(pd_2048)]
3128    ITX_MULSUB_2D         3, 4, 1, 2, 6, 0, 1567, 3784
; NOTE(review): as above, the 6 appears to reuse the multiplier left in m6
3129    ITX_MULSUB_2D         5, 7, 1, 2, 6, 0,    6, 3784
; merge with the intermediates that .main_part1 stored at [r3+0*16..7*16]
3130    mova                 m0, [r3+7*16]      ;  t7a
3131    mova                 m2, [r3+6*16]      ;  t6a
3132    psubd                m1, m3, m0         ;  t7
3133    paddd                m0, m3             ;  out12
3134    paddd                m3, m4, m2         ; -out3
3135    psubd                m4, m2             ;  t6
3136    mova          [r3+7*16], m3
3137    mova                 m3, [r3+3*16]      ;  t15
3138    mova                 m2, [r3+2*16]      ;  t14
3139    paddd                m6, m5, m3         ; -out13
3140    psubd                m5, m3             ;  t15a
3141    psubd                m3, m7, m2         ;  t14a
3142    paddd                m2, m7             ;  out2
3143    mova          [r3+6*16], m2
3144    mova                 m7, [r3+0*16]      ;  t10a
3145    mova                 m2, [r3+1*16]      ;  t11a
3146    mova          [r3+0*16], m0
3147    mova          [r3+1*16], m6
3148    mova                 m6, [r3+11*16]
3149    psubd                m0, m6, m2         ;  t11
3150    paddd                m6, m2             ;  out14
3151    mova          [r3+2*16], m6
3152    mova                 m2, [r3+10*16]
3153    psubd                m6, m2, m7         ;  t10
3154    paddd                m2, m7             ; -out1
3155    mova                 m7, [r3+5*16]      ;  t3
3156    mova          [r3+5*16], m2
3157    mova         [r3+10*16], m1
3158    mova                 m1, [r3+9*16]
3159    psubd                m2, m1, m7         ;  t3a
3160    paddd                m1, m7             ; -out15
3161    mova          [r3+3*16], m1
3162    mova                 m1, [r3+4*16]      ;  t2
3163    mova                 m7, [r3+8*16]
3164    psubd                m7, m1             ;  t2a
3165    paddd                m1, [r3+8*16]      ;  out0
3166    mova          [r3+4*16], m1
; clamp the eight values feeding the last stage; t7 lives in slot 10
3167    mova                 m1, [o(clip_18b_min)]
3168    REPX     {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
3169    pmaxsd               m1, [r3+10*16]
3170    mova         [r3+10*16], m1
3171    mova                 m1, [o(clip_18b_max)]
3172    REPX     {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
3173    pminsd               m1, [r3+10*16]
3174    mova         [r3+10*16], m1
; multiply by 2896; the product for slot 10 (t7) ends up in m1
3175    mova                 m1, [o(pd_2896)]
3176    REPX     {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
3177    pmulld               m1, [r3+10*16]
3178    mova         [r3+11*16], m3
3179    psubd                m3, m4, m1         ; -out11 (unshifted)
3180    paddd                m4, m1             ;  out4  (unshifted)
3181    psubd                m1, m6, m0         ; -out9  (unshifted)
3182    paddd                m6, m0             ;  out6  (unshifted)
3183    psubd                m0, m7, m2         ;  out8  (unshifted)
3184    paddd                m7, m2             ; -out7  (unshifted)
3185    mova                 m2, [r3+11*16]
3186    mova         [r3+11*16], m5
3187    paddd                m5, m2             ; -out5  (unshifted)
3188    psubd                m2, [r3+11*16]     ;  out10 (unshifted)
3189    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
3190    ; [r3+4*16..7*16] contain out0-3 and [r3+0*16..3*16] contain out12-15
3191%endif
3192    ret
; .main_part1 - first half of the 16-point inverse ADST.
; In:  m0-m7 = coefficient rows 2,13,6,9,10,5,14,1 (loaded by .main).
;      x86-64 additionally expects m11 = pd_2048 and m12/m13 = clip bounds.
; Out: eight intermediates in the scratch area at r3, consumed by
;      .main_part2 (names follow the comments there):
;      x86-64: slots 0..7 = t2, t3, t6a, t7a, t10a, t11a, t14, t15
;      x86-32: slots 0..7 = t10a, t11a, t14, t15, t2, t3, t6a, t7a
3193.main_part1:
3194%if ARCH_X86_64
3195    ITX_MULSUB_2D         1, 0, 8, 9, 10, 11,  995, 3973
3196    ITX_MULSUB_2D         3, 2, 8, 9, 10, 11, 2440, 3290
3197    ITX_MULSUB_2D         5, 4, 8, 9, 10, 11, 3513, 2106
3198    ITX_MULSUB_2D         7, 6, 8, 9, 10, 11, 4052,  601
3199    psubd                m8, m0, m4 ; t10a
3200    paddd                m0, m4     ; t2a
3201    psubd                m4, m1, m5 ; t11a
3202    paddd                m1, m5     ; t3a
3203    psubd                m5, m2, m6 ; t14a
3204    paddd                m2, m6     ; t6a
3205    psubd                m6, m3, m7 ; t15a
3206    paddd                m7, m3     ; t7a
; clamp every intermediate to [m12, m13] after each butterfly stage
3207    REPX    {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
3208    REPX    {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
; the multipliers are passed in registers here (m10 = pd_3406, m15 = pd_2276)
3209    mova                m15, [o(pd_2276)]
3210    mova                m10, [o(pd_3406)]
3211    ITX_MULSUB_2D         8, 4, 3, 9, _, 11, 10, 15
3212    ITX_MULSUB_2D         6, 5, 3, 9, _, 11, 15, 10
3213    psubd                m3, m0, m2 ; t6
3214    paddd                m0, m2     ; t2
3215    psubd                m2, m1, m7 ; t7
3216    paddd                m1, m7     ; t3
3217    psubd                m7, m4, m6 ; t14a
3218    paddd                m4, m6     ; t10a
3219    psubd                m6, m8, m5 ; t15a
3220    paddd                m5, m8     ; t11a
3221    REPX    {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
3222    REPX    {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
3223    mova                m15, [o(pd_1567)]
3224    mova                m10, [o(pd_3784)]
3225    ITX_MULSUB_2D         2, 3, 8, 9, _, 11, 10, 15
3226    ITX_MULSUB_2D         6, 7, 8, 9, _, 11, 10, 15
; hand the eight intermediates over to .main_part2 through the scratch area
3227    mova          [r3+0*16], m0
3228    mova          [r3+1*16], m1
3229    mova          [r3+4*16], m4
3230    mova          [r3+5*16], m5
3231    mova          [r3+2*16], m2
3232    mova          [r3+3*16], m3
3233    mova          [r3+6*16], m6
3234    mova          [r3+7*16], m7
3235%else
; Only eight registers: rows in m0-m3 are parked in slots 4-7 while the
; rotations of m4-m7 run, then the two groups swap places.
3236    mova          [r3+4*16], m0
3237    mova          [r3+5*16], m1
3238    mova          [r3+6*16], m2
3239    mova          [r3+7*16], m3
3240    mova                 m3, [o(pd_2048)]
3241    ITX_MULSUB_2D         5, 4, 0, 1, 2, 3, 3513, 2106
3242    ITX_MULSUB_2D         7, 6, 0, 1, 2, 3, 4052,  601
3243    mova          [r3+0*16], m4
3244    mova          [r3+1*16], m5
3245    mova          [r3+2*16], m6
3246    mova          [r3+3*16], m7
3247    mova                 m0, [r3+4*16]
3248    mova                 m1, [r3+5*16]
3249    mova                 m2, [r3+6*16]
3250    mova                 m7, [r3+7*16]
3251    ITX_MULSUB_2D         1, 0, 4, 5, 6, 3,  995, 3973
3252    ITX_MULSUB_2D         7, 2, 4, 5, 6, 3, 2440, 3290
3253    mova                 m4, [r3+0*16]
3254    mova                 m5, [r3+1*16]
3255    psubd                m6, m0, m4 ; t10a
3256    paddd                m0, m4     ; t2a
3257    mova          [r3+4*16], m6
3258    mova                 m6, [r3+2*16]
3259    mova                 m3, [r3+3*16]
3260    psubd                m4, m1, m5 ; t11a
3261    paddd                m1, m5     ; t3a
3262    psubd                m5, m2, m6 ; t14a
3263    paddd                m2, m6     ; t6a
3264    psubd                m6, m7, m3 ; t15a
3265    paddd                m7, m3     ; t7a
; clamp seven values in registers; the eighth (t10a in slot 4) is clamped by
; using the constant register as destination with the slot as source
3266    mova                 m3, [o(clip_18b_min)]
3267    REPX     {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
3268    pmaxsd               m3, [r3+4*16]
3269    mova          [r3+4*16], m3
3270    mova                 m3, [o(clip_18b_max)]
3271    REPX     {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
3272    pminsd               m3, [r3+4*16]
3273    mova          [r3+4*16], m3
3274    psubd                m3, m0, m2 ; t6
3275    paddd                m0, m2     ; t2
3276    psubd                m2, m1, m7 ; t7
3277    paddd                m1, m7     ; t3
3278    mova          [r3+5*16], m1
3279    mova          [r3+6*16], m3
3280    mova          [r3+7*16], m2
3281    mova                 m1, [r3+4*16]
3282    mova          [r3+4*16], m0
3283    mova                 m3, [o(pd_2048)]
3284    ITX_MULSUB_2D         1, 4, 0, 7, 2, 3, 3406, 2276
; NOTE(review): the small last argument (2) appears to name register m2,
; still holding a multiplier from the previous invocation, rather than a
; literal coefficient - confirm against the ITX_MULSUB_2D definition
3285    ITX_MULSUB_2D         6, 5, 0, 7, 2, 3, 2276,    2
3286    psubd                m7, m4, m6 ; t14a
3287    paddd                m4, m6     ; t10a
3288    psubd                m6, m1, m5 ; t15a
3289    paddd                m5, m1     ; t11a
3290    mova                 m1, [r3+5*16]
3291    mova                 m3, [r3+6*16]
3292    mova                 m2, [r3+7*16]
; same clamp trick; the memory-resident value this time is t2 in slot 4
3293    mova                 m0, [o(clip_18b_min)]
3294    REPX     {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
3295    pmaxsd               m0, [r3+4*16]
3296    mova          [r3+4*16], m0
3297    mova                 m0, [o(clip_18b_max)]
3298    REPX     {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
3299    pminsd               m0, [r3+4*16]
; store the finished values before the last two rotations need the registers
3300    mova          [r3+4*16], m0
3301    mova          [r3+5*16], m1
3302    mova          [r3+0*16], m4
3303    mova          [r3+1*16], m5
3304    mova                 m0, [o(pd_2048)]
3305    ITX_MULSUB_2D         2, 3, 1, 4, 5, 0, 3784, 1567
; NOTE(review): as above, the 5 appears to reuse the multiplier left in m5
3306    ITX_MULSUB_2D         6, 7, 1, 4, 5, 0,    5, 1567
3307    mova          [r3+6*16], m2
3308    mova          [r3+7*16], m3
3309    mova          [r3+2*16], m6
3310    mova          [r3+3*16], m7
3311%endif
3312    ret
3313
; .pass2 - second pass for adst: load the address of the 8bpc 8x4 ADST kernel
; into r4 and reuse the idct_16x4 second-pass loop, which calls through r4.
3314.pass2:
3315    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
3316    jmp m(idct_16x4_internal_16bpc).pass2_loop
3317
; Instantiate INV_TXFM_16X4_FN with flipadst as the first type, once for each
; of the four second types.
; NOTE(review): the macro is defined outside this chunk; presumably it emits
; the public entry points that dispatch into iflipadst_16x4_internal_16bpc -
; confirm
3318INV_TXFM_16X4_FN flipadst, dct
3319INV_TXFM_16X4_FN flipadst, adst
3320INV_TXFM_16X4_FN flipadst, flipadst
3321INV_TXFM_16X4_FN flipadst, identity
3322
; iflipadst_16x4_internal_16bpc - first pass of the 16x4 flipped inverse ADST.
; Runs the regular ADST (.main of iadst_16x4, including its .round) and then
; reverses the order of the 16 outputs while packing/transposing:
;   x86-64: each pair is packed with swapped operands (odd register first)
;           and the packed registers are then copied in reverse order into
;           m0,m2,..,m14 before jumping to the idct_16x4 .transpose label.
;   x86-32: the packed rows are fed to the transpose in reverse order, each
;           through pshufd q1032.
;           NOTE(review): q1032 presumably swaps the two 64-bit halves, i.e.
;           the two outputs packed in one register - confirm against the
;           q#### definitions.
3323cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3324    lea                  r3, [rsp+gprsize]                  ; r3 = stack scratch area
3325    call m(iadst_16x4_internal_16bpc).main
3326%if ARCH_X86_64
3327    packssdw             m1, m0
3328    packssdw             m3, m2
3329    packssdw             m5, m4
3330    packssdw             m7, m6
3331    packssdw             m9, m8
3332    packssdw            m11, m10
3333    packssdw            m13, m12
3334    packssdw            m15, m14
3335    mova                 m0, m15
3336    mova                 m2, m13
3337    mova                 m4, m11
3338    mova                 m6, m9
3339    mova                 m8, m7
3340    mova                m10, m5
3341    mova                m12, m3
3342    mova                m14, m1
3343    jmp m(idct_16x4_internal_16bpc).transpose
3344%else
; park the packed registers m0/m2/m4/m6 in slots 4-7, then transpose the
; rows from slots 8-11 in reverse order
3345    mova [rsp+gprsize+4*16], m0
3346    mova [rsp+gprsize+5*16], m2
3347    mova [rsp+gprsize+6*16], m4
3348    mova [rsp+gprsize+7*16], m6
3349    pshufd               m6, [rsp+gprsize+ 8*16], q1032
3350    pshufd               m4, [rsp+gprsize+ 9*16], q1032
3351    pshufd               m2, [rsp+gprsize+10*16], q1032
3352    pshufd               m0, [rsp+gprsize+11*16], q1032
3353    call m(idct_8x4_internal_16bpc).transpose4x8packed
; result of the first transpose goes to slots 0-3
3354    mova [rsp+gprsize+0*16], m0
3355    mova [rsp+gprsize+1*16], m1
3356    mova [rsp+gprsize+2*16], m2
3357    mova [rsp+gprsize+3*16], m3
; now the parked rows from slots 4-7, again in reverse order
3358    pshufd               m6, [rsp+gprsize+ 4*16], q1032
3359    pshufd               m4, [rsp+gprsize+ 5*16], q1032
3360    pshufd               m2, [rsp+gprsize+ 6*16], q1032
3361    pshufd               m0, [rsp+gprsize+ 7*16], q1032
3362    call m(idct_8x4_internal_16bpc).transpose4x8packed
3363    jmp                tx2q
3364%endif
3365
; .pass2 - second pass for flipadst: move dst forward by 3*stride and negate
; the stride, then run the same 8bpc ADST kernel and shared loop as adst.
; Note that .pass2_loop recomputes r3 from the (now negated) stride.
3366.pass2:
3367    lea                  r3, [strideq*3]
3368    lea                dstq, [dstq+r3]
3369    neg             strideq
3370    lea                  r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
3371    jmp m(idct_16x4_internal_16bpc).pass2_loop
3372
; Instantiate the 16x4 entry points whose first pass is the identity
; transform (INV_TXFM_16X4_FN is defined outside this section).
3373INV_TXFM_16X4_FN identity, dct
3374INV_TXFM_16X4_FN identity, adst
3375INV_TXFM_16X4_FN identity, flipadst
3376INV_TXFM_16X4_FN identity, identity
3377
; iidentity_16x4_internal_16bpc
; Pass 1 scales every 32-bit coefficient as (c * 11586 + 6144) >> 13.
; Pass 2 (.main) computes x + pmulhrsw(x, pw_1697x8) on packed words with
; signed saturation.
3378cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3379%if ARCH_X86_64
    ; m15 holds the multiplier for the first 15 products; the 16th product
    ; overwrites m15 and is spilled to [cq+0*16] (already consumed into m0)
    ; so m15 can hold the rounding constant next.
3380    mova                m15, [o(pd_11586)]
3381    pmulld               m0, m15, [cq+ 0*16]
3382    pmulld               m1, m15, [cq+ 1*16]
3383    pmulld               m2, m15, [cq+ 2*16]
3384    pmulld               m3, m15, [cq+ 3*16]
3385    pmulld               m4, m15, [cq+ 4*16]
3386    pmulld               m5, m15, [cq+ 5*16]
3387    pmulld               m6, m15, [cq+ 6*16]
3388    pmulld               m7, m15, [cq+ 7*16]
3389    pmulld               m8, m15, [cq+ 8*16]
3390    pmulld               m9, m15, [cq+ 9*16]
3391    pmulld              m10, m15, [cq+10*16]
3392    pmulld              m11, m15, [cq+11*16]
3393    pmulld              m12, m15, [cq+12*16]
3394    pmulld              m13, m15, [cq+13*16]
3395    pmulld              m14, m15, [cq+14*16]
3396    pmulld              m15, [cq+15*16]
3397    mova         [cq+ 0*16], m15
3398    mova                m15, [o(pd_6144)]
3399    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3400                         m8, m9, m10, m11, m12, m13, m14
    ; m15 = rounding constant + spilled 16th product
3401    paddd               m15, [cq+ 0*16]
3402    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3403                         m8, m9, m10, m11, m12, m13, m14, m15
3404    jmp m(idct_16x4_internal_16bpc).pack_transpose
3405%else
    ; Only m0-m7 are used here, so the 16 vectors are processed as two groups
    ; of 8: first the group at cq+8*16, then the group at the original cq.
    ; r5d counts the remaining groups.
3406    add                  cq, 8*16
3407    mov                 r5d, 2
3408.loop_pass1:
    ; Same spill trick as the x86-64 path, using m7 and [cq+7*16].
3409    mova                 m7, [o(pd_11586)]
3410    pmulld               m0, m7, [cq+0*16]
3411    pmulld               m1, m7, [cq+1*16]
3412    pmulld               m2, m7, [cq+2*16]
3413    pmulld               m3, m7, [cq+3*16]
3414    pmulld               m4, m7, [cq+4*16]
3415    pmulld               m5, m7, [cq+5*16]
3416    pmulld               m6, m7, [cq+6*16]
3417    pmulld               m7, [cq+7*16]
3418    mova          [cq+7*16], m7
3419    mova                 m7, [o(pd_6144)]
3420    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3421    paddd                m7, [cq+7*16]
3422    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
3423    packssdw             m0, m1
3424    packssdw             m2, m3
3425    packssdw             m4, m5
3426    packssdw             m6, m7
3427    call m(idct_8x4_internal_16bpc).transpose4x8packed
3428    dec                 r5d
3429    jz .end_pass1
    ; First group done: keep its result in stack slots 0-3 and step cq back
    ; to the other group. The second group's result stays in m0-m3.
3430    mova [rsp+gprsize+0*16], m0
3431    mova [rsp+gprsize+1*16], m1
3432    mova [rsp+gprsize+2*16], m2
3433    mova [rsp+gprsize+3*16], m3
3434    sub                  cq, 8*16
3435    jmp .loop_pass1
3436.end_pass1:
3437    jmp                tx2q
3438%endif
3439
; Pass 2: run the shared idct_16x4 pass-2 loop with r4 pointing at .main
; below. On x86-64 the multiplier is preloaded into m12.
3440.pass2:
3441%if ARCH_X86_64
3442    mova                m12, [o(pw_1697x8)]
3443%endif
3444    lea                  r4, [o(.main)]
3445    jmp m(idct_16x4_internal_16bpc).pass2_loop
; .main: m0-m3 += pmulhrsw(m0-m3, pw_1697x8), using saturating word adds.
; Clobbers m4-m7. x86-64 expects pw_1697x8 in m12; x86-32 loads it into m7.
3446.main:
3447%if ARCH_X86_64
3448    pmulhrsw             m4, m0, m12
3449    pmulhrsw             m5, m1, m12
3450    pmulhrsw             m6, m2, m12
3451    pmulhrsw             m7, m3, m12
3452%else
3453    mova                 m7, [o(pw_1697x8)]
3454    pmulhrsw             m4, m0, m7
3455    pmulhrsw             m5, m1, m7
3456    pmulhrsw             m6, m2, m7
    ; last use of the constant: m7 is overwritten with the product for m3
3457    pmulhrsw             m7, m3
3458%endif
3459    paddsw               m0, m4
3460    paddsw               m1, m5
3461    paddsw               m2, m6
3462    paddsw               m3, m7
3463    ret
3464
; INV_TXFM_16X8_FN type1, type2 [, eob_offset = 0]
; Wraps INV_TXFM_FN (defined outside this section) for the 16x8 size. It
; passes 16 (x86-64) or 8 (x86-32) as the fifth argument and a stack size of
; 0-8*16 or 0-13*16 as the sixth.
; For dct_dct only, it also emits the DC-only code below, which computes
;   r5d = ((c[0] * 181 + 128) >> 8) * 181
; clears c[0], sets r3d = 8 and tail-jumps into the 16x4 .dconly code.
; NOTE(review): r3d is presumably the row count expected by .dconly; confirm.
3465%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
3466%if ARCH_X86_64
3467    INV_TXFM_FN          %1, %2, %3, 16x8, 16, 0-8*16
3468%else
3469    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 0-13*16
3470%endif
3471%ifidn %1_%2, dct_dct
3472    imul                r5d, [cq], 181
3473    mov                [cq], eobd ; 0
3474    mov                 r3d, 8
3475    add                 r5d, 128
3476    sar                 r5d, 8
3477    imul                r5d, 181
3478%if ARCH_X86_32
    ; NOTE(review): presumably aligns the stack frame with the one the 16x4
    ; .dconly code expects; confirm against the 16x4 macro.
3479    add                 rsp, 1*16
3480%endif
3481    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
3482%endif
3483%endmacro
3484
; Instantiate the 16x8 entry points whose first pass is the DCT.
3485INV_TXFM_16X8_FN dct, dct
3486INV_TXFM_16X8_FN dct, identity, 6
3487INV_TXFM_16X8_FN dct, adst
3488INV_TXFM_16X8_FN dct, flipadst
3489
; idct_16x8_internal_16bpc
; Pass 1 (.loop_main/.loop_pass1) is shared by the 16x8 adst, flipadst and
; identity variants later in this file: each sets t0 to its own .main and
; jumps to .loop_main.
; On x86-32, t0 aliases r1 (stride), so r1 is saved in stack slot 12 first.
3490cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3491%if ARCH_X86_64
3492    DECLARE_REG_TMP 6, 4, 6
3493%else
3494    mov [rsp+gprsize+12*16], r1
3495    DECLARE_REG_TMP 1, 4, 3
3496%endif
3497    lea                  t0, [o(.main)]
3498.loop_main:
3499%undef cmp
    ; r5 = (eob >= 10) ? 16 : 0, the byte offset of the first 16-byte column
    ; group to process. With eob < 10 only the group at offset 0 is handled.
3500%if ARCH_X86_64
3501    xor                 r5d, r5d
3502    cmp                eobd, 10
3503    setge               r5b
3504%else
    ; cmp sets CF when eob < 10, so sbb leaves r5d = 1 only when eob >= 10
3505    mov                 r5d, 1
3506    cmp                eobd, 10
3507    sbb                 r5d, 0
3508%endif
3509    shl                 r5d, 4
3510
    ; r3 = scratch stack area used by the .main routines
3511    lea                  r3, [rsp+gprsize]
3512.loop_pass1:
    ; run the 1-D transform selected by t0 on the column group at offset r5
3513    call                 t0
3514%if ARCH_X86_64
3515    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
3516    mova       [cq+4*32+r5], m8
3517    mova       [cq+5*32+r5], m9
3518    mova       [cq+6*32+r5], m10
3519    mova       [cq+7*32+r5], m11
3520%else
3521    call m(idct_8x4_internal_16bpc).transpose4x8packed
3522    mova       [cq+4*32+r5], m0
3523    mova       [cq+5*32+r5], m1
3524    mova       [cq+6*32+r5], m2
3525    mova       [cq+7*32+r5], m3
    ; reload the other half of the .main output from stack slots 8-11
3526    mova                 m0, [rsp+gprsize+ 8*16]
3527    mova                 m2, [rsp+gprsize+ 9*16]
3528    mova                 m4, [rsp+gprsize+10*16]
3529    mova                 m6, [rsp+gprsize+11*16]
3530%endif
3531    call m(idct_8x4_internal_16bpc).transpose4x8packed
    ; zero coefficient rows 8-15 of this column group
3532    pxor                 m7, m7
3533    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
    ; At r5 == 0 this is the last group: m0-m3 stay in registers for pass 2.
    ; Otherwise store them to rows 0-3 and process the group at offset 0.
3534    test                r5d, r5d
3535    jz .end
3536    mova       [cq+0*32+r5], m0
3537    mova       [cq+1*32+r5], m1
3538    mova       [cq+2*32+r5], m2
3539    mova       [cq+3*32+r5], m3
3540    xor                 r5d, r5d
3541    jmp .loop_pass1
3542.end:
3543
3544    jmp                tx2q
; .main: 16-point DCT for the column group at byte offset r5.
; Odd-indexed rows go through rect2_mul + main_oddhalf; even-indexed rows go
; through rect2_mul + main_pass1, followed by the two rounding helpers.
; On x86-64 the sixteen dword results are packed into m0, m2, ..., m14.
3545.main:
3546%if ARCH_X86_64
    ; constants the helper routines expect in m11-m14
3547    mova                m11, [o(pd_2048)]
3548    mova                m12, [o(clip_18b_min)]
3549    mova                m13, [o(clip_18b_max)]
3550    mova                m14, [o(pd_2896)]
3551%endif
3552    mova                 m0, [cq+ 1*32+r5]
3553    mova                 m1, [cq+ 3*32+r5]
3554    mova                 m2, [cq+ 5*32+r5]
3555    mova                 m3, [cq+ 7*32+r5]
3556    mova                 m4, [cq+ 9*32+r5]
3557    mova                 m5, [cq+11*32+r5]
3558    mova                 m6, [cq+13*32+r5]
3559    mova                 m7, [cq+15*32+r5]
3560    call m(idct_8x4_internal_16bpc).rect2_mul
3561    call m(idct_16x4_internal_16bpc).main_oddhalf
3562
3563    mova                 m0, [cq+ 0*32+r5]
3564    mova                 m1, [cq+ 2*32+r5]
3565    mova                 m2, [cq+ 4*32+r5]
3566    mova                 m3, [cq+ 6*32+r5]
3567    mova                 m4, [cq+ 8*32+r5]
3568    mova                 m5, [cq+10*32+r5]
3569    mova                 m6, [cq+12*32+r5]
3570    mova                 m7, [cq+14*32+r5]
3571    call m(idct_8x4_internal_16bpc).rect2_mul
3572    call m(idct_8x4_internal_16bpc).main_pass1
3573    call m(idct_8x4_internal_16bpc).round
3574    call m(idct_16x4_internal_16bpc).round
3575%if ARCH_X86_64
3576    packssdw             m0, m1
3577    packssdw             m2, m3
3578    packssdw             m4, m5
3579    packssdw             m6, m7
3580    packssdw             m8, m9
3581    packssdw            m10, m11
3582    packssdw            m12, m13
3583    packssdw            m14, m15
3584%endif
3585    ret
3586
; Pass 2: two iterations (r4d = 2), each advancing dst by 16 bytes and cq by
; 4*32 bytes. Each runs the 8 bpc SSSE3 8x8 IDCT core and then the 16 bpc
; round-and-write helper.
; On x86-32 stride is reloaded from stack slot 12, where pass 1 saved it.
3587.pass2:
3588%if ARCH_X86_32
3589    mov             strideq, [rsp+gprsize+12*16]
3590%endif
3591    mov                 r4d, 2
; NOTE(review): .pass2_main is not referenced inside this section; presumably
; it is an entry point for callers that have already set up r4d and stride.
3592.pass2_main:
3593%if ARCH_X86_64
    ; m8 = pw_2048, m9 = zero, m10 = pixel_10bpc_max for the write helper
3594    mova                 m8, [o(pw_2048)]
3595    pxor                 m9, m9
3596    mova                m10, [o(pixel_10bpc_max)]
3597%endif
3598    lea                  r3, [strideq*3]
    ; first iteration: m0-m3 are still live from pass 1, so skip their reload
3599    jmp .loop_pass2_entry
3600.loop_pass2:
3601    mova                 m0, [cq+0*32+ 0]
3602    mova                 m1, [cq+1*32+ 0]
3603    mova                 m2, [cq+2*32+ 0]
3604    mova                 m3, [cq+3*32+ 0]
3605.loop_pass2_entry:
3606    mova                 m4, [cq+0*32+16]
3607    mova                 m5, [cq+1*32+16]
3608    mova                 m6, [cq+2*32+16]
3609    mova                 m7, [cq+3*32+16]
3610%if ARCH_X86_32
    ; NOTE(review): presumably the 8 bpc routine addresses its constants
    ; relative to r5 = itx8_start; confirm.
3611    lea                  r5, [o(itx8_start)]
3612%endif
3613    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
3614    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
3615%if ARCH_X86_64
3616%define mzero m9
3617%else
3618%define mzero m7
3619    pxor                 m7, m7
3620%endif
    ; clear the 8 coefficient vectors just consumed
3621    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3622    add                dstq, 16
3623    add                  cq, 4*32
3624    dec                 r4d
3625    jg .loop_pass2
3626    RET
3627
; Instantiate the 16x8 entry points whose first pass is the ADST.
3628INV_TXFM_16X8_FN adst, dct
3629INV_TXFM_16X8_FN adst, adst
3630INV_TXFM_16X8_FN adst, flipadst
3631INV_TXFM_16X8_FN adst, identity, 6
3632
; iadst_16x8_internal_16bpc
; Pass 1 reuses idct_16x8's .loop_main with t0 pointing at the ADST .main
; below. No DECLARE_REG_TMP is issued here, so t0 keeps the mapping declared
; in idct_16x8_internal_16bpc.
; On x86-32, r1 (stride) is saved in stack slot 12 because t0 aliases it.
3633cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3634%if ARCH_X86_32
3635    mov [rsp+gprsize+12*16], r1
3636%endif
3637    lea                  t0, [o(.main)]
3638    jmp m(idct_16x8_internal_16bpc).loop_main
3639
; .main: 16-point ADST for the column group at byte offset r5.
; Two sets of eight rows are loaded in the order main_part1 and main_part2
; expect, each pre-scaled by rect2_mul, and the result is rounded by
; iadst_16x4's .round.
; On x86-64 the sixteen dword results are packed into m0, m2, ..., m14.
3640.main:
3641%if ARCH_X86_64
    ; constants the helper routines expect in m11-m14
3642    mova                m11, [o(pd_2048)]
3643    mova                m12, [o(clip_18b_min)]
3644    mova                m13, [o(clip_18b_max)]
3645    mova                m14, [o(pd_2896)]
3646%endif
3647    mova                 m0, [cq+ 2*32+r5]
3648    mova                 m1, [cq+13*32+r5]
3649    mova                 m2, [cq+ 6*32+r5]
3650    mova                 m3, [cq+ 9*32+r5]
3651    mova                 m4, [cq+10*32+r5]
3652    mova                 m5, [cq+ 5*32+r5]
3653    mova                 m6, [cq+14*32+r5]
3654    mova                 m7, [cq+ 1*32+r5]
3655    call m(idct_8x4_internal_16bpc).rect2_mul
3656    call m(iadst_16x4_internal_16bpc).main_part1
3657    mova                 m0, [cq+ 0*32+r5]
3658    mova                 m1, [cq+15*32+r5]
3659    mova                 m2, [cq+ 4*32+r5]
3660    mova                 m3, [cq+11*32+r5]
3661    mova                 m4, [cq+ 8*32+r5]
3662    mova                 m5, [cq+ 7*32+r5]
3663    mova                 m6, [cq+12*32+r5]
3664    mova                 m7, [cq+ 3*32+r5]
3665%if ARCH_X86_32
    ; Temporarily bias r3 by 8 slots around rect2_mul.
    ; NOTE(review): presumably so its scratch use does not overwrite what
    ; main_part1 left at r3; confirm against rect2_mul.
3666    add                  r3, 8*16
3667%endif
3668    call m(idct_8x4_internal_16bpc).rect2_mul
3669%if ARCH_X86_32
3670    sub                  r3, 8*16
3671%endif
3672    call m(iadst_16x4_internal_16bpc).main_part2
3673    call m(iadst_16x4_internal_16bpc).round
3674%if ARCH_X86_64
3675    packssdw             m0, m1
3676    packssdw             m2, m3
3677    packssdw             m4, m5
3678    packssdw             m6, m7
3679    packssdw             m8, m9
3680    packssdw            m10, m11
3681    packssdw            m12, m13
3682    packssdw            m14, m15
3683%endif
3684    ret
3685
; Pass 2: same two-iteration structure as idct_16x8's pass 2, but built on
; the 8 bpc SSSE3 8x8 ADST core (.main + .main_pass2_end) and the iadst_8x8
; 16 bpc write helper. x86-64 additionally loads pw_m2048 into m11.
3686.pass2:
3687%if ARCH_X86_32
    ; stride was saved in stack slot 12 during pass 1
3688    mov             strideq, [rsp+gprsize+12*16]
3689%endif
3690    mov                 r4d, 2
3691%if ARCH_X86_64
3692    mova                 m8, [o(pw_2048)]
3693    pxor                 m9, m9
3694    mova                m10, [o(pixel_10bpc_max)]
3695    mova                m11, [o(pw_m2048)]
3696%endif
3697    lea                  r3, [strideq*3]
    ; first iteration: m0-m3 are still live from pass 1, so skip their reload
3698    jmp .loop_pass2_entry
3699.loop_pass2:
3700    mova                 m0, [cq+0*32+ 0]
3701    mova                 m1, [cq+1*32+ 0]
3702    mova                 m2, [cq+2*32+ 0]
3703    mova                 m3, [cq+3*32+ 0]
3704.loop_pass2_entry:
3705    mova                 m4, [cq+0*32+16]
3706    mova                 m5, [cq+1*32+16]
3707    mova                 m6, [cq+2*32+16]
3708    mova                 m7, [cq+3*32+16]
3709%if ARCH_X86_32
3710    lea                  r5, [o(itx8_start)]
3711%endif
3712    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
3713    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
3714    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
3715%if ARCH_X86_64
3716%define mzero m9
3717%else
3718%define mzero m7
3719    pxor                 m7, m7
3720%endif
    ; clear the 8 coefficient vectors just consumed
3721    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3722    add                dstq, 16
3723    add                  cq, 4*32
3724    dec                 r4d
3725    jg .loop_pass2
3726    RET
3727
; Instantiate the 16x8 entry points whose first pass is the flipped ADST.
3728INV_TXFM_16X8_FN flipadst, dct
3729INV_TXFM_16X8_FN flipadst, adst
3730INV_TXFM_16X8_FN flipadst, flipadst
3731INV_TXFM_16X8_FN flipadst, identity, 6
3732
; iflipadst_16x8_internal_16bpc
; Pass 1 reuses idct_16x8's .loop_main with t0 pointing at .main below.
; .main runs the ADST .main and then reverses the order of its packed
; outputs.
; Pass 2 reuses the ADST pass 2 with dst moved to row 7 and a negated stride.
3733cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3734%if ARCH_X86_32
    ; save r1 (stride): t0 aliases it on x86-32
3735    mov [rsp+gprsize+12*16], r1
3736%endif
3737    lea                  t0, [o(.main)]
3738    jmp m(idct_16x8_internal_16bpc).loop_main
; .main: swap the packed outputs end for end (m0<->m14, m2<->m12, m4<->m10,
; m6<->m8 on x86-64; m0/m2/m4/m6 <-> stack slots 11/10/9/8 on x86-32).
; pshufd q1032 also swaps the two 64-bit halves of each vector.
3739.main:
3740    call m(iadst_16x8_internal_16bpc).main
3741%if ARCH_X86_64
    ; m1/m3/m5/m7 are free after packing and serve as temporaries
3742    pshufd               m1, m0, q1032
3743    pshufd               m3, m2, q1032
3744    pshufd               m5, m4, q1032
3745    pshufd               m7, m6, q1032
3746    pshufd               m0, m14, q1032
3747    pshufd               m2, m12, q1032
3748    pshufd               m4, m10, q1032
3749    pshufd               m6, m8, q1032
3750    mova                m14, m1
3751    mova                m12, m3
3752    mova                m10, m5
3753    mova                 m8, m7
3754%else
3755    pshufd               m1, m0, q1032
3756    pshufd               m3, m2, q1032
3757    pshufd               m5, m4, q1032
3758    pshufd               m7, m6, q1032
3759    pshufd               m0, [r3+11*16], q1032
3760    pshufd               m2, [r3+10*16], q1032
3761    pshufd               m4, [r3+9*16], q1032
3762    pshufd               m6, [r3+8*16], q1032
3763    mova          [r3+8*16], m7
3764    mova          [r3+9*16], m5
3765    mova         [r3+10*16], m3
3766    mova         [r3+11*16], m1
3767%endif
3768    ret
3769
; Pass 2: dst = dst + 8*stride - stride = row 7, and stride = -stride, so
; the shared ADST pass 2 writes the rows bottom-up.
3770.pass2:
3771%if ARCH_X86_32
3772    mov             strideq, [rsp+gprsize+12*16]
3773%endif
3774    lea                dstq, [dstq+strideq*8]
3775    neg             strideq
3776    add                dstq, strideq
3777%if ARCH_X86_32
    ; the ADST pass 2 reloads stride from this slot, so store the negated value
3778    mov [rsp+gprsize+12*16], strideq
3779%endif
3780    jmp m(iadst_16x8_internal_16bpc).pass2
3781
; Instantiate the 16x8 entry points whose first pass is the identity
; transform.
3782INV_TXFM_16X8_FN identity, dct, -54
3783INV_TXFM_16X8_FN identity, adst, -54
3784INV_TXFM_16X8_FN identity, flipadst, -54
3785INV_TXFM_16X8_FN identity, identity
3786
; iidentity_16x8_internal_16bpc
; Pass 1 reuses idct_16x8's .loop_main with t0 pointing at .main below.
; Pass 2 only rounds the packed values and writes them out; no transform
; core is called.
3787cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3788%if ARCH_X86_32
    ; save r1 (stride): t0 aliases it on x86-32
3789    mov [rsp+gprsize+12*16], r1
3790%endif
3791    lea                  t0, [o(.main)]
3792    jmp m(idct_16x8_internal_16bpc).loop_main
; .main: for the column group at byte offset r5, scale each coefficient in
; two steps and pack to words.
;   x86-64: (c * 2896 + 2048) >> 12, then (x * 11586 + 6144) >> 13, inline.
;   x86-32: rect2_mul for the first step, then the 11586/6144/13 step.
; NOTE(review): rect2_mul is presumably the same 2896/2048/12 scaling that
; the x86-64 path spells out; confirm.
3793.main:
3794%if ARCH_X86_64
    ; m15 holds the current constant, so the 16th value is repeatedly
    ; spilled to [r3] while m15 is reloaded
3795    mova                m15, [o(pd_2896)]
3796    pmulld               m0, m15, [cq+ 0*32+r5]
3797    pmulld               m1, m15, [cq+ 1*32+r5]
3798    pmulld               m2, m15, [cq+ 2*32+r5]
3799    pmulld               m3, m15, [cq+ 3*32+r5]
3800    pmulld               m4, m15, [cq+ 4*32+r5]
3801    pmulld               m5, m15, [cq+ 5*32+r5]
3802    pmulld               m6, m15, [cq+ 6*32+r5]
3803    pmulld               m7, m15, [cq+ 7*32+r5]
3804    pmulld               m8, m15, [cq+ 8*32+r5]
3805    pmulld               m9, m15, [cq+ 9*32+r5]
3806    pmulld              m10, m15, [cq+10*32+r5]
3807    pmulld              m11, m15, [cq+11*32+r5]
3808    pmulld              m12, m15, [cq+12*32+r5]
3809    pmulld              m13, m15, [cq+13*32+r5]
3810    pmulld              m14, m15, [cq+14*32+r5]
3811    pmulld              m15, [cq+15*32+r5]
3812    mova               [r3], m15
3813    mova                m15, [o(pd_2048)]
3814    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3815                         m8, m9, m10, m11, m12, m13, m14
3816    paddd               m15, [r3]
3817    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3818                         m8, m9, m10, m11, m12, m13, m14, m15
    ; second step: multiply by 11586, add 6144, shift right by 13
3819    mova               [r3], m15
3820    mova                m15, [o(pd_11586)]
3821    REPX    {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3822                         m8, m9, m10, m11, m12, m13, m14
3823    pmulld              m15, [r3]
3824    mova               [r3], m15
3825    mova                m15, [o(pd_6144)]
3826    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
3827                         m8, m9, m10, m11, m12, m13, m14
3828    paddd               m15, [r3]
3829    REPX     {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
3830                         m8, m9, m10, m11, m12, m13, m14, m15
3831    packssdw             m0, m1
3832    packssdw             m2, m3
3833    packssdw             m4, m5
3834    packssdw             m6, m7
3835    packssdw             m8, m9
3836    packssdw            m10, m11
3837    packssdw            m12, m13
3838    packssdw            m14, m15
3839%else
    ; rows 0-7 first; the packed result goes to stack slots 8-11
3840    mova                 m0, [cq+ 0*32+r5]
3841    mova                 m1, [cq+ 1*32+r5]
3842    mova                 m2, [cq+ 2*32+r5]
3843    mova                 m3, [cq+ 3*32+r5]
3844    mova                 m4, [cq+ 4*32+r5]
3845    mova                 m5, [cq+ 5*32+r5]
3846    mova                 m6, [cq+ 6*32+r5]
3847    mova                 m7, [cq+ 7*32+r5]
3848    call m(idct_8x4_internal_16bpc).rect2_mul
3849    mova               [r3], m7
3850    mova                 m7, [o(pd_11586)]
3851    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
3852    pmulld               m7, [r3]
3853    mova               [r3], m7
3854    mova                 m7, [o(pd_6144)]
3855    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3856    paddd                m7, [r3]
3857    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
3858    packssdw             m0, m1
3859    packssdw             m2, m3
3860    packssdw             m4, m5
3861    packssdw             m6, m7
3862    mova         [r3+ 8*16], m0
3863    mova         [r3+ 9*16], m2
3864    mova         [r3+10*16], m4
3865    mova         [r3+11*16], m6
    ; rows 8-15; the packed result stays in m0/m2/m4/m6
3866    mova                 m0, [cq+ 8*32+r5]
3867    mova                 m1, [cq+ 9*32+r5]
3868    mova                 m2, [cq+10*32+r5]
3869    mova                 m3, [cq+11*32+r5]
3870    mova                 m4, [cq+12*32+r5]
3871    mova                 m5, [cq+13*32+r5]
3872    mova                 m6, [cq+14*32+r5]
3873    mova                 m7, [cq+15*32+r5]
3874    call m(idct_8x4_internal_16bpc).rect2_mul
3875    mova               [r3], m7
3876    mova                 m7, [o(pd_11586)]
3877    REPX      {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
3878    pmulld               m7, [r3]
3879    mova               [r3], m7
3880    mova                 m7, [o(pd_6144)]
3881    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
3882    paddd                m7, [r3]
3883    REPX      {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
3884    packssdw             m0, m1
3885    packssdw             m2, m3
3886    packssdw             m4, m5
3887    packssdw             m6, m7
3888%endif
3889    ret
; Pass 2: two iterations (r4d = 2), each advancing dst by 16 bytes and cq by
; 4*32 bytes. The multiplier pw_4096 is passed to the write helper in m8
; (x86-64) or m7 (x86-32).
3890.pass2:
3891%if ARCH_X86_32
    ; stride was saved in stack slot 12 during pass 1
3892    mov             strideq, [rsp+gprsize+12*16]
3893%endif
3894    mov                 r4d, 2
3895%if ARCH_X86_64
3896    mova                 m8, [o(pw_4096)]
3897    pxor                 m9, m9
3898    mova                m10, [o(pixel_10bpc_max)]
3899%endif
3900    lea                  r3, [strideq*3]
    ; first iteration: m0-m3 are still live from pass 1, so skip their reload
3901    jmp .loop_pass2_entry
3902.loop_pass2:
3903    mova                 m0, [cq+0*32+ 0]
3904    mova                 m1, [cq+1*32+ 0]
3905    mova                 m2, [cq+2*32+ 0]
3906    mova                 m3, [cq+3*32+ 0]
3907.loop_pass2_entry:
3908    mova                 m4, [cq+0*32+16]
3909    mova                 m5, [cq+1*32+16]
3910    mova                 m6, [cq+2*32+16]
3911    mova                 m7, [cq+3*32+16]
3912%if ARCH_X86_64
3913    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
3914%else
    ; m7 is needed for the multiplier, so its data is passed via the stack
3915    mova      [rsp+gprsize], m7
3916    mova                 m7, [o(pw_4096)]
3917    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
3918%endif
3919%if ARCH_X86_64
3920%define mzero m9
3921%else
3922%define mzero m7
3923    pxor                 m7, m7
3924%endif
    ; clear the 8 coefficient vectors just consumed
3925    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
3926    add                dstq, 16
3927    add                  cq, 4*32
3928    dec                 r4d
3929    jg .loop_pass2
3930    RET
3931
; INV_TXFM_16X16_FN type1, type2 [, eob_tbl_suffix = 2d]
; Wraps INV_TXFM_FN (defined outside this section) for the 16x16 size. Its
; third argument is the table name tbl_16x16_<suffix>; the stack size is
; 0-(16+WIN64)*16 on x86-64 and 0-17*16 on x86-32.
; For dct_dct only, it also emits the DC-only code below, which computes
;   r5d = (c[0] * 181 + 640) >> 10
; clears c[0], sets r3d = 16, adjusts rsp and tail-jumps into the 16x4
; .dconly2 code.
; NOTE(review): r3d is presumably the row count expected by .dconly2, and the
; rsp adjustment presumably matches this frame to the 16x4 one; confirm.
3932%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
3933%if ARCH_X86_64
3934    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
3935%else
3936    INV_TXFM_FN          %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
3937%endif
3938%ifidn %1_%2, dct_dct
3939    imul                r5d, [cq], 181
3940    mov                [cq], eobd ; 0
3941    mov                 r3d, 16
3942    add                 r5d, 640
3943    sar                 r5d, 10
3944    add                 rsp, (5+ARCH_X86_64*3+WIN64)*16
3945    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
3946%endif
3947%endmacro
3948
; Instantiate the 16x16 entry points whose first pass is the DCT.
3949INV_TXFM_16X16_FN dct, dct
3950INV_TXFM_16X16_FN dct, identity, v
3951INV_TXFM_16X16_FN dct, adst
3952INV_TXFM_16X16_FN dct, flipadst
3953
; idct_16x16_internal_16bpc
; Pass 1 (.pass1_full/.loop_pass1) is a shared driver: other 16x16 variants
; set t0 to their own .main and jump to .pass1_full.
; Registers that double as temporaries are saved above the 16 scratch slots:
; r7 on WIN64; r1 (= t0) and r6 (= t1) on x86-32.
3954cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
3955%if ARCH_X86_64
3956    DECLARE_REG_TMP       6, 7
3957%if WIN64
3958    mov [rsp+16*16+gprsize], r7
3959%endif
3960%elif ARCH_X86_32
3961    DECLARE_REG_TMP       1, 6
3962    mov [rsp+16*16+gprsize*1], r1
3963    mov [rsp+16*16+gprsize*2], r6
3964%endif
3965    lea                  t0, [o(.main)]
3966.pass1_full:
3967%undef cmp
    ; Scan the byte table at r5 downwards from index 3 until eob >= table[t1]
    ; (unsigned compare). t1 is then the index of the last 16-byte column
    ; group that needs processing.
    ; NOTE(review): r5 presumably points at the tbl_16x16_* table selected by
    ; the INV_TXFM_16X16_FN wrapper; confirm against INV_TXFM_FN.
3968    mov                 t1d, 4
3969.zero_loop:
3970    dec                 t1d
3971    cmp                eobb, byte [r5+t1]
3972    jb .zero_loop
    ; r5 = byte offset of that column group; it counts down by 16 per pass
3973    mov                 r5d, t1d
3974    shl                 r5d, 4
3975%if ARCH_X86_32
3976    ; restore pic-ptr
3977    mov                  r6, [rsp+16*16+2*gprsize]
3978%endif
3979    ; setup stack pointer
3980    lea                  r3, [rsp+gprsize]
3981.loop_pass1:
    ; run the 1-D transform selected by t0 on the column group at offset r5
3982    call                 t0
3983%if ARCH_X86_64
3984    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
3985    mova       [cq+4*64+r5], m8
3986    mova       [cq+5*64+r5], m9
3987    mova       [cq+6*64+r5], m10
3988    mova       [cq+7*64+r5], m11
3989%else
3990    call m(idct_8x4_internal_16bpc).transpose4x8packed
3991    mova       [cq+4*64+r5], m0
3992    mova       [cq+5*64+r5], m1
3993    mova       [cq+6*64+r5], m2
3994    mova       [cq+7*64+r5], m3
    ; reload the other half of the .main output from stack slots 8-11
3995    mova                 m0, [rsp+gprsize+ 8*16]
3996    mova                 m2, [rsp+gprsize+ 9*16]
3997    mova                 m4, [rsp+gprsize+10*16]
3998    mova                 m6, [rsp+gprsize+11*16]
3999%endif
4000    call m(idct_8x4_internal_16bpc).transpose4x8packed
4001    mova       [cq+0*64+r5], m0
4002    mova       [cq+1*64+r5], m1
4003    mova       [cq+2*64+r5], m2
4004    mova       [cq+3*64+r5], m3
    ; zero coefficient rows 8-15 of this column group
4005    pxor                 m0, m0
4006    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
4007    sub                 r5d, 16
4008    jge .loop_pass1
4009
4010%if ARCH_X86_32
4011    ; restore r1 (stride), which served as t0 during pass 1
4012    mov                  r1, [rsp+16*16+1*gprsize]
4013%endif
4014    jmp                tx2q
; .main: 16-point DCT for the column group at byte offset r5.
; Odd-indexed rows go through main_oddhalf and even-indexed rows through
; main_pass1, followed by the two rounding helpers. Unlike the 16x8 version
; there is no rect2_mul.
; On x86-64 the sixteen dword results are packed into m0, m2, ..., m14.
4015.main:
4016%if ARCH_X86_64
    ; constants the helper routines expect in m11-m14
4017    mova                m11, [o(pd_2048)]
4018    mova                m12, [o(clip_18b_min)]
4019    mova                m13, [o(clip_18b_max)]
4020    mova                m14, [o(pd_2896)]
4021%endif
4022
4023    mova                 m0, [cq+ 1*64+r5]
4024    mova                 m1, [cq+ 3*64+r5]
4025    mova                 m2, [cq+ 5*64+r5]
4026    mova                 m3, [cq+ 7*64+r5]
4027    mova                 m4, [cq+ 9*64+r5]
4028    mova                 m5, [cq+11*64+r5]
4029    mova                 m6, [cq+13*64+r5]
4030    mova                 m7, [cq+15*64+r5]
4031    call m(idct_16x4_internal_16bpc).main_oddhalf
4032
4033    mova                 m0, [cq+ 0*64+r5]
4034    mova                 m1, [cq+ 2*64+r5]
4035    mova                 m2, [cq+ 4*64+r5]
4036    mova                 m3, [cq+ 6*64+r5]
4037    mova                 m4, [cq+ 8*64+r5]
4038    mova                 m5, [cq+10*64+r5]
4039    mova                 m6, [cq+12*64+r5]
4040    mova                 m7, [cq+14*64+r5]
4041    call m(idct_8x4_internal_16bpc).main_pass1
4042    call m(idct_8x4_internal_16bpc).round
4043    call .round
4044%if ARCH_X86_64
4045    packssdw             m0, m1
4046    packssdw             m2, m3
4047    packssdw             m4, m5
4048    packssdw             m6, m7
4049    packssdw             m8, m9
4050    packssdw            m10, m11
4051    packssdw            m12, m13
4052    packssdw            m14, m15
4053%endif
4054    ret
; .round: final butterfly of pass 1. The eight values in m0-m7 are clamped to
; the clip_18b range and offset by the rounding constant 2. They are then
; combined with the eight values stored at r3 as sums and differences, and
; each result is shifted right by 2.
;   x86-64: inputs from [r3+0..7*16]; out0-15 end up in m0-m15 as dwords.
;   x86-32: inputs from [r3+4..11*16]; results are packed to words pairwise.
;           out0-7 go to [r3+8..11*16] and out8-15 stay in m0, m2, m4, m6.
4055.round:
4056%if ARCH_X86_64
4057    REPX    {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4058    REPX    {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
4059    psrld                m8, m11, 10        ; 2
4060    REPX      {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
4061    mova                 m8, [r3+1*16]
4062    mova                 m9, [r3+2*16]
4063    mova                m10, [r3+3*16]
4064    mova                m11, [r3+4*16]
4065    mova                m12, [r3+5*16]
4066    mova                m13, [r3+6*16]
4067    mova                m14, [r3+7*16]
4068    psubd               m15, m0, m14       ; out15
4069    paddd                m0, m14           ; out0
4070    psubd               m14, m1, m13       ; out14
4071    paddd                m1, m13           ; out1
4072    psubd               m13, m2, m12       ; out13
4073    paddd                m2, m12           ; out2
4074    psubd               m12, m3, m11       ; out12
4075    paddd                m3, m11           ; out3
4076    psubd               m11, m4, m10       ; out11
4077    paddd                m4, m10           ; out4
4078    psubd               m10, m5, m9        ; out10
4079    paddd                m5, m9            ; out5
4080    psubd                m9, m6, m8        ; out9
4081    paddd                m6, m8            ; out6
4082    psubd                m8, m7, [r3+0*16] ; out8
4083    paddd                m7, [r3+0*16]     ; out7
4084    REPX       {psrad x, 2}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
4085                             m8,  m9,  m10, m11, m12, m13, m14, m15
4086    ; and out0-15 is now in m0-15
4087%else
    ; Only m0-m7 are available, so [r3+0*16] is used as a swap slot while
    ; one register at a time holds the clamp or rounding constant.
4088    mova         [r3+ 0*16], m0
4089    mova                 m0, [o(clip_18b_min)]
4090    REPX     {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
4091    pmaxsd               m0, [r3+ 0*16]
4092    mova         [r3+ 0*16], m7
4093    mova                 m7, [o(clip_18b_max)]
4094    REPX     {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
4095    pminsd               m7, [r3+ 0*16]
4096    mova         [r3+ 0*16], m0
4097    mova                 m0, [o(pd_2)]
4098    REPX      {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
4099    paddd                m0, [r3+ 0*16]
    ; park the first three values in slots 0-2 to free registers
4100    mova         [r3+ 0*16], m0
4101    mova         [r3+ 1*16], m1
4102    mova         [r3+ 2*16], m2
4103    mova                 m1, [r3+11*16]
4104    mova                 m2, [r3+10*16]
4105    psubd                m0, m7, m1
4106    paddd                m7, m1
4107    psubd                m1, m6, m2
4108    paddd                m6, m2
4109    REPX       {psrad x, 2}, m0, m1, m6, m7
4110    packssdw             m0, m1     ; out8-9
4111    packssdw             m6, m7     ; out6-7
4112    mova         [r3+11*16], m6
4113    mova                 m1, [r3+9*16]
4114    mova                 m7, [r3+8*16]
4115    psubd                m2, m5, m1
4116    paddd                m5, m1
4117    psubd                m1, m4, m7
4118    paddd                m4, m7
4119    REPX       {psrad x, 2}, m2, m1, m4, m5
4120    packssdw             m2, m1     ; out10-11
4121    packssdw             m4, m5     ; out4-5
4122    mova                 m1, [r3+2*16]
4123    mova         [r3+10*16], m4
4124    mova                 m6, [r3+7*16]
4125    mova                 m7, [r3+6*16]
4126    psubd                m4, m3, m6
4127    paddd                m3, m6
4128    psubd                m6, m1, m7
4129    paddd                m1, m7
4130    REPX       {psrad x, 2}, m4, m6, m1, m3
4131    packssdw             m4, m6     ; out12-13
4132    packssdw             m1, m3     ; out2-3
4133    mova                 m3, [r3+1*16]
4134    mova          [r3+9*16], m1
4135    mova                 m1, [r3+0*16]
4136    mova                 m5, [r3+5*16]
4137    mova                 m7, [r3+4*16]
4138    psubd                m6, m3, m5
4139    paddd                m3, m5
4140    psubd                m5, m1, m7
4141    paddd                m1, m7
4142    REPX       {psrad x, 2}, m6, m5, m1, m3
4143    packssdw             m6, m5     ; out14-15
4144    packssdw             m1, m3     ; out0-1
4145    mova          [r3+8*16], m1
4146%endif
4147    ret
4148
; Pass 2: two iterations (r4d = 2), each consuming 64*4 bytes of cq and
; advancing the saved dst by 16 bytes. The base dst of the current strip is
; kept in r7 (x86-64) or in the stack slot at rsp+2*gprsize+16*16 (x86-32),
; which is the slot that held r6 during pass 1.
; Each iteration runs the 8 bpc SSSE3 idct_8x8 and idct_16x8 cores, writes
; rows 8-15 first and then rows 0-7.
4149.pass2:
4150%if ARCH_X86_64
4151    mova                 m8, [o(pw_2048)]
4152    pxor                 m9, m9
4153    mova                m10, [o(pixel_10bpc_max)]
4154    mov                  r7, dstq
4155%else
4156    mov [rsp+2*gprsize+16*16], dstq
4157%endif
4158    lea                  r3, [strideq*3]
4159    mov                 r4d, 2
4160.loop_pass2:
4161%if ARCH_X86_32
4162    lea                  r5, [o(itx8_start)]
4163%endif
    ; first set of inputs: rows at cq+0*64 and cq+2*64, four vectors each
4164    mova                 m0, [cq+0*64+ 0]
4165    mova                 m1, [cq+2*64+ 0]
4166    mova                 m2, [cq+0*64+16]
4167    mova                 m3, [cq+2*64+16]
4168    mova                 m4, [cq+0*64+32]
4169    mova                 m5, [cq+2*64+32]
4170    mova                 m6, [cq+0*64+48]
4171    mova                 m7, [cq+2*64+48]
4172    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
    ; stash the 8x8 core's outputs in stack slots 3-9 for the 16x8 core
4173    mova [rsp+gprsize+3*16], m0
4174    mova [rsp+gprsize+4*16], m1
4175    mova [rsp+gprsize+5*16], m2
4176    mova [rsp+gprsize+6*16], m3
4177    mova [rsp+gprsize+7*16], m4
4178    mova [rsp+gprsize+8*16], m5
4179    mova [rsp+gprsize+9*16], m6
4180    ; m7 is already stored in [rsp+gprsize+0*16]
    ; second set of inputs: rows at cq+1*64 and cq+3*64
4181    mova                 m0, [cq+1*64+ 0]
4182    mova                 m1, [cq+3*64+ 0]
4183    mova                 m2, [cq+1*64+16]
4184    mova                 m3, [cq+3*64+16]
4185    mova                 m4, [cq+1*64+32]
4186    mova                 m5, [cq+3*64+32]
4187    mova                 m6, [cq+1*64+48]
4188    mova                 m7, [cq+3*64+48]
4189    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
4190
4191    ; out0-7 is in rsp+gprsize+3-10*mmsize
4192    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
4193
    ; write rows 8-15 first, since they are the ones currently in registers
4194%if ARCH_X86_64
4195    lea                dstq, [r7+strideq*8]
4196%else
4197    mov                dstq, [rsp+2*gprsize+16*16]
4198    lea                dstq, [dstq+strideq*8]
4199%endif
4200    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
    ; then rewind dst to the top of the strip and write rows 0-7
4201%if ARCH_X86_64
4202    mov                dstq, r7
4203%else
4204    mov                dstq, [rsp+2*gprsize+16*16]
4205%endif
4206    mova                 m0, [rsp+gprsize+ 3*16]
4207    mova                 m1, [rsp+gprsize+ 4*16]
4208    mova                 m2, [rsp+gprsize+ 5*16]
4209    mova                 m3, [rsp+gprsize+ 6*16]
4210    mova                 m4, [rsp+gprsize+ 7*16]
4211    mova                 m5, [rsp+gprsize+ 8*16]
4212    mova                 m6, [rsp+gprsize+ 9*16]
4213    mova                 m7, [rsp+gprsize+10*16]
4214    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
    ; advance the saved dst base by 16 bytes for the next strip
4215%if ARCH_X86_64
4216    add                  r7, 16
4217%define mzero m9
4218%else
4219    add dword [rsp+2*gprsize+16*16], 16
4220%define mzero m7
4221    pxor                 m7, m7
4222%endif
    ; clear all 16 coefficient vectors consumed in this iteration: 8 before
    ; advancing cq and 8 after, addressed with negative indices
4223    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
4224    add                  cq, 64*4
4225    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
4226%undef mzero
4227    dec                 r4d
4228    jg .loop_pass2
4229%if WIN64
    ; restore the r7 value saved at function entry
4230    mov                  r7, [rsp+16*16+gprsize]
4231%endif
4232    RET
4233
; ---------------------------------------------------------------------------
; 16x16 inverse ADST, 16 bpc variant.
; INV_TXFM_16X16_FN is defined outside this excerpt; presumably each line
; below instantiates an entry point for one (adst, <second type>) pair that
; dispatches into iadst_16x16_internal_16bpc -- confirm at the macro.
; ---------------------------------------------------------------------------
4234INV_TXFM_16X16_FN adst, dct
4235INV_TXFM_16X16_FN adst, adst
4236INV_TXFM_16X16_FN adst, flipadst
4237
4238cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; Entry: stash r7 (WIN64) or r1 and r6 (x86-32) in the stack frame, then
    ; point t0 at this transform's .main and jump to the first-pass driver
    ; that lives in idct_16x16_internal_16bpc (presumably it calls through t0).
4239%if WIN64
4240    mov [rsp+16*16+gprsize], r7
4241%elif ARCH_X86_32
4242    mov [rsp+16*16+gprsize*1], r1
4243    mov [rsp+16*16+gprsize*2], r6
4244%endif
4245    lea                  t0, [o(.main)]
4246    jmp m(idct_16x16_internal_16bpc).pass1_full
4247
; .main: first-pass kernel. Reads 16 coefficient vectors from cq+row*64+r5,
; feeds them in two groups of eight to the iadst_16x4 helpers, then rounds
; the results via .round.
4248.main:
4249%if ARCH_X86_64
    ; constants kept in high registers; m14 is reused by .round below
4250    mova                m11, [o(pd_2048)]
4251    mova                m12, [o(clip_18b_min)]
4252    mova                m13, [o(clip_18b_max)]
4253    mova                m14, [o(pd_2896)]
4254%endif
    ; first group: rows 2, 13, 6, 9, 10, 5, 14, 1
4255    mova                 m0, [cq+ 2*64+r5]
4256    mova                 m1, [cq+13*64+r5]
4257    mova                 m2, [cq+ 6*64+r5]
4258    mova                 m3, [cq+ 9*64+r5]
4259    mova                 m4, [cq+10*64+r5]
4260    mova                 m5, [cq+ 5*64+r5]
4261    mova                 m6, [cq+14*64+r5]
4262    mova                 m7, [cq+ 1*64+r5]
4263    call m(iadst_16x4_internal_16bpc).main_part1
    ; second group: rows 0, 15, 4, 11, 8, 7, 12, 3
4264    mova                 m0, [cq+ 0*64+r5]
4265    mova                 m1, [cq+15*64+r5]
4266    mova                 m2, [cq+ 4*64+r5]
4267    mova                 m3, [cq+11*64+r5]
4268    mova                 m4, [cq+ 8*64+r5]
4269    mova                 m5, [cq+ 7*64+r5]
4270    mova                 m6, [cq+12*64+r5]
4271    mova                 m7, [cq+ 3*64+r5]
4272    call m(iadst_16x4_internal_16bpc).main_part2
4273    call .round
4274%if ARCH_X86_64
    ; narrow the 16 dword vectors m0-m15 into 8 word vectors with signed
    ; saturation; results end up in the even registers m0, m2, ..., m14.
    ; (On x86-32 the packing already happened inside .round.)
4275    packssdw             m0, m1
4276    packssdw             m2, m3
4277    packssdw             m4, m5
4278    packssdw             m6, m7
4279    packssdw             m8, m9
4280    packssdw            m10, m11
4281    packssdw            m12, m13
4282    packssdw            m14, m15
4283%endif
4284    ret
; .round: apply sign flips, rounding offsets and final shifts to the 16
; intermediate vectors. Part of the data arrives in registers, part in the
; scratch slots at r3+0*16 .. r3+3*16 (x86-64) or r3+0*16 .. r3+7*16 (x86-32).
4285.round:
4286%if ARCH_X86_64
4287    pcmpeqd              m8, m8         ; -1
4288    mova                m15, [o(pd_10240)]
    ; the "+2" below relies on m14 still holding pd_2896 (2896 >> 10 == 2);
    ; the helpers called before .round are outside this view -- confirm
4289    psrld               m14, 10         ; +2
4290    psubd               m13, m14, m8    ; +3
    ; pxor with all-ones is a bitwise NOT, i.e. ~x == -x-1. Combined with
    ; the adds below: m1/m3 become -x+2, m5/m7 become -x+10239.
4291    REPX     {pxor  x, m8 }, m1, m3, m5, m7
4292    REPX     {paddd x, m14}, m0, m2
4293    REPX     {paddd x, m13}, m1, m3
4294    REPX     {paddd x, m15}, m4, m5, m6, m7
4295    paddd               m13, m15, m8    ; +10239
    ; shift the values held in m9-m12 down into m8-m11 while adding
    ; (+10240) or negating (10239 - x) them; this frees m12-m15
4296    paddd                m8, m15, m9
4297    psubd                m9, m13, m10
4298    paddd               m10, m15, m11
4299    psubd               m11, m13, m12
    ; remaining four values come from the scratch slots: 2 + x or 2 - x
4300    paddd               m12, m14, [r3+3*16]
4301    psubd               m13, m14, [r3+2*16]
4302    psubd               m15, m14, [r3+0*16]
4303    paddd               m14, [r3+1*16]
    ; two different downshifts: >>2 for the outer eight, >>14 for the rest
4304    REPX      {psrad x, 2 }, m0,  m1,  m2,  m3,  m12, m13, m14, m15
4305    REPX      {psrad x, 14}, m4,  m5,  m6,  m7,  m8,  m9,  m10, m11
4306%else
    ; Only eight vector registers here: spill m1/m3 so that m1 and m3 can
    ; hold the all-ones mask and the rounding constant respectively.
4307    mova          [r3+8*16], m1
4308    mova          [r3+9*16], m3
4309    mova                 m3, [o(pd_10240)]
4310    pcmpeqd              m1, m1
    ; m4-m7: NOT the odd ones, add 10240, >>14, pack -> slots 10 and 11
4311    REPX      {pxor  x, m1}, m5, m7
4312    REPX      {paddd x, m3}, m4, m5, m6, m7
4313    REPX      {psrad x, 14}, m4, m5, m6, m7
4314    packssdw             m4, m5
4315    packssdw             m6, m7
4316    mova         [r3+10*16], m4
4317    mova         [r3+11*16], m6
    ; slots 4-7: +2 on the even ones, NOT then +3 on the odd ones, >>2
4318    mova                 m4, [r3+4*16]
4319    mova                 m5, [r3+5*16]
4320    mova                 m6, [r3+6*16]
4321    mova                 m7, [r3+7*16]
4322    mova                 m3, [o(pd_2)]
4323    REPX      {pxor  x, m1}, m5, m7
4324    REPX      {paddd x, m3}, m4, m6
    ; m3 = 2 - (-1) = 3
4325    psubd                m3, m1
4326    REPX      {paddd x, m3}, m5, m7
4327    REPX      {psrad x, 2 }, m4, m5, m6, m7
4328    packssdw             m4, m5
4329    packssdw             m6, m7
    ; bring back the spilled m1/m3 (now in m5/m7) and reuse slots 8/9 for
    ; the packed results just produced
4330    mova                 m5, [r3+8*16]
4331    mova                 m7, [r3+9*16]
4332    mova          [r3+8*16], m4
4333    mova          [r3+9*16], m6
    ; m0, old m1, m2, old m3: NOT the odd ones, add 10240, >>14, pack
4334    mova                 m3, [o(pd_10240)]
4335    REPX      {pxor  x, m1}, m5, m7
4336    REPX      {paddd x, m3}, m0, m5, m2, m7
4337    REPX      {psrad x, 14}, m0, m5, m2, m7
4338    packssdw             m0, m5
4339    packssdw             m2, m7
    ; slots 0-3: same +2 / NOT+3 / >>2 treatment, packed into m4 and m6
4340    mova                 m4, [r3+0*16]
4341    mova                 m5, [r3+1*16]
4342    mova                 m6, [r3+2*16]
4343    mova                 m7, [r3+3*16]
4344    mova                 m3, [o(pd_2)]
4345    REPX      {pxor  x, m1}, m5, m7
4346    REPX      {paddd x, m3}, m4, m6
4347    psubd                m3, m1
4348    REPX      {paddd x, m3}, m5, m7
4349    REPX      {psrad x, 2 }, m4, m5, m6, m7
4350    packssdw             m4, m5
4351    packssdw             m6, m7
4352%endif
4353    ret
; .pass2: second pass. Runs the 8 bpc SSSE3 iadst 16x8 kernel on the packed
; word data twice (r4d = 2), each iteration covering a 16-byte-wide column
; of the destination, and writes the rounded result into dst.
4354.pass2:
4355%if ARCH_X86_64
    ; constants for the write helpers; m9 doubles as the zero vector used
    ; to clear the coefficient buffer; r7 keeps the column's base dst
4356    mova                 m8, [o(pw_2048)]
4357    mova                m11, [o(pw_m2048)]
4358    pxor                 m9, m9
4359    mova                m10, [o(pixel_10bpc_max)]
4360    mov                  r7, dstq
4361%else
    ; no spare GPR: the base dst pointer lives in the stack frame instead
4362    mov [rsp+2*gprsize+16*16], dstq
4363%endif
4364    lea                  r3, [strideq*3]
4365    mov                 r4d, 2
4366.loop_pass2:
4367%if ARCH_X86_32
4368    lea                  r5, [o(itx8_start)]
4369%endif
    ; first eight input vectors go to stack slots 3..10
4370    mova                 m0, [cq+0*64+32]
4371    mova                 m1, [cq+1*64+32]
4372    mova                 m2, [cq+2*64+16]
4373    mova                 m3, [cq+3*64+16]
4374    mova                 m4, [cq+0*64+ 0]
4375    mova                 m5, [cq+1*64+ 0]
4376    mova                 m6, [cq+2*64+48]
4377    mova                 m7, [cq+3*64+48]
4378    mova [rsp+gprsize+3*16], m0
4379    mova [rsp+gprsize+4*16], m1
4380    mova [rsp+gprsize+5*16], m2
4381    mova [rsp+gprsize+6*16], m3
4382    mova [rsp+gprsize+7*16], m4
4383    mova [rsp+gprsize+8*16], m5
4384    mova [rsp+gprsize+9*16], m6
4385    mova [rsp+gprsize+10*16], m7
    ; the other eight stay in m0-m7 for the call
4386    mova                 m0, [cq+2*64+ 0]
4387    mova                 m1, [cq+3*64+ 0]
4388    mova                 m2, [cq+0*64+16]
4389    mova                 m3, [cq+1*64+16]
4390    mova                 m4, [cq+2*64+32]
4391    mova                 m5, [cq+3*64+32]
4392    mova                 m6, [cq+0*64+48]
4393    mova                 m7, [cq+1*64+48]
4394    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
4395    call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
4396
4397    ; out0-7 is in rsp+gprsize+3-10*mmsize
4398    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
4399
    ; write the half currently in registers eight rows below the base first
4400%if ARCH_X86_64
4401    lea                dstq, [r7+strideq*8]
4402%else
4403    mov                dstq, [rsp+2*gprsize+16*16]
4404    lea                dstq, [dstq+strideq*8]
4405%endif
4406    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
    ; then rewind dst to the base and write the half parked on the stack
4407%if ARCH_X86_64
4408    mov                dstq, r7
4409%else
4410    mov                dstq, [rsp+2*gprsize+16*16]
4411%endif
4412    mova                 m0, [rsp+gprsize+ 3*16]
4413    mova                 m1, [rsp+gprsize+ 4*16]
4414    mova                 m2, [rsp+gprsize+ 5*16]
4415    mova                 m3, [rsp+gprsize+ 6*16]
4416    mova                 m4, [rsp+gprsize+ 7*16]
4417    mova                 m5, [rsp+gprsize+ 8*16]
4418    mova                 m6, [rsp+gprsize+ 9*16]
4419    mova                 m7, [rsp+gprsize+10*16]
4420    call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
    ; advance the saved base dst by 16 bytes for the next column and pick
    ; the register that holds zero on this architecture
4421%if ARCH_X86_64
4422    add                  r7, 16
4423%define mzero m9
4424%else
4425    add dword [rsp+2*gprsize+16*16], 16
4426%define mzero m7
4427    pxor                 m7, m7
4428%endif
    ; clear the 256 coefficient bytes just consumed (16 vectors); cq is
    ; advanced halfway through, so the second half uses negative offsets
4429    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
4430    add                  cq, 64*4
4431    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
4432%undef mzero
4433    dec                 r4d
4434    jg .loop_pass2
4435%if WIN64
    ; restore the r7 value saved at function entry
4436    mov                  r7, [rsp+16*16+gprsize]
4437%endif
4438    RET
4439
; ---------------------------------------------------------------------------
; 16x16 inverse flipped ADST, 16 bpc variant. Implemented on top of the
; plain ADST: pass 1 reverses the ADST output order, pass 2 reuses the ADST
; pass 2 while walking the destination bottom-up.
; INV_TXFM_16X16_FN is defined outside this excerpt; presumably it creates
; the entry points for the listed type pairs -- confirm at the macro.
; ---------------------------------------------------------------------------
4440INV_TXFM_16X16_FN flipadst, dct
4441INV_TXFM_16X16_FN flipadst, adst
4442INV_TXFM_16X16_FN flipadst, flipadst
4443
4444cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; same entry sequence as the other 16x16 transforms: save r7 (WIN64) or
    ; r1/r6 (x86-32), set t0 to .main, jump to the shared first-pass driver
4445%if WIN64
4446    mov [rsp+16*16+gprsize], r7
4447%elif ARCH_X86_32
4448    mov [rsp+16*16+gprsize*1], r1
4449    mov [rsp+16*16+gprsize*2], r6
4450%endif
4451    lea                  t0, [o(.main)]
4452    jmp m(idct_16x16_internal_16bpc).pass1_full
4453
; .main: run the ADST first-pass kernel, then reverse the order of its eight
; packed outputs. pshufd with q1032 swaps the two 64-bit halves of a vector.
4454.main:
4455    call m(iadst_16x16_internal_16bpc).main
4456%if ARCH_X86_64
    ; results are in m0, m2, ..., m14: copy the lower four aside, then
    ; fill each slot from its mirror (m0<-m14, m2<-m12, ..., m14<-old m0)
4457    mova                 m1, m0
4458    mova                 m3, m2
4459    mova                 m5, m4
4460    mova                 m7, m6
4461    pshufd               m0, m14, q1032
4462    pshufd               m2, m12, q1032
4463    pshufd               m4, m10, q1032
4464    pshufd               m6, m8, q1032
4465    pshufd               m8, m7, q1032
4466    pshufd              m10, m5, q1032
4467    pshufd              m12, m3, q1032
4468    pshufd              m14, m1, q1032
4469%else
    ; results are in m0, m2, m4, m6 and in slots r3+8*16 .. r3+11*16:
    ; exchange registers with the mirrored slots, swapping halves on the way
4470    pshufd               m1, m0, q1032
4471    pshufd               m3, m2, q1032
4472    pshufd               m5, m4, q1032
4473    pshufd               m7, m6, q1032
4474    pshufd               m0, [r3+11*16], q1032
4475    pshufd               m2, [r3+10*16], q1032
4476    pshufd               m4, [r3+9*16], q1032
4477    pshufd               m6, [r3+8*16], q1032
4478    mova         [r3+11*16], m1
4479    mova         [r3+10*16], m3
4480    mova         [r3+ 9*16], m5
4481    mova         [r3+ 8*16], m7
4482%endif
4483    ret
4484
; .pass2: move dst to 15 rows below its current position (stride*3*5),
; negate the stride so that rows are written upwards, and continue in the
; regular ADST second pass.
4485.pass2:
4486    lea                  r3, [strideq*3]
4487    lea                  r3, [r3*5]
4488    add                dstq, r3
4489    neg             strideq
4490    jmp m(iadst_16x16_internal_16bpc).pass2
4491
; ---------------------------------------------------------------------------
; 16x16 identity transform, 16 bpc variant.
; INV_TXFM_16X16_FN is defined outside this excerpt; presumably it creates
; the entry points for the listed type pairs (the meaning of the extra "h"
; argument is not visible here) -- confirm at the macro.
; ---------------------------------------------------------------------------
4492INV_TXFM_16X16_FN identity, dct, h
4493INV_TXFM_16X16_FN identity, identity
4494
4495cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
    ; same entry sequence as the other 16x16 transforms: save r7 (WIN64) or
    ; r1/r6 (x86-32), set t0 to .main, jump to the shared first-pass driver
4496%if WIN64
4497    mov [rsp+16*16+gprsize], r7
4498%elif ARCH_X86_32
4499    mov [rsp+16*16+gprsize*1], r1
4500    mov [rsp+16*16+gprsize*2], r6
4501%endif
4502    lea                  t0, [o(.main)]
4503    jmp m(idct_16x16_internal_16bpc).pass1_full
4504
; .main: first-pass kernel. Each of the 16 dword vectors at cq+row*64+r5 is
; multiplied by pd_11586, biased by pd_10240, shifted right by 14 and then
; packed to words with signed saturation.
4505.main:
4506%if ARCH_X86_64
4507    mova                m15, [o(pd_11586)]
4508    pmulld               m0, m15, [cq+ 0*64+r5]
4509    pmulld               m1, m15, [cq+ 1*64+r5]
4510    pmulld               m2, m15, [cq+ 2*64+r5]
4511    pmulld               m3, m15, [cq+ 3*64+r5]
4512    pmulld               m4, m15, [cq+ 4*64+r5]
4513    pmulld               m5, m15, [cq+ 5*64+r5]
4514    pmulld               m6, m15, [cq+ 6*64+r5]
4515    pmulld               m7, m15, [cq+ 7*64+r5]
4516    pmulld               m8, m15, [cq+ 8*64+r5]
4517    pmulld               m9, m15, [cq+ 9*64+r5]
4518    pmulld              m10, m15, [cq+10*64+r5]
4519    pmulld              m11, m15, [cq+11*64+r5]
4520    pmulld              m12, m15, [cq+12*64+r5]
4521    pmulld              m13, m15, [cq+13*64+r5]
4522    pmulld              m14, m15, [cq+14*64+r5]
    ; m15 held the multiplier, so row 15 overwrites it last and is parked
    ; at [r3] while m15 carries the rounding constant
4523    pmulld              m15, [cq+15*64+r5]
4524    mova               [r3], m15
4525    mova                m15, [o(pd_10240)]
4526    REPX     {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
4527                         m8, m9, m10, m11, m12, m13, m14
4528    paddd               m15, [r3]
4529    REPX     {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
4530                         m8, m9, m10, m11, m12, m13, m14, m15
4531    packssdw             m0, m1
4532    packssdw             m2, m3
4533    packssdw             m4, m5
4534    packssdw             m6, m7
4535    packssdw             m8, m9
4536    packssdw            m10, m11
4537    packssdw            m12, m13
4538    packssdw            m14, m15
4539%else
    ; eight registers only: process rows 0-7 first, using m7 the way the
    ; 64-bit path uses m15, and park the packed result in slots 8..11
4540    mova                 m7, [o(pd_11586)]
4541    pmulld               m0, m7, [cq+ 0*64+r5]
4542    pmulld               m1, m7, [cq+ 1*64+r5]
4543    pmulld               m2, m7, [cq+ 2*64+r5]
4544    pmulld               m3, m7, [cq+ 3*64+r5]
4545    pmulld               m4, m7, [cq+ 4*64+r5]
4546    pmulld               m5, m7, [cq+ 5*64+r5]
4547    pmulld               m6, m7, [cq+ 6*64+r5]
4548    pmulld               m7, [cq+ 7*64+r5]
4549    mova               [r3], m7
4550    mova                 m7, [o(pd_10240)]
4551    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
4552    paddd                m7, [r3]
4553    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
4554    packssdw             m0, m1
4555    packssdw             m2, m3
4556    packssdw             m4, m5
4557    packssdw             m6, m7
4558    mova          [r3+8*16], m0
4559    mova          [r3+9*16], m2
4560    mova         [r3+10*16], m4
4561    mova         [r3+11*16], m6
    ; rows 8-15: same sequence, result stays in m0, m2, m4, m6
4562    mova                 m7, [o(pd_11586)]
4563    pmulld               m0, m7, [cq+ 8*64+r5]
4564    pmulld               m1, m7, [cq+ 9*64+r5]
4565    pmulld               m2, m7, [cq+10*64+r5]
4566    pmulld               m3, m7, [cq+11*64+r5]
4567    pmulld               m4, m7, [cq+12*64+r5]
4568    pmulld               m5, m7, [cq+13*64+r5]
4569    pmulld               m6, m7, [cq+14*64+r5]
4570    pmulld               m7, [cq+15*64+r5]
4571    mova               [r3], m7
4572    mova                 m7, [o(pd_10240)]
4573    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
4574    paddd                m7, [r3]
4575    REPX      {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
4576    packssdw             m0, m1
4577    packssdw             m2, m3
4578    packssdw             m4, m5
4579    packssdw             m6, m7
4580%endif
4581    ret
4582
; .pass2: second pass. Processes the block as two 16-byte-wide columns of
; four 8x4 tiles each. r5w counts the tiles of the current column; bit 16
; of r5d is a "second column in progress" flag toggled with btc.
4583.pass2:
4584%if ARCH_X86_64
    ; constants for the helpers called in the loop; m6 is the zero vector
4585    mova                 m4, [o(pw_2048)]
4586    mova                 m5, [o(pixel_10bpc_max)]
4587    pxor                 m6, m6
4588    mova                 m7, [o(pw_1697x16)]
4589    mov                  r7, dstq
4590%else
    ; the base dst pointer is kept in the stack frame instead of r7
4591    mov [rsp+2*gprsize+16*16], dstq
4592%endif
4593    mov                 r5d, 4
4594    lea                  r3, [strideq*3]
4595.pass2_loop:
4596    mova                 m0, [cq+0*64+0]
4597    mova                 m1, [cq+1*64+0]
4598    mova                 m2, [cq+2*64+0]
4599    mova                 m3, [cq+3*64+0]
4600    call m(iidentity_8x16_internal_16bpc).main
4601%if ARCH_X86_64
4602    call m(idct_8x4_internal_16bpc).round1_and_write_8x4
4603%else
4604    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
4605%endif
    ; clear the four vectors just consumed (byte offsets 0, 64, 128, 192).
    ; NOTE(review): on x86-32 m6 is not initialised in this block;
    ; presumably round2_and_write_8x4 leaves it zeroed -- confirm.
4606    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
4607    add                  cq, 16
4608    lea                dstq, [dstq+strideq*4]
    ; 16-bit decrement: leaves the column flag in bit 16 untouched
4609    dec                 r5w
4610    jg .pass2_loop
    ; four tiles advanced cq by 64; skip the remaining 192 bytes
4611    add                  cq, 64*3
    ; btc returns the old flag in CF: clear -> start the second column,
    ; set -> both columns are done
4612    btc                 r5d, 16
4613    jc .end
4614%if ARCH_X86_64
4615    lea                dstq, [r7+16]
4616%else
4617    mov                dstq, [rsp+2*gprsize+16*16]
4618    add                dstq, 16
4619%endif
    ; r5w is 0 here, so this re-arms the tile counter to 4
4620    add                 r5d, 4
4621    jmp .pass2_loop
4622.end:
4623%if WIN64
    ; restore the r7 value saved at function entry
4624    mov                  r7, [rsp+16*16+gprsize]
4625%endif
4626    RET
4627
; ---------------------------------------------------------------------------
; identity/identity inverse transform + add for 8x32 blocks, 16 bpc variant.
; Arguments (from the cglobal line): dst, stride, c (coefficients), eob.
; The .main / .main_zero tails are also called by the other
; identity_identity functions in this file.
; ---------------------------------------------------------------------------
4628cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
4629%if ARCH_X86_32
    ; r6 = section base, presumably what the o() addressing macro is
    ; relative to on x86-32 (macro defined outside this excerpt)
4630    LEA                  r6, $$
4631%endif
    ; m5 = rounding term, m7 = upper clamp, m6 = zero (lower clamp and
    ; the value used to clear the coefficient buffer)
4632    mova                 m5, [o(pw_5)]
4633    mova                 m7, [o(pixel_10bpc_max)]
4634    pxor                 m6, m6
    ; add 21 to the low byte of eob only; if that byte overflows (CF set),
    ; cmovc restores the untouched copy from r5d
4635    mov                 r5d, eobd
4636    add                eobb, 21
4637    cmovc              eobd, r5d ; 43, 107, 171 -> 64, 128, 192
4638    lea                  r4, [strideq*3]
4639.loop:
    ; gather 8 dword vectors (128 bytes apart) and pack them into 4 word
    ; vectors with signed saturation
4640    mova                 m0, [cq+128*0]
4641    packssdw             m0, [cq+128*1]
4642    mova                 m1, [cq+128*2]
4643    packssdw             m1, [cq+128*3]
4644    mova                 m2, [cq+128*4]
4645    packssdw             m2, [cq+128*5]
4646    mova                 m3, [cq+128*6]
4647    packssdw             m3, [cq+128*7]
    ; add m5 (pw_5) with saturation, then arithmetic shift right by 3
4648    REPX     {paddsw x, m5}, m0, m1, m2, m3
4649    REPX     {psraw  x, 3 }, m0, m1, m2, m3
4650    call .main_zero
4651    add                  cq, 16
4652    lea                dstq, [dstq+strideq*4]
    ; bit 16 of eobd alternates on every iteration, so the inner branch is
    ; taken every other time; the eob countdown below therefore runs once
    ; per two iterations (8 destination rows)
4653    btc                eobd, 16
4654    jnc .loop
4655    sub                eobd, 64
4656    jge .loop
4657    RET
4658ALIGN function_align
; .main_zero: clear the 8 source vectors (128-byte spacing) using the zero
; in m6, then fall through into .main.
4659.main_zero:
4660    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
; .main: rearrange m0-m3 with word and qword interleaves so that each
; register lines up with one destination row, add the four 16-byte rows at
; dst (stride*0..stride*3, r4 = stride*3), clamp to [m6, m7] and store.
; Clobbers m4. Expects m6 = 0 and m7 = pixel maximum.
4661.main:
4662    punpckhwd            m4, m0, m1
4663    punpcklwd            m0, m1
4664    punpckhwd            m1, m2, m3
4665    punpcklwd            m2, m3
4666    punpckhwd            m3, m0, m4
4667    punpcklwd            m0, m4
4668    punpckhwd            m4, m2, m1
4669    punpcklwd            m2, m1
4670    punpckhqdq           m1, m0, m2
4671    punpcklqdq           m0, m2
4672    punpcklqdq           m2, m3, m4
4673    punpckhqdq           m3, m4
    ; residual + prediction
4674    paddw                m0, [dstq+strideq*0]
4675    paddw                m1, [dstq+strideq*1]
4676    paddw                m2, [dstq+strideq*2]
4677    paddw                m3, [dstq+r4       ]
    ; clamp to the valid pixel range
4678    REPX     {pmaxsw x, m6}, m0, m1, m2, m3
4679    REPX     {pminsw x, m7}, m0, m1, m2, m3
4680    mova   [dstq+strideq*0], m0
4681    mova   [dstq+strideq*1], m1
4682    mova   [dstq+strideq*2], m2
4683    mova   [dstq+r4       ], m3
4684    ret
4685
; ---------------------------------------------------------------------------
; identity/identity inverse transform + add for 32x8 blocks, 16 bpc variant.
; Arguments (from the cglobal line): dst, stride, c (coefficients), eob.
; Works in 16-byte-wide columns of two 4-row tiles each, reusing the
; transpose/add/clamp/store tail of the 8x32 function.
; ---------------------------------------------------------------------------
4686cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
4687%if ARCH_X86_32
4688    LEA                  r6, $$
4689%endif
    ; m5 = pmulhrsw scale factor, m7 = upper clamp, m6 = zero
4690    mova                 m5, [o(pw_4096)]
4691    mova                 m7, [o(pixel_10bpc_max)]
4692    pxor                 m6, m6
    ; same eob adjustment as in the 8x32 function: +21 on the low byte,
    ; undone by cmovc if that byte overflows (r4d is only a temporary here)
4693    mov                 r4d, eobd
4694    add                eobb, 21
4695    cmovc              eobd, r4d
4696    lea                  r4, [strideq*3]
    ; r5 remembers the top of the current column
4697    mov                  r5, dstq
4698.loop:
    ; gather 8 dword vectors (32 bytes apart), pack to 4 word vectors
4699    mova                 m0, [cq+32*0]
4700    packssdw             m0, [cq+32*1]
4701    mova                 m1, [cq+32*2]
4702    packssdw             m1, [cq+32*3]
4703    mova                 m2, [cq+32*4]
4704    packssdw             m2, [cq+32*5]
4705    mova                 m3, [cq+32*6]
4706    packssdw             m3, [cq+32*7]
    ; clear the consumed coefficients
4707    REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
    ; assuming pw_4096 is the word constant 4096, pmulhrsw by it equals a
    ; rounded shift right by 3, i.e. (x + 4) >> 3
4708    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
4709    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
4710    lea                dstq, [dstq+strideq*4]
4711    add                  cq, 16
    ; bit 16 of eobd alternates: two tiles (8 rows) per column
4712    btc                eobd, 16
4713    jnc .loop
    ; next column: skip to the next group of 8 source vectors (the two
    ; tiles already advanced cq by 32) and move dst 16 bytes to the right
4714    add                  cq, 32*8-32
4715    add                  r5, 16
4716    mov                dstq, r5
4717    sub                eobd, 64
4718    jge .loop
4719    RET
4720
; ---------------------------------------------------------------------------
; identity/identity inverse transform + add for 16x32 blocks, 16 bpc variant.
; Arguments (from the cglobal line): dst, stride, c (coefficients), eob.
; The body is an unrolled sequence of .main calls, each handling one
; 8-row by 16-byte tile pair, with early exits driven by eob.
; ---------------------------------------------------------------------------
4721cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
4722%if ARCH_X86_32
4723    LEA                  r6, $$
4724%else
    ; x86-64 keeps the scale constants resident in m8, m9, m11 (and m10
    ; below); x86-32 reloads them inside .main
4725    mova                 m8, [o(pw_2896x8)]
4726    mova                 m9, [o(pw_1697x16)]
4727    mova                m11, [o(pw_8192)]
4728%endif
4729    mova                 m7, [o(pixel_10bpc_max)]
4730    lea                  r4, [strideq*3]
4731    pxor                 m6, m6
4732%if ARCH_X86_64
4733    paddw               m10, m11, m11 ; pw_16384
4734%endif
    ; r5 tracks the left edge of the current band of 8 destination rows
4735    mov                  r5, dstq
4736    call .main
    ; eob is consumed destructively: each sub lowers it by the distance to
    ; the next threshold, and a negative result ends the function early
4737    sub                eobd, 36
4738    jl .ret
    ; right-hand tile of the same band: next group of source vectors
    ; (.main already advanced cq by 32), dst 16 bytes to the right
4739    add                  cq, 128*8-32
4740    lea                dstq, [r5+16]
4741    call .main
    ; next band: back to the first source group, dst 8 rows further down
4742    sub                  cq, 128*8
4743    lea                dstq, [r5+strideq*8]
4744    mov                  r5, dstq
4745    call .main
4746    sub                eobd, 107 ; eob < 143
4747    jl .ret
4748    add                  cq, 128*8-32
4749    lea                dstq, [r5+16]
4750    call .main
4751    sub                  cq, 128*8
4752    lea                dstq, [r5+strideq*8]
4753    mov                  r5, dstq
4754    call .main
4755    sub                eobd, 128 ; eob < 271
4756    jl .ret
4757    add                  cq, 128*8-32
4758    lea                dstq, [r5+16]
4759    call .main
4760    sub                  cq, 128*8
4761    lea                dstq, [r5+strideq*8]
4762    mov                  r5, dstq
4763    call .main
4764    sub                eobd, 128 ; eob < 399
4765    jl .ret
4766    add                  cq, 128*8-32
4767    lea                dstq, [r5+16]
4768    call .main
4769.ret:
4770    RET
4771ALIGN function_align
; .main: load and pack 8 source vectors, apply the scaling chain, then hand
; over to the 8x32 .main_zero tail, which clears the source and adds the
; result into dst. Runs twice per call (bit 16 of eobd is the toggle), so
; one call advances dst by 8 rows and cq by 32 bytes.
4772.main:
4773    mova                 m0, [cq+128*0]
4774    packssdw             m0, [cq+128*1]
4775    mova                 m1, [cq+128*2]
4776    packssdw             m1, [cq+128*3]
4777    mova                 m2, [cq+128*4]
4778    packssdw             m2, [cq+128*5]
4779    mova                 m3, [cq+128*6]
4780    packssdw             m3, [cq+128*7]
    ; step 1: pmulhrsw by pw_2896x8 on all four vectors.
    ; step 2 (m0/m1): t = pmulhrsw(x, pw_1697x16), t = pmulhrsw(t, pw_16384),
    ; later x += t with saturation. Assuming pw_16384 is the word constant
    ; 16384, the second multiply is a rounded halving of t.
4781%if ARCH_X86_64
4782    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
4783    pmulhrsw             m4, m9, m0
4784    pmulhrsw             m5, m9, m1
4785    REPX  {pmulhrsw x, m10}, m4, m5
4786%else
    ; x86-32: m6 (normally zero) is borrowed for constants, restored below
4787    mova                 m6, [o(pw_2896x8)]
4788    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4789    mova                 m5, [o(pw_1697x16)]
4790    pmulhrsw             m4, m5, m0
4791    pmulhrsw             m5, m1
4792    mova                 m6, [o(pw_16384)]
4793    REPX  {pmulhrsw x, m6 }, m4, m5
4794%endif
4795    paddsw               m0, m4
4796    paddsw               m1, m5
    ; step 2 again for m2/m3
4797%if ARCH_X86_64
4798    pmulhrsw             m4, m9, m2
4799    pmulhrsw             m5, m9, m3
4800    REPX  {pmulhrsw x, m10}, m4, m5
4801%else
    ; m6 still holds pw_16384 from above
4802    mova                 m5, [o(pw_1697x16)]
4803    pmulhrsw             m4, m5, m2
4804    pmulhrsw             m5, m3
4805    REPX  {pmulhrsw x, m6 }, m4, m5
4806%endif
4807    paddsw               m2, m4
4808    paddsw               m3, m5
    ; step 3: final pmulhrsw by pw_8192 (a rounded >> 2 if the constant is
    ; the word value 8192)
4809%if ARCH_X86_64
4810    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3
4811%else
4812    psrlw                m6, 1          ; pw_8192
4813    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
    ; m6 must be zero again for the shared tail (clear + lower clamp)
4814    pxor                 m6, m6
4815%endif
4816    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
4817    lea                dstq, [dstq+strideq*4]
4818    add                  cq, 16
4819    btc                eobd, 16
4820    jnc .main
4821    ret
4822
; ---------------------------------------------------------------------------
; identity/identity inverse transform + add for 32x16 blocks, 16 bpc variant.
; Arguments (from the cglobal line): dst, stride, c (coefficients), eob.
; The body is an unrolled sequence of .main calls walking four 16-byte-wide
; columns, with early exits driven by eob.
; ---------------------------------------------------------------------------
4823cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
4824%if ARCH_X86_32
4825    LEA                  r6, $$
4826%else
    ; x86-64 keeps the scale constants resident; x86-32 reloads them
    ; inside .main
4827    mova                 m8, [o(pw_2896x8)]
4828    mova                 m9, [o(pw_1697x16)]
4829    mova                m10, [o(pw_2048)]
4830%endif
4831    mova                 m7, [o(pixel_10bpc_max)]
4832    lea                  r4, [strideq*3]
4833    pxor                 m6, m6
    ; r5 keeps the original dst; column starts are r5 + 16*k
4834    mov                  r5, dstq
    ; column 0, upper 8 rows
4835    call .main
    ; eob is consumed destructively; a negative result ends the function
4836    sub                eobd, 36
4837    jl .ret
    ; column 0, lower 8 rows (dst and cq simply continue from the call above)
4838    call .main
    ; column 1: the two calls advanced cq by 64, so this lands on the next
    ; group of 8 source vectors
4839    add                  cq, 64*8-64
4840    lea                dstq, [r5+16*1]
4841    call .main
4842    sub                eobd, 107 ; eob < 143
4843    jl .ret
4844    call .main
    ; column 2
4845    add                  cq, 64*8-64
4846    lea                dstq, [r5+16*2]
4847    call .main
4848    sub                eobd, 128 ; eob < 271
4849    jl .ret
4850    call .main
    ; column 3
4851    add                  cq, 64*8-64
4852    lea                dstq, [r5+16*3]
4853    call .main
4854    sub                eobd, 128 ; eob < 399
4855    jl .ret
4856    call .main
4857.ret:
4858    RET
4859ALIGN function_align
; .main: load and pack 8 source vectors (64 bytes apart), apply the scaling
; chain, clear the source, then add into dst via the 8x32 .main tail. Runs
; twice per call (bit 16 of eobd is the toggle), so one call advances dst
; by 8 rows and cq by 32 bytes.
4860.main:
4861    mova                 m0, [cq+64*0]
4862    packssdw             m0, [cq+64*1]
4863    mova                 m1, [cq+64*2]
4864    packssdw             m1, [cq+64*3]
4865    mova                 m2, [cq+64*4]
4866    packssdw             m2, [cq+64*5]
4867    mova                 m3, [cq+64*6]
4868    packssdw             m3, [cq+64*7]
    ; step 1: pmulhrsw by pw_2896x8, then double with saturation
4869%if ARCH_X86_64
4870    REPX  {pmulhrsw x, m8 }, m0, m1, m2, m3
4871%else
    ; x86-32: m6 (normally zero) is borrowed for constants, restored below
4872    mova                 m6, [o(pw_2896x8)]
4873    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
4874%endif
4875    REPX  {paddsw   x, x  }, m0, m1, m2, m3
    ; step 2 (m0/m1): t = pmulhrsw(x, pw_1697x16); x = 2*x + t (saturating)
4876%if ARCH_X86_64
4877    pmulhrsw             m4, m9, m0
4878    pmulhrsw             m5, m9, m1
4879%else
4880    mova                 m6, [o(pw_1697x16)]
4881    pmulhrsw             m4, m6, m0
4882    pmulhrsw             m5, m6, m1
4883%endif
4884    REPX  {paddsw   x, x  }, m0, m1
4885    paddsw               m0, m4
4886    paddsw               m1, m5
    ; step 2 again for m2/m3; on x86-32 the product for m3 is written over
    ; the constant in m6 itself
4887%if ARCH_X86_64
4888    pmulhrsw             m4, m9, m2
4889    pmulhrsw             m5, m9, m3
4890%else
4891    pmulhrsw             m4, m6, m2
4892    pmulhrsw             m6, m3
4893%endif
4894    REPX  {paddsw   x, x  }, m2, m3
4895    paddsw               m2, m4
    ; step 3: final pmulhrsw by pw_2048
4896%if ARCH_X86_64
4897    paddsw               m3, m5
4898    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3
4899%else
4900    paddsw               m3, m6
4901    mova                 m6, [o(pw_2048)]
4902    REPX  {pmulhrsw x, m6 }, m0, m1, m2, m3
    ; m6 must be zero again for the clear below and the clamp in the tail
4903    pxor                 m6, m6
4904%endif
    ; clear the consumed coefficients (the 8x32 .main_zero tail cannot be
    ; used here because it assumes a 128-byte spacing)
4905    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
4906    call m(inv_txfm_add_identity_identity_8x32_16bpc).main
4907    lea                dstq, [dstq+strideq*4]
4908    add                  cq, 16
4909    btc                eobd, 16
4910    jnc .main
4911    ret
4912
; ---------------------------------------------------------------------------
; identity/identity inverse transform + add for 32x32 blocks, 16 bpc variant.
; Arguments (from the cglobal line): dst, stride, c (coefficients), eob.
; The block is handled as 8-row by 16-byte tiles. The unrolled body visits
; them one anti-diagonal at a time: .main does the first tile of a diagonal
; and each following .main2 steps to the next one (8 rows down, 16 bytes to
; the left). Between diagonals eob is compared against a fixed threshold,
; and the function returns as soon as eob is below it.
; The digit columns in the existing trailing comments look like a map of
; tile indices per diagonal -- they are not derived from the code here.
; ---------------------------------------------------------------------------
4913cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
4914%undef cmp
4915%if ARCH_X86_32
4916    LEA                  r6, $$
4917%endif
    ; m5 = pmulhrsw scale factor, m7 = upper clamp, m6 = zero
4918    mova                 m5, [o(pw_8192)]
4919    mova                 m7, [o(pixel_10bpc_max)]
4920    pxor                 m6, m6
4921    lea                  r4, [strideq*3]
    ; r5 is the anchor from which each diagonal's starting dst is derived
4922    mov                  r5, dstq
4923    call .main                              ; 0
    ; unlike the 16x32/32x16 versions, eob is compared, not consumed
4924    cmp                eobd, 36
4925    jl .ret
4926    add                  cq, 128*8-32       ; 0 1
4927    lea                dstq, [r5+16]        ; 1
4928    call .main
4929    call .main2
4930    cmp                eobd, 136
4931    jl .ret
4932    add                  cq, 128*16-64      ; 0 1 2
4933    lea                dstq, [r5+16*2]      ; 1 2
4934    call .main                              ; 2
4935    call .main2
4936    call .main2
4937    cmp                eobd, 300
4938    jl .ret
    ; longest diagonal: from here on r5 itself is moved, first to the
    ; right-most column, then down by 8 rows per remaining diagonal
4939    add                  cq, 128*24-96      ; 0 1 2 3
4940    add                  r5, 16*3           ; 1 2 3
4941    mov                dstq, r5             ; 2 3
4942    call .main                              ; 3
4943    call .main2
4944    call .main2
4945    call .main2
4946    cmp                eobd, 535
4947    jl .ret
4948    add                  cq, 128*24-96      ; 0 1 2 3
4949    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4950    mov                  r5, dstq           ; 2 3 4
4951    call .main                              ; 3 4
4952    call .main2
4953    call .main2
4954    cmp                eobd, 755
4955    jl .ret
4956    add                  cq, 128*16-64      ; 0 1 2 3
4957    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4958    mov                  r5, dstq           ; 2 3 4 5
4959    call .main                              ; 3 4 5
4960    call .main2
4961    cmp                eobd, 911
4962    jl .ret
4963    add                  cq, 128*8-32       ; 0 1 2 3
4964    lea                dstq, [r5+strideq*8] ; 1 2 3 4
4965    call .main                              ; 2 3 4 5
4966.ret:                                       ; 3 4 5 6
4967    RET
4968ALIGN function_align
; .main2: step from the tile just finished to the next one on the same
; diagonal (cq back by 128*8 bytes, dst 16 bytes to the left; dst has
; already moved 8 rows down inside .main), then fall through into .main.
4969.main2:
4970    sub                  cq, 128*8
4971    sub                dstq, 16
; .main: load and pack 8 source vectors, scale with pmulhrsw by m5
; (pw_8192; a rounded >> 2 if that is the word value 8192), then clear the
; source and add into dst via the 8x32 .main_zero tail. Runs twice per call
; (bit 16 of eobd is the toggle): dst advances 8 rows, cq by 32 bytes.
4972.main:
4973    mova                 m0, [cq+128*0]
4974    packssdw             m0, [cq+128*1]
4975    mova                 m1, [cq+128*2]
4976    packssdw             m1, [cq+128*3]
4977    mova                 m2, [cq+128*4]
4978    packssdw             m2, [cq+128*5]
4979    mova                 m3, [cq+128*6]
4980    packssdw             m3, [cq+128*7]
4981    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
4982    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
4983    lea                dstq, [dstq+strideq*4]
4984    add                  cq, 16
4985    btc                eobd, 16
4986    jnc .main
4987    ret
4988
4989cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
4990                                         dst, stride, c, eob
4991%if ARCH_X86_32
4992    LEA                  r6, $$
4993%define base $$
4994    DECLARE_REG_TMP       0, 4
4995%else
4996    lea                  r6, [tbl_Nx32_odd_offset]
4997%define base tbl_Nx32_odd_offset
4998    DECLARE_REG_TMP       4, 7
4999%if WIN64
5000    mov [rsp+gprsize*1+35*16], r7
5001%endif
5002%endif
5003%define o2(x) r6-base+x
5004    test               eobd, eobd
5005    jz .dconly
5006
5007%if ARCH_X86_32
5008    mov [rsp+gprsize*1+35*16], r0
5009%endif
5010%undef cmp
5011    ; remove entirely-zero iterations
5012    mov                 r5d, 7*2
5013    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
5014    jge .end_zero_loop
5015    pxor                 m0, m0
5016.zero_loop:
5017    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5018    movzx               t1d, t0b
5019    shr                 t0d, 8
5020    mova   [rsp+ 3*16+r5*8], m0
5021    mova   [rsp+11*16+r5*8], m0
5022    mova   [rsp+ 3*16+t0*8], m0
5023    mova   [rsp+ 3*16+t1*8], m0
5024    sub                 r5d, 2
5025    cmp                eobw, word [o2(tbl_8x32_2d)+r5]
5026    jl .zero_loop
5027.end_zero_loop:
5028    ; actual first pass after skipping all-zero data
5029    mov [rsp+gprsize*0+35*16], eobd
5030    mov                  r3, rsp
5031.loop_pass1:
5032%if ARCH_X86_64
5033    mova                m11, [o(pd_2048)]
5034    mova                m12, [o(clip_18b_min)]
5035    mova                m13, [o(clip_18b_max)]
5036    mova                m14, [o(pd_2896)]
5037%endif
5038    mova                 m0, [cq+0*128+r5*8]
5039    mova                 m1, [cq+1*128+r5*8]
5040    mova                 m2, [cq+2*128+r5*8]
5041    mova                 m3, [cq+3*128+r5*8]
5042    mova                 m4, [cq+4*128+r5*8]
5043    mova                 m5, [cq+5*128+r5*8]
5044    mova                 m6, [cq+6*128+r5*8]
5045    mova                 m7, [cq+7*128+r5*8]
5046    call m(idct_8x4_internal_16bpc).main_pass1
5047    mova                 m1, [o(pd_2)]
5048    REPX      {paddd x, m1}, m0, m6, m5, m3
5049    call m(idct_8x4_internal_16bpc).round
5050    REPX      {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
5051    packssdw             m0, m1
5052    packssdw             m2, m3
5053    packssdw             m4, m5
5054    packssdw             m6, m7
5055    call m(idct_8x4_internal_16bpc).transpose4x8packed
5056
5057    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5058    movzx               t1d, t0b
5059    shr                 t0d, 8
5060    mova    [r3+ 3*16+r5*8], m0
5061    mova    [r3+11*16+r5*8], m2
5062    mova    [r3+ 3*16+t1*8], m1
5063    mova    [r3+ 3*16+t0*8], m3
5064    pxor                 m7, m7
5065    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
5066    sub                 r5d, 2
5067    jge .loop_pass1
5068
5069    ; pass 2 code starts here
5070    ; m0 is already loaded from last iteration of first pass
5071%if ARCH_X86_32
5072    mov                  r0, [rsp+gprsize*1+35*16]
5073%endif
5074    mov                eobd, [rsp+gprsize*0+35*16]
5075    cmp                eobd, 43
5076    jl .load_veryfast
5077    cmp                eobd, 107
5078    jl .load_fast
5079    ; load normal
5080    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
5081    jmp .run
5082.load_fast:
5083    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
5084    jmp .run
5085.load_veryfast:
5086    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
5087    ; fall-through
5088.run:
5089    call .pass2
5090%if WIN64
5091    mov                  r7, [rsp+gprsize*1+35*16]
5092%endif
5093    RET
5094
; .pass2: second pass for one 8-pixel-wide column strip. It is reached with
; `call` (also from the 16x32 function), which is why every stack slot below is
; addressed with an extra +gprsize: that steps over the pushed return address.
; In:  m0            = contents of 16-byte stack slot 3 (callers preload it)
;      r4            = address of the stage to run via `call r4`; the callers
;                      pick the .main / .main_fast / .main_veryfast entry of
;                      idct_8x32_internal_8bpc from eob
;      dstq, strideq = destination pointer and stride
; Out: four calls to round1_and_write_8x8, with dstq advanced by 8 rows
;      between them (dstq ends up 24 rows below its entry value).
; Clobbers r3 (set to strideq*3); on x86-32 also r5 (set to itx8_start).
5095.pass2:
5096%if ARCH_X86_32
5097    lea                  r5, [o(itx8_start)]
5098%endif
        ; slots 4..10 join m0 (slot 3) as input of the first 8bpc helper
5099    mova                 m1, [rsp+gprsize+16* 4]
5100    mova                 m2, [rsp+gprsize+16* 5]
5101    mova                 m3, [rsp+gprsize+16* 6]
5102    mova                 m4, [rsp+gprsize+16* 7]
5103    mova                 m5, [rsp+gprsize+16* 8]
5104    mova                 m6, [rsp+gprsize+16* 9]
5105    mova                 m7, [rsp+gprsize+16*10]
5106    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
        ; m0-m6 are written back to slots 3..9 (m7 is not stored here)
5107    mova [rsp+gprsize+ 3*16], m0
5108    mova [rsp+gprsize+ 4*16], m1
5109    mova [rsp+gprsize+ 5*16], m2
5110    mova [rsp+gprsize+ 6*16], m3
5111    mova [rsp+gprsize+ 7*16], m4
5112    mova [rsp+gprsize+ 8*16], m5
5113    mova [rsp+gprsize+ 9*16], m6
        ; slots 11..18 are the input of the second 8bpc helper
5114    mova                 m0, [rsp+gprsize+11*16]
5115    mova                 m1, [rsp+gprsize+12*16]
5116    mova                 m2, [rsp+gprsize+13*16]
5117    mova                 m3, [rsp+gprsize+14*16]
5118    mova                 m4, [rsp+gprsize+15*16]
5119    mova                 m5, [rsp+gprsize+16*16]
5120    mova                 m6, [rsp+gprsize+17*16]
5121    mova                 m7, [rsp+gprsize+18*16]
5122    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
        ; NOTE(review): m7 is taken from slot 0 instead of the register left by
        ; the helper; presumably the helper parks that row there. Confirm in
        ; the 8bpc code.
5123    mova                 m7, [rsp+gprsize+ 0*16]
5124    mova [rsp+gprsize+11*16], m0
5125    mova [rsp+gprsize+12*16], m1
5126    mova [rsp+gprsize+13*16], m2
5127    mova [rsp+gprsize+14*16], m3
5128    mova [rsp+gprsize+15*16], m4
5129    mova [rsp+gprsize+16*16], m5
5130    mova [rsp+gprsize+17*16], m6
5131    mova [rsp+gprsize+18*16], m7
5132    call                 r4 ; caller-selected stage (see the In: list above)
5133%if ARCH_X86_64
        ; NOTE(review): m8-m10 are presumably the rounding constant, zero and
        ; pixel maximum consumed by round1_and_write_8x8. Confirm there.
5134    mova                 m8, [o(pw_2048)]
5135    pxor                 m9, m9
5136    mova                m10, [o(pixel_10bpc_max)]
5137%endif
5138    lea                  r3, [strideq*3] ; r3 = 3*stride
        ; first 8 rows: m0-m7 exactly as left by the call through r4
5139    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
5140    lea                dstq, [dstq+strideq*8]
        ; next 8 rows: reloaded from slots 11..18
5141    mova                 m0, [rsp+gprsize+11*16]
5142    mova                 m1, [rsp+gprsize+12*16]
5143    mova                 m2, [rsp+gprsize+13*16]
5144    mova                 m3, [rsp+gprsize+14*16]
5145    mova                 m4, [rsp+gprsize+15*16]
5146    mova                 m5, [rsp+gprsize+16*16]
5147    mova                 m6, [rsp+gprsize+17*16]
5148    mova                 m7, [rsp+gprsize+18*16]
5149    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
5150    lea                dstq, [dstq+strideq*8]
        ; next 8 rows: reloaded from slots 19..26
5151    mova                 m0, [rsp+gprsize+19*16]
5152    mova                 m1, [rsp+gprsize+20*16]
5153    mova                 m2, [rsp+gprsize+21*16]
5154    mova                 m3, [rsp+gprsize+22*16]
5155    mova                 m4, [rsp+gprsize+23*16]
5156    mova                 m5, [rsp+gprsize+24*16]
5157    mova                 m6, [rsp+gprsize+25*16]
5158    mova                 m7, [rsp+gprsize+26*16]
5159    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
5160    lea                dstq, [dstq+strideq*8]
        ; last 8 rows: reloaded from slots 27..34
5161    mova                 m0, [rsp+gprsize+27*16]
5162    mova                 m1, [rsp+gprsize+28*16]
5163    mova                 m2, [rsp+gprsize+29*16]
5164    mova                 m3, [rsp+gprsize+30*16]
5165    mova                 m4, [rsp+gprsize+31*16]
5166    mova                 m5, [rsp+gprsize+32*16]
5167    mova                 m6, [rsp+gprsize+33*16]
5168    mova                 m7, [rsp+gprsize+34*16]
5169    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
5170    ret
; DC-only shortcut: r5d = ([cq] * 181 + 640) >> 10, the coefficient at [cq] is
; overwritten with eobd, and the work is finished by the .end2 tail of the 8x8
; dct_dct function.
5171.dconly:
5172    imul                r5d, [cq], 181
5173    mov                [cq], eobd ; 0
5174    mov                 r3d, 8 ; NOTE(review): looks like the loop count used by the shared tail; confirm at .end2
5175    add                 r5d, 640
5176    sar                 r5d, 10
5177    add                 rsp, (31+2*ARCH_X86_64)*16 ; NOTE(review): presumably moves rsp to the frame layout the 8x8 function's epilogue expects; confirm against both cglobal headers
5178    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
5178    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
5179
; inv_txfm_add_dct_dct_16x32_16bpc(dst, stride, c, eob)
;
; Pass 1 walks r5 downwards in steps of 2. Each iteration reads 16 vectors
; from [cq+k*128+r5*8] (k = 0..15), transforms them, transposes the packed
; result and stores it into two stack regions based at slots 12 and 44
; (16-byte slots), then clears the coefficients it consumed. Iterations that
; the eob/tbl_16x32_2d comparison marks as empty only zero-fill their slots.
;
; Pass 2 moves rsp so that each region in turn starts at slot 3 and calls the
; 8x32 function's .pass2 once per 8-pixel-wide strip (two strips).
;
; eob == 0 branches to .dconly at the end of the function.
5180cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
5181                                          dst, stride, c, eob
5182    LEA                  r6, base
5183    test               eobd, eobd
5184    jz .dconly ; eob == 0: take the DC-only shortcut
5185
        ; stash r0 (x86-32; reloaded into dstq before pass 2) or r7 (WIN64;
        ; reloaded after pass 2) in the frame, one gprsize above slot 76
5186%if ARCH_X86_32
5187    mov [rsp+gprsize*1+76*16], r0
5188%elif WIN64
5189    mov [rsp+gprsize*1+76*16], r7
5190%endif
5191%undef cmp
5192    ; remove entirely-zero iterations
5193    mov                 r5d, 7*2
5194    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
5195    jge .end_zero_loop
5196    pxor                 m0, m0
5197.zero_loop:
        ; t1 = low byte, t0 = high byte of the tbl_Nx32_odd_offset entry; both
        ; are used as additional store offsets, like r5 itself
5198    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5199    movzx               t1d, t0b
5200    shr                 t0d, 8
        ; zero the four slots this iteration would have written in each region
5201    mova   [rsp+12*16+r5*8], m0
5202    mova   [rsp+20*16+r5*8], m0
5203    mova   [rsp+12*16+t0*8], m0
5204    mova   [rsp+12*16+t1*8], m0
5205    mova   [rsp+44*16+r5*8], m0
5206    mova   [rsp+52*16+r5*8], m0
5207    mova   [rsp+44*16+t0*8], m0
5208    mova   [rsp+44*16+t1*8], m0
5209    sub                 r5d, 2
5210    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
5211    jl .zero_loop
5212.end_zero_loop:
5213    ; actual first pass after skipping all-zero data
5214    mov [rsp+gprsize*0+76*16], eobd ; kept for the pass-2 variant selection
5215    mov                  r3, rsp ; r3 = frame base (read back via [r3+...] on x86-32 below)
5216.loop_pass1:
5217%if ARCH_X86_64
        ; reloaded on every iteration: m8-m15 carry data further down
5218    mova                m11, [o(pd_2048)]
5219    mova                m12, [o(clip_18b_min)]
5220    mova                m13, [o(clip_18b_max)]
5221    mova                m14, [o(pd_2896)]
5222%endif
        ; odd-indexed inputs (k = 1, 3, ..., 15)
5223    mova                 m0, [cq+ 1*128+r5*8]
5224    mova                 m1, [cq+ 3*128+r5*8]
5225    mova                 m2, [cq+ 5*128+r5*8]
5226    mova                 m3, [cq+ 7*128+r5*8]
5227    mova                 m4, [cq+ 9*128+r5*8]
5228    mova                 m5, [cq+11*128+r5*8]
5229    mova                 m6, [cq+13*128+r5*8]
5230    mova                 m7, [cq+15*128+r5*8]
5231    call m(idct_8x4_internal_16bpc).rect2_mul
5232    call m(idct_16x4_internal_16bpc).main_oddhalf
5233
        ; even-indexed inputs (k = 0, 2, ..., 14)
5234    mova                 m0, [cq+ 0*128+r5*8]
5235    mova                 m1, [cq+ 2*128+r5*8]
5236    mova                 m2, [cq+ 4*128+r5*8]
5237    mova                 m3, [cq+ 6*128+r5*8]
5238    mova                 m4, [cq+ 8*128+r5*8]
5239    mova                 m5, [cq+10*128+r5*8]
5240    mova                 m6, [cq+12*128+r5*8]
5241    mova                 m7, [cq+14*128+r5*8]
5242    call m(idct_8x4_internal_16bpc).rect2_mul
5243    call m(idct_8x4_internal_16bpc).main_pass1
5244    call m(idct_8x4_internal_16bpc).round
5245    call m(idct_16x4_internal_16bpc).round
5246%if ARCH_X86_64
        ; pack the 16 dword vectors into 8 word vectors (signed saturation).
        ; NOTE(review): there is no packing on the x86-32 path here;
        ; presumably the round helper already delivers packed data. Confirm.
5247    packssdw             m0, m1
5248    packssdw             m2, m3
5249    packssdw             m4, m5
5250    packssdw             m6, m7
5251    packssdw             m8, m9
5252    packssdw            m10, m11
5253    packssdw            m12, m13
5254    packssdw            m14, m15
5255%endif
5256    call m(idct_8x4_internal_16bpc).transpose4x8packed
        ; same offset decoding as in .zero_loop
5257    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
5258    movzx               t1d, t0b
5259    shr                 t0d, 8
5260%if ARCH_X86_64
        ; m0-m3 go to the slot-12 region; m8-m15 are transposed next and
        ; m8-m11 go to the slot-44 region
5261    mova   [rsp+12*16+r5*8], m0
5262    mova   [rsp+20*16+r5*8], m2
5263    mova   [rsp+12*16+t1*8], m1
5264    mova   [rsp+12*16+t0*8], m3
5265    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5266    mova   [rsp+44*16+r5*8], m8
5267    mova   [rsp+52*16+r5*8], m10
5268    mova   [rsp+44*16+t1*8], m9
5269    mova   [rsp+44*16+t0*8], m11
5270%else
        ; the vectors transposed above go to the slot-44 region; four more
        ; vectors are then fetched from [r3+8*16..11*16], transposed and
        ; stored to the slot-12 region
5271    mova   [rsp+44*16+r5*8], m0
5272    mova   [rsp+52*16+r5*8], m2
5273    mova   [rsp+44*16+t1*8], m1
5274    mova   [rsp+44*16+t0*8], m3
5275    mova                 m0, [r3+ 8*16]
5276    mova                 m2, [r3+ 9*16]
5277    mova                 m4, [r3+10*16]
5278    mova                 m6, [r3+11*16]
5279    call m(idct_8x4_internal_16bpc).transpose4x8packed
5280    mova   [rsp+12*16+r5*8], m0
5281    mova   [rsp+20*16+r5*8], m2
5282    mova   [rsp+12*16+t1*8], m1
5283    mova   [rsp+12*16+t0*8], m3
5284%endif
        ; clear the 16 coefficient vectors consumed by this iteration
5285    pxor                 m7, m7
5286    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
5287    sub                 r5d, 2
5288    jge .loop_pass1
5289
5290    ; pass=2
        ; move the frame up by 9 slots: the slot-12 region now starts at slot 3
        ; (the layout .pass2 reads) and the values saved at slot 76 are found
        ; at slot 67
5291    add                 rsp, 9*16
5292%if ARCH_X86_64
5293    mov                  r6, dstq
5294%else
5295    mov                dstq, [rsp+gprsize*1+67*16]
5296%endif
5297    mov                eobd, [rsp+gprsize*0+67*16]
        ; eob < 44 -> .main_veryfast, eob < 151 -> .main_fast, otherwise .main
5298    cmp                eobd, 44
5299    jl .load_veryfast
5300    cmp                eobd, 151
5301    jl .load_fast
5302    ; load normal
5303    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
5304    jmp .run
5305.load_fast:
5306    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
5307    jmp .run
5308.load_veryfast:
5309    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
5310    ; fall-through
5311.run:
5312%if ARCH_X86_64
        ; r2 = dst + 32 bytes and r7 counts -4, -2, 0, so that
        ; dstq = r2 + r7*8 yields dst + 16 for the second strip
5313    lea                  r2, [dstq+32]
5314    mov                  r7, -4
5315%else
        ; r2 points at the saved-eob slot, which is reused as a strip counter
        ; (2); the destination pointer sits one gprsize above it
5316    lea                  r2, [rsp+67*16]
5317    mov dword [r2+0*gprsize], 2
5318%endif
5319    jmp .loop_pass2_entry
5320.loop_pass2:
        ; second strip: preload slot 3 as .pass2 expects. The first trip jumps
        ; past this reload; m0 is assumed to still hold the value stored to
        ; that slot at the end of pass 1.
5321    mova                 m0, [rsp+16* 3]
5322.loop_pass2_entry:
5323%if ARCH_X86_32
5324    mov                dstq, [r2+1*gprsize]
5325%endif
5326    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
        ; step the frame by 32 slots: the next strip's data (the former
        ; slot-44 region) now starts at slot 3
5327    add                 rsp, 32*16
5328%if ARCH_X86_64
5329    add                  r7, 2
5330    lea                dstq, [r2+r7*8] ; lea leaves the flags of the add intact for the jl
5331    jl .loop_pass2
5332%if WIN64
5333    mov                  r7, [rsp+gprsize*1+3*16]
5334%endif
5335%else
5336    add dword [r2+1*gprsize], 16 ; next strip starts 16 bytes further right
5337    dec dword [r2+0*gprsize]
5338    jg .loop_pass2
5339%endif
        ; rsp has been advanced by 9*16 + 2*32*16 = 73*16 by hand; adjust the
        ; x86inc stack bookkeeping by the same amount before RET
5340%assign stack_size (stack_size-73*16)
5341%if STACK_ALIGNMENT >= 16
5342%assign stack_size_padded (stack_size_padded-73*16)
5343%assign stack_offset (stack_offset-73*16)
5344%else
5345%xdefine rstkm [rsp + stack_size]
5346%endif
5347    RET
        ; DC-only shortcut: r5d = (([cq] * 181 + 128) >> 8) * 181, [cq] is
        ; overwritten with eobd, r3d = 32, then control continues in the
        ; .dconly tail of the 16x4 dct_dct function
5348.dconly:
5349    imul                r5d, [cq], 181
5350    mov                [cq], eobd ; 0
5351    mov                 r3d, 32
5352    add                 r5d, 128
5353    sar                 r5d, 8
5354    imul                r5d, 181
5355    add                 rsp, (65+4*ARCH_X86_64)*16 ; NOTE(review): presumably moves rsp to the frame layout the 16x4 function's epilogue expects; confirm against its cglobal header
5356    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
5357
; inv_txfm_add_dct_dct_32x8_16bpc(dst, stride, c, eob)
;
; Pass 1 runs at most twice (r5 = 2, then r5 = 0; only r5 = 0 when eob < 10).
; Each trip reads the 32 vectors at [cq+32*k+r5*8] (k = 0..31) in four groups
; of eight, feeds them to .main_oddhalf_part1, .main_oddhalf_part2, the 16x4
; .main_oddhalf and the 8x4 .main_pass1/.round helpers, finishes with
; .round_dct32, and writes the transposed result back into entries 0..15 of
; cq while clearing entries 16..31. On the final trip (r5 = 0) the first four
; result vectors stay in m0-m3 instead of being stored.
;
; Pass 2 is delegated to idct_16x8_internal_16bpc.pass2_main.
5358cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
5359                                         dst, stride, c, eob
5360%if ARCH_X86_32
5361    LEA                  r6, $$
5362%endif
5363    test               eobd, eobd
5364    jz .dconly ; eob == 0: take the DC-only shortcut
5365
5366    ; remove entirely-zero iterations
5367%undef cmp
        ; r5 = 1 if eob >= 10, else 0 (x86-32 derives it as 1 - borrow of the
        ; compare); doubled below
5368%if ARCH_X86_64
5369    xor                 r5d, r5d
5370    cmp                eobd, 10
5371    setge               r5b
5372%else
5373    mov                 r5d, 1
5374    cmp                eobd, 10
5375    sbb                 r5d, 0
5376%endif
5377    add                 r5d, r5d
5378
5379    ; actual first pass after skipping all-zero data
5380.loop_pass1:
        ; inputs 1, 7, 9, 15, 17, 23, 25, 31 for .main_oddhalf_part1
5381    mova                 m0, [cq+32* 1+r5*8]
5382    mova                 m1, [cq+32* 7+r5*8]
5383    mova                 m2, [cq+32* 9+r5*8]
5384    mova                 m3, [cq+32*15+r5*8]
5385    mova                 m4, [cq+32*17+r5*8]
5386    mova                 m5, [cq+32*23+r5*8]
5387    mova                 m6, [cq+32*25+r5*8]
5388    mova                 m7, [cq+32*31+r5*8]
5389%if ARCH_X86_64
        ; constants used by the helpers below
5390    mova                m11, [o(pd_2048)]
5391    mova                m12, [o(clip_18b_min)]
5392    mova                m13, [o(clip_18b_max)]
5393    mova                m14, [o(pd_2896)]
5394%endif
5395    mov                  r3, rsp ; r3 = scratch base used by the odd-half parts
5396    call .main_oddhalf_part1
        ; inputs 3, 5, 11, 13, 19, 21, 27, 29 for .main_oddhalf_part2
5397    mova                 m0, [cq+32* 3+r5*8]
5398    mova                 m1, [cq+32* 5+r5*8]
5399    mova                 m2, [cq+32*11+r5*8]
5400    mova                 m3, [cq+32*13+r5*8]
5401    mova                 m4, [cq+32*19+r5*8]
5402    mova                 m5, [cq+32*21+r5*8]
5403    mova                 m6, [cq+32*27+r5*8]
5404    mova                 m7, [cq+32*29+r5*8]
5405    call .main_oddhalf_part2
        ; inputs 2, 6, 10, ..., 30 for the 16x4 odd half, which runs with r3
        ; moved past the area used above
5406    mova                 m0, [cq+32* 2+r5*8]
5407    mova                 m1, [cq+32* 6+r5*8]
5408    mova                 m2, [cq+32*10+r5*8]
5409    mova                 m3, [cq+32*14+r5*8]
5410    mova                 m4, [cq+32*18+r5*8]
5411    mova                 m5, [cq+32*22+r5*8]
5412    mova                 m6, [cq+32*26+r5*8]
5413    mova                 m7, [cq+32*30+r5*8]
5414    add                  r3, 16*(16+4*ARCH_X86_32)
5415    call m(idct_16x4_internal_16bpc).main_oddhalf
        ; inputs 0, 4, 8, ..., 28 for the 8x4 pass
5416    mova                 m0, [cq+32* 0+r5*8]
5417    mova                 m1, [cq+32* 4+r5*8]
5418    mova                 m2, [cq+32* 8+r5*8]
5419    mova                 m3, [cq+32*12+r5*8]
5420    mova                 m4, [cq+32*16+r5*8]
5421    mova                 m5, [cq+32*20+r5*8]
5422    mova                 m6, [cq+32*24+r5*8]
5423    mova                 m7, [cq+32*28+r5*8]
5424    call m(idct_8x4_internal_16bpc).main_pass1
5425    call m(idct_8x4_internal_16bpc).round
5426    sub                  r3, 16*(16+4*ARCH_X86_32) ; undo the r3 offset added above
5427    call .round_dct32
5428%if ARCH_X86_64
        ; m0-m7 and m8-m15 are transposed in registers; two more sets of four
        ; vectors are then fetched from the r3 scratch area and pushed through
        ; the same "hi" transpose. m0-m3 are left in registers for the code
        ; after %endif.
5429    call m(idct_8x4_internal_16bpc).transpose4x8packed
5430    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5431    mova    [cq+32* 8+r5*8], m8
5432    mova    [cq+32* 9+r5*8], m9
5433    mova    [cq+32*10+r5*8], m10
5434    mova    [cq+32*11+r5*8], m11
5435    mova                 m8, [r3+16* 9] ;  8  9
5436    mova                m10, [r3+16*11] ; 10 11
5437    mova                m12, [r3+16*13] ; 12 13
5438    mova                m14, [r3+16*15] ; 14 15
5439    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5440    mova    [cq+32* 4+r5*8], m8
5441    mova    [cq+32* 5+r5*8], m9
5442    mova    [cq+32* 6+r5*8], m10
5443    mova    [cq+32* 7+r5*8], m11
5444    mova                 m8, [r3+16* 8] ; 24 25
5445    mova                m10, [r3+16*10] ; 26 27
5446    mova                m12, [r3+16*12] ; 28 29
5447    mova                m14, [r3+16*14] ; 30 31
5448    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
5449    mova    [cq+32*12+r5*8], m8
5450    mova    [cq+32*13+r5*8], m9
5451    mova    [cq+32*14+r5*8], m10
5452    mova    [cq+32*15+r5*8], m11
5453%else
        ; x86-32: all 32 dword vectors live in memory relative to r3. They are
        ; packed to words in pairs, transposed four packed vectors at a time
        ; and written to cq entries 4..15. The group read from [r3+0..7*16] is
        ; done last so that m0-m3 are left in registers for the code after
        ; %endif.
5454    sub                  r3, 8*16
5455    mova                 m0, [r3+ 8*16]
5456    mova                 m2, [r3+10*16]
5457    mova                 m4, [r3+12*16]
5458    mova                 m6, [r3+14*16]
5459    packssdw             m0, [r3+ 9*16]
5460    packssdw             m2, [r3+11*16]
5461    packssdw             m4, [r3+13*16]
5462    packssdw             m6, [r3+15*16]
5463    call m(idct_8x4_internal_16bpc).transpose4x8packed
5464    mova    [cq+32* 4+r5*8], m0
5465    mova    [cq+32* 5+r5*8], m1
5466    mova    [cq+32* 6+r5*8], m2
5467    mova    [cq+32* 7+r5*8], m3
5468    mova                 m0, [r3+16*16]
5469    mova                 m2, [r3+18*16]
5470    mova                 m4, [r3+20*16]
5471    mova                 m6, [r3+22*16]
5472    packssdw             m0, [r3+17*16]
5473    packssdw             m2, [r3+19*16]
5474    packssdw             m4, [r3+21*16]
5475    packssdw             m6, [r3+23*16]
5476    call m(idct_8x4_internal_16bpc).transpose4x8packed
5477    mova    [cq+32* 8+r5*8], m0
5478    mova    [cq+32* 9+r5*8], m1
5479    mova    [cq+32*10+r5*8], m2
5480    mova    [cq+32*11+r5*8], m3
        ; this group is read in descending slot order (31, 30, ..., 24)
5481    mova                 m0, [r3+31*16]
5482    mova                 m2, [r3+29*16]
5483    mova                 m4, [r3+27*16]
5484    mova                 m6, [r3+25*16]
5485    packssdw             m0, [r3+30*16]
5486    packssdw             m2, [r3+28*16]
5487    packssdw             m4, [r3+26*16]
5488    packssdw             m6, [r3+24*16]
5489    call m(idct_8x4_internal_16bpc).transpose4x8packed
5490    mova    [cq+32*12+r5*8], m0
5491    mova    [cq+32*13+r5*8], m1
5492    mova    [cq+32*14+r5*8], m2
5493    mova    [cq+32*15+r5*8], m3
5494    mova                 m0, [r3+ 0*16]
5495    mova                 m2, [r3+ 2*16]
5496    mova                 m4, [r3+ 4*16]
5497    mova                 m6, [r3+ 6*16]
5498    packssdw             m0, [r3+ 1*16]
5499    packssdw             m2, [r3+ 3*16]
5500    packssdw             m4, [r3+ 5*16]
5501    packssdw             m6, [r3+ 7*16]
5502    call m(idct_8x4_internal_16bpc).transpose4x8packed
5503%endif
5504    pxor                 m7, m7
5505    ; clear lower half of [cq]
5506    REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
5507                                    24, 25, 26, 27, 28, 29, 30, 31
        ; r5 == 0 was the last trip: skip the stores so m0-m3 stay in registers
5508    test                r5d, r5d
5509    jz .end_pass1
5510    mova    [cq+32* 0+r5*8], m0
5511    mova    [cq+32* 1+r5*8], m1
5512    mova    [cq+32* 2+r5*8], m2
5513    mova    [cq+32* 3+r5*8], m3
5514    sub                 r5d, 2
5515    jmp .loop_pass1
5516.end_pass1:
5517
5518    ; pass=2, we need to call this otherwise the stack pointer has
5519    ; the wrong offset in the 8-bit code
5520    mov                 r4d, 4 ; NOTE(review): argument of pass2_main, presumably a loop count; confirm there
5521    call m(idct_16x8_internal_16bpc).pass2_main
5522    RET
5523
; .main_oddhalf_part1_fast / .main_oddhalf_part1
;
; First part of the odd half used by the 32-point first pass. Per the comment
; on the main label, m0-m7 hold in1, in7, in9, in15, in17, in23, in25, in31.
; The eight results are left in [r3+16*0 .. r3+16*7]; .main_oddhalf_part2
; reads them back from there.
;
; x86-64: expects m11 = pd_2048, m12/m13 = clip_18b_min/max (the caller loads
;         them) and overwrites m8-m10 and m15.
; x86-32: only m0-m7 exist, so [r3+0*16 .. r3+3*16] double as spill slots and
;         the rounding/clipping constants are reloaded from memory as needed.
;
; The _fast entry reads only m0-m3 ("lower half zero") and forms each product
; with a single pmulld before joining the common code at the _fast2 label.
5524.main_oddhalf_part1_fast: ; lower half zero
5525    pmulld               m7, m0, [o(pd_4091)]
5526    pmulld               m0, [o(pd_201)]
5527    pmulld               m4, m3, [o(pd_m2751)]
5528%if ARCH_X86_32
        ; the x86-32 common code expects m0 already rounded, the pd_4091
        ; product rounded and parked in [r3+3*16], and m3 = pd_2048. This is
        ; the same state the full x86-32 path has when it reaches _fast2.
5529    pmulld               m3, [o(pd_3035)]
5530    mova                 m5, [o(pd_2048)]
5531    REPX      {paddd x, m5}, m0, m7
5532    REPX      {psrad x, 12}, m0, m7
5533    mova          [r3+3*16], m7
5534    mova                 m7, m3
5535    mova                 m3, m5
5536%else
5537    pmulld               m3, [o(pd_3035)]
5538%endif
5539    pmulld               m6, m1, [o(pd_m1380)]
5540    pmulld               m1, [o(pd_3857)]
5541    pmulld               m5, m2, [o(pd_3703)]
5542    pmulld               m2, [o(pd_1751)]
5543    jmp .main_oddhalf_part1_fast2
5544.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
5545%if ARCH_X86_64
        ; first rotation stage; rounding is deferred ("_" rounding argument)
        ; to the REPX pair below
5546    ITX_MULSUB_2D         0, 7, 8, 9, 10, _,  201, 4091 ; t16a, t31a
5547    ITX_MULSUB_2D         6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
5548    ITX_MULSUB_2D         2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
5549    ITX_MULSUB_2D         4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
5550.main_oddhalf_part1_fast2:
        ; (x + 2048) >> 12 on all eight products, then butterflies and clipping
5551    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5552    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
5553    psubd                m8, m0, m4 ; t17
5554    paddd                m0, m4     ; t16
5555    psubd                m4, m6, m2 ; t18
5556    paddd                m6, m2     ; t19
5557    psubd                m2, m1, m5 ; t29
5558    paddd                m1, m5     ; t28
5559    psubd                m5, m7, m3 ; t30
5560    paddd                m7, m3     ; t31
5561    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5562    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
        ; the coefficients are passed to the macro in registers (m10, m15)
5563    mova                m15, [o(pd_4017)]
5564    mova                m10, [o(pd_799)]
5565    ITX_MULSUB_2D         5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
5566    ITX_MULSUB_2D         2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
5567    psubd                m3, m0, m6 ; t19a
5568    paddd                m0, m6     ; t16a
5569    psubd                m6, m7, m1 ; t28a
5570    paddd                m7, m1     ; t31a
5571    psubd                m1, m5, m4 ; t18
5572    paddd                m5, m4     ; t17
5573    psubd                m4, m8, m2 ; t29
5574    paddd                m8, m2     ; t30
5575    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5576    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5577    mova                m15, [o(pd_3784)]
5578    mova                m10, [o(pd_1567)]
5579    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
5580    ITX_MULSUB_2D         6, 3, 2, 9, _, 11, 10, 15 ; t19,  t28
        ; results for part 2, in slot order 0..7
5581    mova          [r3+16*0], m0
5582    mova          [r3+16*1], m5
5583    mova          [r3+16*2], m4
5584    mova          [r3+16*3], m6
5585    mova          [r3+16*4], m3
5586    mova          [r3+16*5], m1
5587    mova          [r3+16*6], m8
5588    mova          [r3+16*7], m7
5589%else
        ; spill m2-m5 to free temporaries for the first two rotations
5590    mova          [r3+0*16], m2
5591    mova          [r3+1*16], m3
5592    mova          [r3+2*16], m4
5593    mova          [r3+3*16], m5
5594    mova                  m3, [o(pd_2048)]
        ; only this first rotation is rounded inside the macro (rounding
        ; register m3); the other three are rounded by the REPX pair at _fast2
5595    ITX_MULSUB_2D         0, 7, 2, 4, 5, 3,  201, 4091 ; t16a, t31a
5596    ITX_MULSUB_2D         6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
        ; swap the spilled inputs back in while parking the results in the
        ; same slots
5597    mova                 m4, [r3+2*16]
5598    mova                 m5, [r3+3*16]
5599    mova          [r3+2*16], m6
5600    mova          [r3+3*16], m7
5601    mova                 m2, [r3+0*16]
5602    mova                 m7, [r3+1*16]
5603    mova          [r3+0*16], m0
5604    mova          [r3+1*16], m1
5605    ITX_MULSUB_2D         2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
5606    ITX_MULSUB_2D         4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
5607    mova                 m0, [r3+0*16]
5608    mova                 m1, [r3+1*16]
5609    mova                 m6, [r3+2*16]
5610.main_oddhalf_part1_fast2:
        ; here: m3 = pd_2048, m0 already rounded, and the rounded t31a waits in
        ; [r3+3*16]
5611    REPX      {paddd x, m3}, m1, m2, m4, m5, m6, m7
5612    REPX      {psrad x, 12}, m1, m2, m4, m5, m6, m7
5613    psubd                m3, m0, m4 ; t17
5614    mova          [r3+0*16], m3
5615    mova                 m3, [r3+3*16]
5616    paddd                m0, m4     ; t16
5617    psubd                m4, m6, m2 ; t18
5618    paddd                m6, m2     ; t19
5619    psubd                m2, m1, m5 ; t29
5620    paddd                m1, m5     ; t28
5621    psubd                m5, m3, m7 ; t30
5622    paddd                m7, m3     ; t31
        ; clip seven values in registers; t17 was spilled, so it is clipped
        ; through memory and ends up in m3
5623    mova                 m3, [o(clip_18b_min)]
5624    REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5625    pmaxsd               m3, [r3+0*16]
5626    mova          [r3+0*16], m3
5627    mova                 m3, [o(clip_18b_max)]
5628    REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5629    pminsd               m3, [r3+0*16]
        ; park t16, t28, t19, t31 to free registers for the next rotations
5630    mova          [r3+0*16], m0
5631    mova          [r3+1*16], m1
5632    mova          [r3+2*16], m6
5633    mova          [r3+3*16], m7
5634    mova                 m0, [o(pd_2048)]
        ; NOTE(review): in the second invocation the small coefficient
        ; argument (7) presumably names the register that still holds pd_799
        ; from the first invocation; confirm against ITX_MULSUB_2D
5635    ITX_MULSUB_2D         5, 3, 1, 6, 7, 0,  799, 4017    ; t17a, t30a
5636    ITX_MULSUB_2D         2, 4, 1, 6, _, 0,    7, 4017, 4 ; t29a, t18a
5637    psubd                m1, m5, m4 ; t18
5638    paddd                m5, m4     ; t17
5639    psubd                m4, m3, m2 ; t29
5640    paddd                m3, m2     ; t30
5641    mova                 m0, [r3+0*16]
5642    mova                 m2, [r3+1*16]
5643    mova                 m6, [r3+2*16]
5644    mova                 m7, [r3+3*16]
5645    mova          [r3+0*16], m3
5646    psubd                m3, m0, m6 ; t19a
5647    paddd                m0, m6     ; t16a
5648    psubd                m6, m7, m2 ; t28a
5649    paddd                m7, m2     ; t31a
        ; same pattern as above: t30 is clipped through [r3+0*16] and ends up
        ; in m2
5650    mova                 m2, [o(clip_18b_min)]
5651    REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5652    pmaxsd               m2, [r3+0*16]
5653    mova          [r3+0*16], m2
5654    mova                 m2, [o(clip_18b_max)]
5655    REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5656    pminsd               m2, [r3+0*16]
        ; results 0, 1, 6, 7 are final; 2..5 follow after the last rotations
5657    mova          [r3+16*0], m0
5658    mova          [r3+16*1], m5
5659    mova          [r3+16*6], m2
5660    mova          [r3+16*7], m7
5661    mova                 m7, [o(pd_2048)]
        ; NOTE(review): as above, the argument 2 presumably names the register
        ; still holding pd_1567; confirm against ITX_MULSUB_2D
5662    ITX_MULSUB_2D         4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
5663    ITX_MULSUB_2D         6, 3, 0, 5, 2, 7,    2, 3784 ; t19,  t28
5664    mova          [r3+16*2], m4
5665    mova          [r3+16*3], m6
5666    mova          [r3+16*4], m3
5667    mova          [r3+16*5], m1
5668%endif
5669    ret
5670.main_oddhalf_part2_fast: ; lower half zero
5671    pmulld               m7, m0, [o(pd_m601)]
5672    pmulld               m0, [o(pd_4052)]
5673    pmulld               m4, m3, [o(pd_3290)]
5674%if ARCH_X86_32
5675    pmulld               m3, [o(pd_2440)]
5676    mova                 m5, [o(pd_2048)]
5677    REPX      {paddd x, m5}, m0, m7
5678    REPX      {psrad x, 12}, m0, m7
5679    mova         [r3+11*16], m7
5680    mova                 m7, m3
5681    mova                 m3, m5
5682%else
5683    pmulld               m3, [o(pd_2440)]
5684%endif
5685    pmulld               m6, m1, [o(pd_3973)]
5686    pmulld               m1, [o(pd_995)]
5687    pmulld               m5, m2, [o(pd_m2106)]
5688    pmulld               m2, [o(pd_3513)]
5689    jmp .main_oddhalf_part2_fast2
5690.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
5691%if ARCH_X86_64
5692    ITX_MULSUB_2D         7, 0, 8, 9, 10, _, 4052,  601 ; t23a, t24a
5693    ITX_MULSUB_2D         1, 6, 8, 9, 10, _,  995, 3973 ; t20a, t27a
5694    ITX_MULSUB_2D         5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
5695    ITX_MULSUB_2D         3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
5696.main_oddhalf_part2_fast2:
5697    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
5698    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
5699    psubd                m8, m0, m4 ; t25
5700    paddd                m0, m4     ; t24
5701    psubd                m4, m6, m2 ; t26
5702    paddd                m6, m2     ; t27
5703    psubd                m2, m1, m5 ; t21
5704    paddd                m1, m5     ; t20
5705    psubd                m5, m7, m3 ; t22
5706    paddd                m7, m3     ; t23
5707    REPX    {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
5708    REPX    {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
5709    mova                m15, [o(pd_2276)]
5710    mova                m10, [o(pd_3406)]
5711    ITX_MULSUB_2D         4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
5712    ITX_MULSUB_2D         8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
5713    psubd                m3, m0, m6 ; t27a
5714    paddd                m0, m6     ; t24a
5715    psubd                m6, m7, m1 ; t20a
5716    paddd                m7, m1     ; t23a
5717    psubd                m1, m5, m4 ; t21
5718    paddd                m5, m4     ; t22
5719    psubd                m4, m8, m2 ; t26
5720    paddd                m8, m2     ; t25
5721    REPX    {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
5722    REPX    {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
5723    mova                m15, [o(pd_3784)]
5724    mova                m10, [o(pd_1567)]
5725    ITX_MULSUB_2D         4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
5726    ITX_MULSUB_2D         3, 6, 2, 9, _, 11, 10, 15, 4 ; t27,  t20
5727    mova                 m9, [r3+16*0] ; t16a
5728    mova                m10, [r3+16*1] ; t17
5729    psubd                m2, m9, m7    ; t23
5730    paddd                m9, m7        ; t16
5731    psubd                m7, m10, m5   ; t22a
5732    paddd               m10, m5        ; t17a
5733    REPX    {pmaxsd x, m12}, m9, m10, m2, m7
5734    REPX    {pminsd x, m13}, m9, m10, m2, m7
5735    mova          [r3+16*0], m9
5736    mova          [r3+16*1], m10
5737    mova                 m9, [r3+16*2] ; t18a
5738    mova                m10, [r3+16*3] ; t19
5739    psubd                m5, m9, m1    ; t21
5740    paddd                m9, m1        ; t18
5741    psubd                m1, m10, m6   ; t20a
5742    paddd               m10, m6        ; t19a
5743    REPX    {pmaxsd x, m12}, m9, m10, m5, m1
5744    REPX    {pminsd x, m13}, m9, m10, m5, m1
5745    mova          [r3+16*2], m9
5746    mova          [r3+16*3], m10
5747    mova                 m9, [r3+16*4] ; t28
5748    mova                m10, [r3+16*5] ; t29a
5749    psubd                m6, m9, m3    ; t27a
5750    paddd                m9, m3        ; t28a
5751    psubd                m3, m10, m4   ; t26
5752    paddd               m10, m4        ; t29
5753    REPX    {pmaxsd x, m12}, m9, m10, m6, m3
5754    REPX    {pminsd x, m13}, m9, m10, m6, m3
5755    REPX    {pmulld x, m14}, m6, m3, m1, m5
5756    paddd                m6, m11
5757    paddd                m3, m11
5758    psubd                m4, m6, m1    ; t20
5759    paddd                m6, m1        ; t27
5760    psubd                m1, m3, m5    ; t21a
5761    paddd                m3, m5        ; t26a
5762    REPX    {psrad  x, 12 }, m4, m1, m3, m6
5763    mova          [r3+16*4], m4
5764    mova          [r3+16*5], m1
5765    mova                 m4, [r3+16*6] ; t30
5766    mova                 m1, [r3+16*7] ; t31a
5767    psubd                m5, m4, m8    ; t25a
5768    paddd                m4, m8        ; t30a
5769    psubd                m8, m1, m0    ; t24
5770    paddd                m1, m0        ; t31
5771    REPX    {pmaxsd x, m12}, m8, m5, m4, m1
5772    REPX    {pminsd x, m13}, m8, m5, m4, m1
5773    REPX    {pmulld x, m14}, m5, m8, m7, m2
5774    paddd                m5, m11
5775    paddd                m8, m11
5776    psubd                m0, m5, m7    ; t22
5777    paddd                m5, m7        ; t25
5778    psubd                m7, m8, m2    ; t23a
5779    paddd                m2, m8        ; t24a
5780    REPX    {psrad  x, 12 }, m0, m7, m2, m5
5781    mova          [r3+16*6], m0
5782    mova          [r3+16*7], m7
5783    mova          [r3+16*8], m2
5784    mova          [r3+16*9], m5
5785    mova         [r3+16*10], m3
5786    mova         [r3+16*11], m6
5787    mova         [r3+16*12], m9
5788    mova         [r3+16*13], m10
5789    mova         [r3+16*14], m4
5790    mova         [r3+16*15], m1
5791%else
5792    mova         [r3+ 8*16], m2
5793    mova         [r3+ 9*16], m3
5794    mova         [r3+10*16], m4
5795    mova         [r3+11*16], m5
5796    mova                 m3, [o(pd_2048)]
5797    ITX_MULSUB_2D         7, 0, 2, 4, 5, 3, 4052,  601 ; t23a, t24a
5798    ITX_MULSUB_2D         1, 6, 2, 4, 5, _,  995, 3973 ; t20a, t27a
5799    mova                 m2, [r3+ 8*16]
5800    mova                 m4, [r3+10*16]
5801    mova                 m5, [r3+11*16]
5802    mova         [r3+ 8*16], m0
5803    mova         [r3+10*16], m6
5804    mova         [r3+11*16], m7
5805    mova                 m7, [r3+ 9*16]
5806    mova         [r3+ 9*16], m1
5807    ITX_MULSUB_2D         5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
5808    ITX_MULSUB_2D         7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
5809    mova                 m0, [r3+ 8*16]
5810    mova                 m1, [r3+ 9*16]
5811    mova                 m6, [r3+10*16]
5812.main_oddhalf_part2_fast2:
5813    REPX      {paddd x, m3}, m1, m2, m7, m4, m5, m6
5814    REPX      {psrad x, 12}, m1, m2, m7, m4, m5, m6
5815    psubd                m3, m0, m4 ; t25
5816    mova         [r3+ 8*16], m3
5817    mova                 m3, [r3+11*16]
5818    paddd                m0, m4     ; t24
5819    psubd                m4, m6, m2 ; t26
5820    paddd                m6, m2     ; t27
5821    psubd                m2, m1, m5 ; t21
5822    paddd                m1, m5     ; t20
5823    psubd                m5, m3, m7 ; t22
5824    paddd                m7, m3     ; t23
5825    mova                 m3, [o(clip_18b_min)]
5826    REPX     {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5827    pmaxsd               m3, [r3+ 8*16]
5828    mova         [r3+ 8*16], m3
5829    mova                 m3, [o(clip_18b_max)]
5830    REPX     {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
5831    pminsd               m3, [r3+ 8*16]
5832    mova         [r3+ 8*16], m0
5833    mova         [r3+ 9*16], m1
5834    mova         [r3+10*16], m6
5835    mova         [r3+11*16], m7
5836    mova                 m7, [o(pd_2048)]
5837    ITX_MULSUB_2D         4, 2, 0, 1, 6, 7, 3406, 2276    ; t21a, t26a
5838    ITX_MULSUB_2D         3, 5, 0, 1, _, 7,    6, 2276, 4 ; t25a, t22a
5839    psubd                m1, m5, m4 ; t21
5840    paddd                m5, m4     ; t22
5841    psubd                m4, m3, m2 ; t26
5842    paddd                m3, m2     ; t25
5843    mova                 m0, [r3+ 8*16]
5844    mova                 m2, [r3+ 9*16]
5845    mova                 m6, [r3+10*16]
5846    mova                 m7, [r3+11*16]
5847    mova         [r3+ 8*16], m3
5848    psubd                m3, m0, m6 ; t27a
5849    paddd                m0, m6     ; t24a
5850    psubd                m6, m7, m2 ; t20a
5851    paddd                m7, m2     ; t23a
5852    mova                 m2, [o(clip_18b_min)]
5853    REPX     {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5854    pmaxsd               m2, [r3+ 8*16]
5855    mova         [r3+ 8*16], m2
5856    mova                 m2, [o(clip_18b_max)]
5857    REPX     {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
5858    pminsd               m2, [r3+ 8*16]
5859    mova         [r3+ 8*16], m0
5860    mova         [r3+ 9*16], m2
5861    mova         [r3+14*16], m5
5862    mova         [r3+15*16], m7
5863    mova                 m0, [o(pd_2048)]
5864    ITX_MULSUB_2D         4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
5865    ITX_MULSUB_2D         3, 6, 2, 5, _, 0,    7, 3784, 4 ; t27,  t20
5866    mova         [r3+10*16], m3
5867    mova                 m0, [o(clip_18b_min)]
5868    mova                 m2, [o(clip_18b_max)]
5869    mova                 m5, [r3+16*2] ; t18a
5870    mova                 m7, [r3+16*3] ; t19
5871    psubd                m3, m5, m1    ; t21
5872    paddd                m5, m1        ; t18
5873    psubd                m1, m7, m6    ; t20a
5874    paddd                m7, m6        ; t19a
5875    REPX     {pmaxsd x, m0}, m5, m7, m3, m1
5876    REPX     {pminsd x, m2}, m5, m7, m3, m1
5877    mova          [r3+16*2], m5
5878    mova          [r3+16*3], m7
5879    mova         [r3+11*16], m3
5880    mova                 m3, [r3+10*16]
5881    mova                 m5, [r3+16*4] ; t28
5882    mova                 m7, [r3+16*5] ; t29a
5883    psubd                m6, m5, m3    ; t27a
5884    paddd                m5, m3        ; t28a
5885    psubd                m3, m7, m4    ; t26
5886    paddd                m7, m4        ; t29
5887    REPX     {pmaxsd x, m0}, m5, m7, m6, m3
5888    REPX     {pminsd x, m2}, m5, m7, m6, m3
5889    mova         [r3+16*12], m5
5890    mova         [r3+16*13], m7
5891    mova                 m5, [o(pd_2048)]
5892    mova                 m7, [o(pd_2896)]
5893    mova                 m4, [r3+11*16]
5894    REPX     {pmulld x, m7}, m6, m3, m1, m4
5895    paddd                m6, m5
5896    paddd                m3, m5
5897    psubd                m5, m6, m1    ; t20
5898    paddd                m6, m1        ; t27
5899    psubd                m1, m3, m4    ; t21a
5900    paddd                m3, m4        ; t26a
5901    REPX     {psrad  x, 12}, m5, m1, m3, m6
5902    mova          [r3+16*4], m5
5903    mova          [r3+16*5], m1
5904    mova         [r3+16*10], m3
5905    mova         [r3+16*11], m6
5906
5907    mova                 m5, [r3+14*16]
5908    mova                 m6, [r3+15*16]
5909    mova                 m3, [r3+16*0] ; t16a
5910    mova                 m4, [r3+16*1] ; t17
5911    psubd                m1, m3, m6    ; t23
5912    paddd                m3, m6        ; t16
5913    psubd                m6, m4, m5    ; t22a
5914    paddd                m4, m5        ; t17a
5915    REPX     {pmaxsd x, m0}, m3, m4, m1, m6
5916    REPX     {pminsd x, m2}, m3, m4, m1, m6
5917    mova          [r3+16*0], m3
5918    mova          [r3+16*1], m4
5919    mova                 m5, [r3+ 8*16]
5920    mova                 m3, [r3+ 9*16]
5921    mova         [r3+ 8*16], m1
5922    mova         [r3+ 9*16], m6
5923    mova                 m4, [r3+16*6] ; t30
5924    mova                 m1, [r3+16*7] ; t31a
5925    psubd                m6, m1, m5    ; t24
5926    paddd                m1, m5        ; t31
5927    psubd                m5, m4, m3    ; t25a
5928    paddd                m4, m3        ; t30a
5929    REPX     {pmaxsd x, m0}, m6, m5, m4, m1
5930    REPX     {pminsd x, m2}, m6, m5, m4, m1
5931    mova         [r3+16*14], m4
5932    mova         [r3+16*15], m1
5933    mova                 m4, [o(pd_2048)]
5934    mova                 m1, [r3+ 9*16]
5935    mova                 m2, [r3+ 8*16]
5936    REPX     {pmulld x, m7}, m5, m6, m1, m2
5937    paddd                m5, m4
5938    paddd                m6, m4
5939    psubd                m0, m5, m1    ; t22
5940    paddd                m5, m1        ; t25
5941    psubd                m1, m6, m2    ; t23a
5942    paddd                m2, m6        ; t24a
5943    REPX     {psrad  x, 12}, m0, m1, m2, m5
5944    mova          [r3+16*6], m0
5945    mova          [r3+16*7], m1
5946    mova          [r3+16*8], m2
5947    mova          [r3+16*9], m5
5948%endif
5949    ret
5950
5951    ; final sumsub for idct16 as well as idct32, plus final downshift
    ;
    ; Arguments (register numbers unless noted):
    ;   arg 1   input/output register; its number n is also used as the row
    ;           index into the scratch buffer at r3 (slots n, 15-n and 23-n)
    ;   arg 2-4 receive out(15-n), out(16+n) and out(31-n) respectively
    ;   arg 5   temporary, clobbered
    ;   arg 6   immediate count of the final arithmetic right shift
    ; Fixed registers read by the body:
    ;   m11     rounding bias added before the shift
    ;   m12/m13 lower/upper bounds applied with pmaxsd/pminsd
5952%macro IDCT32_END 6 ; in/out1 (also row idx), out2-4, tmp, shift
5953    mova                m%4, [r3+16*(23-%1)]
    ; clamp the incoming value to [m12, m13] before the first butterfly
5954    pmaxsd              m%1, m12
5955    pminsd              m%1, m13
5956    psubd               m%3, m%1, m%4 ; idct16 out15 - n
5957    paddd               m%1, m%4      ; idct16 out0  + n
    ; clamp both idct16 results again, then add the rounding bias once here
    ; so that it carries into all four idct32 sums/differences below
5958    pmaxsd              m%1, m12
5959    pmaxsd              m%3, m12
5960    pminsd              m%1, m13
5961    pminsd              m%3, m13
5962    paddd               m%1, m11
5963    paddd               m%3, m11
5964    mova                m%5, [r3+16*( 0+%1)]
5965    mova                m%2, [r3+16*(15-%1)]
5966    psubd               m%4, m%1, m%2 ; out31 - n
5967    paddd               m%1, m%2      ; out0  + n
5968    paddd               m%2, m%3, m%5 ; out15 - n
5969    psubd               m%3, m%5      ; out16 + n
5970    REPX      {psrad x, %6}, m%1, m%3, m%2, m%4
5971%endmacro
5972
; Final butterfly stage of the 32-point inverse DCT with a rounding right
; shift by 2. r3 must point at the scratch buffer written by the earlier
; stages; m0-m7 carry eight dword vectors that are combined with the
; buffer contents.
5973.round_dct32:
5974%if ARCH_X86_64
    ; x86-64: m12/m13 are used as the clamp bounds by IDCT32_END. m11 is
    ; presumably pd_2048 on entry (set by the callers' pass-1 loops), which
    ; the shift below turns into the rounding bias for ">> 2".
    ; On return (all values packed to signed 16-bit pairs):
    ;   m0/m2/m4/m6     = outputs 0-7
    ;   m8/m10/m12/m14  = outputs 16-23
    ;   [r3+16*9/11/13/15] = outputs 8-15, [r3+16*8/10/12/14] = outputs 24-31
5975    psrld               m11, 10 ; pd_2
5976    IDCT32_END            0, 15, 8, 9, 10, 2    ; 0 15 16 31
    ; slots 0 and 23 were consumed by the invocation above, so they are
    ; reused to spill m6/m7, which serve as output/temp registers until the
    ; reload further down
5977    mova         [r3+ 0*16], m6
5978    mova         [r3+23*16], m7
5979    IDCT32_END            1, 14, 6, 7, 10, 2    ; 1 14 17 30
5980    packssdw             m0, m1       ;  0  1
5981    packssdw            m14, m15      ; 14 15
5982    packssdw             m8, m6       ; 16 17
5983    packssdw             m7, m9       ; 30 31
    ; slots 14/15 have already been read by the two invocations above
5984    mova         [r3+16*15], m14
5985    mova         [r3+16*14], m7
5986    IDCT32_END            2, 15, 10, 7, 6, 2    ; 2 13 18 29
5987    IDCT32_END            3, 14,  1, 9, 6, 2    ; 3 12 19 28
5988    packssdw             m2, m3       ;  2  3
5989    packssdw            m14, m15      ; 12 13
5990    packssdw            m10, m1       ; 18 19
5991    packssdw             m9, m7       ; 28 29
5992    mova         [r3+16*13], m14
5993    mova         [r3+16*12], m9
5994    IDCT32_END            4, 15, 1, 7, 6, 2     ; 4 11 20 27
5995    IDCT32_END            5, 14, 3, 9, 6, 2     ; 5 10 21 26
5996    packssdw             m4, m5       ;  4  5
5997    packssdw            m14, m15      ; 10 11
5998    packssdw             m1, m3       ; 20 21
5999    packssdw             m9, m7       ; 26 27
6000    mova         [r3+16*11], m14
6001    mova         [r3+16*10], m9
    ; restore the inputs spilled after the first invocation
6002    mova                 m6, [r3+ 0*16]
6003    mova                 m7, [r3+23*16]
6004    IDCT32_END            6, 15, 14, 5,  3, 2   ; 6 9 22 25
    ; last invocation: m11 (bias) and m13 (upper bound) are handed in as
    ; output/temp registers; the macro overwrites them only after their
    ; final use as constants
6005    IDCT32_END            7, 11,  3, 9, 13, 2   ; 7 8 23 24
6006    packssdw             m6, m7       ;  6  7
6007    packssdw            m11, m15      ;  8  9
6008    packssdw            m14, m3       ; 22 23
6009    packssdw             m9, m5       ; 24 25
6010    mova          [r3+16*9], m11
6011    mova          [r3+16*8], m9
    ; m1 still holds the packed pair 20 21 from above
6012    mova                m12, m1
6013    ret
6014%else
    ; x86-32: only m0-m7 are used, so the register inputs are first stored
    ; to slots 16-23 and the butterflies are done by a loop over memory.
6015    mova         [r3+16*16], m0
6016    mova         [r3+17*16], m1
6017    mova         [r3+18*16], m2
6018    mova         [r3+19*16], m3
6019    mova         [r3+20*16], m4
6020    mova         [r3+21*16], m5
6021    mova         [r3+22*16], m6
6022    mova         [r3+23*16], m7
6023    mova                 m1, [o(pd_2)]
6024    mova                 m2, [o(clip_18b_min)]
6025    mova                 m3, [o(clip_18b_max)]
6026
    ; r4 = byte distance from the current slot to its mirrored slot: starts
    ; at 15*16 and shrinks by 32 while r3 advances by 16, giving 8
    ; iterations. r3 is therefore 8*16 higher on return than on entry.
6027    mov                  r4, 15*16
6028.loop_dct32_end:
6029    mova                 m0, [r3+16*16]
6030    mova                 m6, [r3+16*24]
6031    pmaxsd               m0, m2
6032    pminsd               m0, m3
6033    psubd                m5, m0, m6 ; idct16 out15 - n
6034    paddd                m0, m6     ; idct16 out0  + n
6035    pmaxsd               m0, m2
6036    pmaxsd               m5, m2
6037    pminsd               m0, m3
6038    pminsd               m5, m3
    ; rounding bias for the ">> 2" below
6039    paddd                m0, m1
6040    paddd                m5, m1
6041    mova                 m7, [r3]
6042    mova                 m4, [r3+r4]
6043    psubd                m6, m0, m4 ; out31 - n
6044    paddd                m0, m4     ; out0  + n
6045    paddd                m4, m5, m7 ; out15 - n
6046    psubd                m5, m7     ; out16 + n
6047    REPX       {psrad x, 2}, m0, m5, m4, m6
    ; results stay as unpacked dwords and are written back in place
6048    mova               [r3], m0
6049    mova            [r3+r4], m4
6050    mova         [r3+16*16], m5
6051    mova         [r3+24*16], m6
6052    add                  r3, 16
6053    sub                  r4, 32
6054    jg .loop_dct32_end
6055    ret
6056%endif
6057
; DC-only path: the single DC coefficient is scaled and added to every pixel.
; Entry points:
;   .dconly   r5d = coef[0]*181, row count r3d = 8
;   .dconly1  expects r5d = coef*181 and r3d = row count (used by the 32x32
;             function further down in this file)
;   .dconly2  expects r5d already scaled and r3d = row count (used by the
;             32x16 function further down in this file)
6058.dconly:
6059    imul                r5d, [cq], 181
6060    mov                [cq], eobd ; 0
6061    mov                 r3d, 8
6062.dconly1:
6063    add                 r5d, 640
6064    sar                 r5d, 10
6065.dconly2:
6066    imul                r5d, 2896
6067    add                 r5d, 34816
    ; broadcast word 1 of the dword (bits 16-31, i.e. the value >> 16) to
    ; all eight word lanes of m0
6068    movd                 m0, r5d
6069    pshuflw              m0, m0, q1111
6070    punpcklqdq           m0, m0
6071    mova                 m6, [o(pixel_10bpc_max)]
6072    pxor                 m5, m5
    ; each iteration handles one row of 4*16 bytes (32 16-bit pixels):
    ; add the DC term, then clamp to [0, pixel_10bpc_max]
6073.dconly_loop:
6074    mova                 m1, [dstq+16*0]
6075    mova                 m2, [dstq+16*1]
6076    mova                 m3, [dstq+16*2]
6077    mova                 m4, [dstq+16*3]
6078    REPX     {paddw  x, m0}, m1, m2, m3, m4
6079    REPX     {pminsw x, m6}, m1, m2, m3, m4
6080    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
6081    mova        [dstq+16*0], m1
6082    mova        [dstq+16*1], m2
6083    mova        [dstq+16*2], m3
6084    mova        [dstq+16*3], m4
6085    add                dstq, strideq
6086    dec                 r3d
6087    jg .dconly_loop
6088    RET
6089
; 32x16 inverse DCT_DCT, 16 bpc build.
; cglobal parameters: 4 arguments (dst, stride, c, eob), 7 GPRs, 16 XMM
; registers and (24+8*ARCH_X86_32)*16 bytes of stack scratch (passed as a
; negative size; see x86inc.asm for what the sign selects).
; Flow: eob == 0 takes the DC-only shortcut; otherwise pass 1 runs on one
; 16-byte group of coefficients at a time, writing packed 16-bit results
; back into cq, and pass 2 is delegated through .pass2.
6090cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
6091                                         dst, stride, c, eob
6092    LEA                  r6, base
6093    test               eobd, eobd
6094    jz .dconly
6095
6096    ; remove entirely-zero iterations
    ; r5 walks tbl_32x16_2d downwards in steps of one word (2 bytes) until
    ; an entry <= eob is found; pass 1 then starts at that group.
    ; NOTE(review): termination relies on the table's first entry being
    ; <= every non-zero eob - confirm against the table definition.
6097%undef cmp
6098    mov                 r5d, 8
6099.zero_loop:
6100    sub                 r5d, 2
6101    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
6102    jl .zero_loop
6103
6104    ; actual first pass after skipping all-zero data
    ; In the loads below cq+64*k addresses input k (0..31) of the 32-point
    ; transform, and r5*8 selects the current 16-byte group (four dwords).
6105.loop_pass1:
6106%if ARCH_X86_64
    ; constants kept in high registers for the helpers called below
6107    mova                m11, [o(pd_2048)]
6108    mova                m12, [o(clip_18b_min)]
6109    mova                m13, [o(clip_18b_max)]
6110    mova                m14, [o(pd_2896)]
6111%endif
    ; inputs 1, 7, 9, 15, 17, 23, 25, 31 -> odd half, part 1
6112    mova                 m0, [cq+64* 1+r5*8]
6113    mova                 m1, [cq+64* 7+r5*8]
6114    mova                 m2, [cq+64* 9+r5*8]
6115    mova                 m3, [cq+64*15+r5*8]
6116    mova                 m4, [cq+64*17+r5*8]
6117    mova                 m5, [cq+64*23+r5*8]
6118    mova                 m6, [cq+64*25+r5*8]
6119    mova                 m7, [cq+64*31+r5*8]
    ; r3 = base of the stack scratch buffer used by all helpers
6120    mov                  r3, rsp
6121    call m(idct_8x4_internal_16bpc).rect2_mul
6122    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
6123
    ; inputs 3, 5, 11, 13, 19, 21, 27, 29 -> odd half, part 2
6124    mova                 m0, [cq+64* 3+r5*8]
6125    mova                 m1, [cq+64* 5+r5*8]
6126    mova                 m2, [cq+64*11+r5*8]
6127    mova                 m3, [cq+64*13+r5*8]
6128    mova                 m4, [cq+64*19+r5*8]
6129    mova                 m5, [cq+64*21+r5*8]
6130    mova                 m6, [cq+64*27+r5*8]
6131    mova                 m7, [cq+64*29+r5*8]
    ; x86-32 only: r3 is moved up by 8 slots around rect2_mul, presumably so
    ; that whatever it spills through r3 does not overwrite the part-1
    ; results - TODO confirm against rect2_mul
6132%if ARCH_X86_32
6133    add                  r3, 16*8
6134%endif
6135    call m(idct_8x4_internal_16bpc).rect2_mul
6136%if ARCH_X86_32
6137    sub                  r3, 16*8
6138%endif
6139    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
    ; the idct16/idct8 stages below work on a scratch window further up
6140    add                  r3, 16*(16+4*ARCH_X86_32)
6141
    ; inputs 2, 6, ..., 30 -> odd half of the embedded idct16
6142    mova                 m0, [cq+64* 2+r5*8]
6143    mova                 m1, [cq+64* 6+r5*8]
6144    mova                 m2, [cq+64*10+r5*8]
6145    mova                 m3, [cq+64*14+r5*8]
6146    mova                 m4, [cq+64*18+r5*8]
6147    mova                 m5, [cq+64*22+r5*8]
6148    mova                 m6, [cq+64*26+r5*8]
6149    mova                 m7, [cq+64*30+r5*8]
6150    call m(idct_8x4_internal_16bpc).rect2_mul
6151    call m(idct_16x4_internal_16bpc).main_oddhalf
6152
    ; inputs 0, 4, ..., 28 -> embedded idct8
6153    mova                 m0, [cq+64* 0+r5*8]
6154    mova                 m1, [cq+64* 4+r5*8]
6155    mova                 m2, [cq+64* 8+r5*8]
6156    mova                 m3, [cq+64*12+r5*8]
6157    mova                 m4, [cq+64*16+r5*8]
6158    mova                 m5, [cq+64*20+r5*8]
6159    mova                 m6, [cq+64*24+r5*8]
6160    mova                 m7, [cq+64*28+r5*8]
6161    call m(idct_8x4_internal_16bpc).rect2_mul
6162    call m(idct_8x4_internal_16bpc).main_pass1
6163    call m(idct_8x4_internal_16bpc).round
    ; back to the start of the scratch buffer for the final 32-point stage
6164    sub                  r3, 16*(16+4*ARCH_X86_32)
6165    call .round_dct32
6166
6167%if ARCH_X86_64
    ; .round_dct32 leaves outputs 0-7 in m0/m2/m4/m6, 16-23 in
    ; m8/m10/m12/m14, and 8-15 / 24-31 in scratch slots 8-15. Each set is
    ; transposed and stored as four 16-byte rows of cq.
6168    call m(idct_8x4_internal_16bpc).transpose4x8packed
6169    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6170    mova    [cq+64* 8+r5*8], m8
6171    mova    [cq+64* 9+r5*8], m9
6172    mova    [cq+64*10+r5*8], m10
6173    mova    [cq+64*11+r5*8], m11
6174    mova                 m8, [r3+16* 9] ;  8  9
6175    mova                m10, [r3+16*11] ; 10 11
6176    mova                m12, [r3+16*13] ; 12 13
6177    mova                m14, [r3+16*15] ; 14 15
6178    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6179    mova    [cq+64* 4+r5*8], m8
6180    mova    [cq+64* 5+r5*8], m9
6181    mova    [cq+64* 6+r5*8], m10
6182    mova    [cq+64* 7+r5*8], m11
6183    mova                 m8, [r3+16* 8] ; 24 25
6184    mova                m10, [r3+16*10] ; 26 27
6185    mova                m12, [r3+16*12] ; 28 29
6186    mova                m14, [r3+16*14] ; 30 31
6187    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6188    mova    [cq+64*12+r5*8], m8
6189    mova    [cq+64*13+r5*8], m9
6190    mova    [cq+64*14+r5*8], m10
6191    mova    [cq+64*15+r5*8], m11
6192%else
    ; x86-32: .round_dct32 left 32 unpacked dword rows in the scratch buffer
    ; and advanced r3 by 8 slots; undo that, then pack pairs of rows to
    ; 16 bits, transpose and store, eight outputs at a time.
6193    sub                  r3, 8*16
6194    mova                 m0, [r3+ 8*16]
6195    mova                 m2, [r3+10*16]
6196    mova                 m4, [r3+12*16]
6197    mova                 m6, [r3+14*16]
6198    packssdw             m0, [r3+ 9*16]
6199    packssdw             m2, [r3+11*16]
6200    packssdw             m4, [r3+13*16]
6201    packssdw             m6, [r3+15*16]
6202    call m(idct_8x4_internal_16bpc).transpose4x8packed
6203    mova    [cq+64* 4+r5*8], m0
6204    mova    [cq+64* 5+r5*8], m1
6205    mova    [cq+64* 6+r5*8], m2
6206    mova    [cq+64* 7+r5*8], m3
6207    mova                 m0, [r3+16*16]
6208    mova                 m2, [r3+18*16]
6209    mova                 m4, [r3+20*16]
6210    mova                 m6, [r3+22*16]
6211    packssdw             m0, [r3+17*16]
6212    packssdw             m2, [r3+19*16]
6213    packssdw             m4, [r3+21*16]
6214    packssdw             m6, [r3+23*16]
6215    call m(idct_8x4_internal_16bpc).transpose4x8packed
6216    mova    [cq+64* 8+r5*8], m0
6217    mova    [cq+64* 9+r5*8], m1
6218    mova    [cq+64*10+r5*8], m2
6219    mova    [cq+64*11+r5*8], m3
    ; slots 24-31 are read in descending order
6220    mova                 m0, [r3+31*16]
6221    mova                 m2, [r3+29*16]
6222    mova                 m4, [r3+27*16]
6223    mova                 m6, [r3+25*16]
6224    packssdw             m0, [r3+30*16]
6225    packssdw             m2, [r3+28*16]
6226    packssdw             m4, [r3+26*16]
6227    packssdw             m6, [r3+24*16]
6228    call m(idct_8x4_internal_16bpc).transpose4x8packed
6229    mova    [cq+64*12+r5*8], m0
6230    mova    [cq+64*13+r5*8], m1
6231    mova    [cq+64*14+r5*8], m2
6232    mova    [cq+64*15+r5*8], m3
6233    mova                 m0, [r3+ 0*16]
6234    mova                 m2, [r3+ 2*16]
6235    mova                 m4, [r3+ 4*16]
6236    mova                 m6, [r3+ 6*16]
6237    packssdw             m0, [r3+ 1*16]
6238    packssdw             m2, [r3+ 3*16]
6239    packssdw             m4, [r3+ 5*16]
6240    packssdw             m6, [r3+ 7*16]
6241    call m(idct_8x4_internal_16bpc).transpose4x8packed
6242%endif
    ; common to both paths: m0-m3 hold the transposed outputs 0-7
6243    mova    [cq+64* 0+r5*8], m0
6244    mova    [cq+64* 1+r5*8], m1
6245    mova    [cq+64* 2+r5*8], m2
6246    mova    [cq+64* 3+r5*8], m3
    ; rows 0-15 of this group now hold packed results; zero rows 16-31
6247    pxor                 m0, m0
6248    REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
6249                                    24, 25, 26, 27, 28, 29, 30, 31
    ; next (lower) group; the loop ends after the group at offset 0
6250    sub                 r5d, 2
6251    jge .loop_pass1
6252
6253    ; pass=2, we need to call this otherwise the stack pointer has
6254    ; the wrong offset in the 8-bit code
6255    call .pass2
6256    RET
6257
; Sets up registers and stack slots for the second pass, then tail-jumps
; into the idct_16x16 pass-2 loop (defined elsewhere in this file).
6258.pass2:
6259%if ARCH_X86_64
6260    mova                 m8, [o(pw_2048)]
6261    pxor                 m9, m9
6262    mova                m10, [o(pixel_10bpc_max)]
6263%if WIN64
    ; r7 is overwritten with dstq just below, so its old value is stored
    ; first (WIN64 only)
6264    mov [rsp+16*16+gprsize], r7
6265%endif
6266    mov                  r7, dstq
6267%else
    ; x86-32: dstq is stored in a stack slot instead of a spare register
6268    mov [rsp+2*gprsize+16*16], dstq
6269%endif
6270    lea                  r3, [strideq*3]
    ; r4d = 4; presumably the iteration count of the shared loop - TODO
    ; confirm against idct_16x16_internal_16bpc.loop_pass2
6271    mov                 r4d, 4
6272    jmp m(idct_16x16_internal_16bpc).loop_pass2
6273
; Same final 32-point butterfly stage as the 32x8 variant of .round_dct32,
; but with a rounding right shift by 1 instead of 2. r3 must point at the
; scratch buffer; m0-m7 carry eight dword input vectors.
6274.round_dct32:
6275%if ARCH_X86_64
    ; m11 holds pd_2048 (loaded in .loop_pass1); >> 11 turns it into the
    ; rounding bias for ">> 1".
    ; On return (packed signed 16-bit pairs):
    ;   m0/m2/m4/m6     = outputs 0-7
    ;   m8/m10/m12/m14  = outputs 16-23
    ;   [r3+16*9/11/13/15] = outputs 8-15, [r3+16*8/10/12/14] = outputs 24-31
6276    psrld               m11, 11 ; pd_1
6277    IDCT32_END            0, 15, 8, 9, 10, 1    ; 0 15 16 31
    ; slots 0 and 23 were consumed above; reuse them to spill m6/m7
6278    mova         [r3+ 0*16], m6
6279    mova         [r3+23*16], m7
6280    IDCT32_END            1, 14, 6, 7, 10, 1    ; 1 14 17 30
6281    packssdw             m0, m1       ;  0  1
6282    packssdw            m14, m15      ; 14 15
6283    packssdw             m8, m6       ; 16 17
6284    packssdw             m7, m9       ; 30 31
6285    mova         [r3+16*15], m14
6286    mova         [r3+16*14], m7
6287    IDCT32_END            2, 15, 10, 7, 6, 1    ; 2 13 18 29
6288    IDCT32_END            3, 14,  1, 9, 6, 1    ; 3 12 19 28
6289    packssdw             m2, m3       ;  2  3
6290    packssdw            m14, m15      ; 12 13
6291    packssdw            m10, m1       ; 18 19
6292    packssdw             m9, m7       ; 28 29
6293    mova         [r3+16*13], m14
6294    mova         [r3+16*12], m9
6295    IDCT32_END            4, 15, 1, 7, 6, 1     ; 4 11 20 27
6296    IDCT32_END            5, 14, 3, 9, 6, 1     ; 5 10 21 26
6297    packssdw             m4, m5       ;  4  5
6298    packssdw            m14, m15      ; 10 11
6299    packssdw             m1, m3       ; 20 21
6300    packssdw             m9, m7       ; 26 27
6301    mova         [r3+16*11], m14
6302    mova         [r3+16*10], m9
    ; restore the inputs spilled after the first invocation
6303    mova                 m6, [r3+ 0*16]
6304    mova                 m7, [r3+23*16]
6305    IDCT32_END            6, 15, 14, 5,  3, 1   ; 6 9 22 25
    ; last invocation: m11 and m13 are reused as output/temp registers
6306    IDCT32_END            7, 11,  3, 9, 13, 1   ; 7 8 23 24
6307    packssdw             m6, m7       ;  6  7
6308    packssdw            m11, m15      ;  8  9
6309    packssdw            m14, m3       ; 22 23
6310    packssdw             m9, m5       ; 24 25
6311    mova          [r3+16*9], m11
6312    mova          [r3+16*8], m9
    ; m1 still holds the packed pair 20 21
6313    mova                m12, m1
6314    ret
6315%else
    ; x86-32: spill the register inputs to slots 16-23 and loop over memory
6316    mova         [r3+16*16], m0
6317    mova         [r3+17*16], m1
6318    mova         [r3+18*16], m2
6319    mova         [r3+19*16], m3
6320    mova         [r3+20*16], m4
6321    mova         [r3+21*16], m5
6322    mova         [r3+22*16], m6
6323    mova         [r3+23*16], m7
    ; all-ones = -1 per dword; subtracting it below adds the rounding bias 1
6324    pcmpeqd              m1, m1     ; -1
6325    mova                 m2, [o(clip_18b_min)]
6326    mova                 m3, [o(clip_18b_max)]
6327
    ; r4 = byte distance to the mirrored slot; 8 iterations, after which r3
    ; is 8*16 higher than on entry
6328    mov                  r4, 15*16
6329.loop_dct32_end:
6330    mova                 m0, [r3+16*16]
6331    mova                 m6, [r3+16*24]
    ; NOTE(review): unlike IDCT32_END (x86-64 path), the value loaded into
    ; m0 is not clamped before this first butterfly
6332    psubd                m5, m0, m6 ; idct16 out15 - n
6333    paddd                m0, m6     ; idct16 out0  + n
6334    pmaxsd               m0, m2
6335    pmaxsd               m5, m2
6336    pminsd               m0, m3
6337    pminsd               m5, m3
    ; x - (-1) = x + 1: rounding bias for the ">> 1" below
6338    psubd                m0, m1
6339    psubd                m5, m1
6340    mova                 m7, [r3]
6341    mova                 m4, [r3+r4]
6342    psubd                m6, m0, m4 ; out31 - n
6343    paddd                m0, m4     ; out0  + n
6344    paddd                m4, m5, m7 ; out15 - n
6345    psubd                m5, m7     ; out16 + n
6346    REPX       {psrad x, 1}, m0, m5, m4, m6
    ; unpacked dword results are written back in place
6347    mova               [r3], m0
6348    mova            [r3+r4], m4
6349    mova         [r3+16*16], m5
6350    mova         [r3+24*16], m6
6351    add                  r3, 16
6352    sub                  r4, 32
6353    jg .loop_dct32_end
6354    ret
6355%endif
6356
; DC-only path for 32x16: scales the DC coefficient in two steps, sets the
; row count to 16 and reuses the 32x8 function's .dconly2 tail for the final
; scaling and the add/clamp loop.
6357.dconly:
6358    imul                r5d, [cq], 181
6359    mov                [cq], eobd ; 0
6360    mov                 r3d, 16
    ; first step: (dc*181 + 128) >> 8
6361    add                 r5d, 128
6362    sar                 r5d, 8
    ; second step: (x*181 + 384) >> 9
6363    imul                r5d, 181
6364    add                 r5d, 384
6365    sar                 r5d, 9
6366    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
6367
; 32x32 inverse DCT_DCT, 16 bpc build.
; cglobal parameters: 4 arguments (dst, stride, c, eob), 7 GPRs, 16 XMM
; registers and (5*32+1)*16 bytes of stack (passed as a negative size; see
; x86inc.asm for what the sign selects).
; Stack use visible in this function:
;   rsp+0 ...             scratch for the pass-1 helpers (r3 = rsp)
;   rsp+32*16 + k*32*16   k = 0..3: four regions receiving pass-1 output
;   rsp+5*32*16           saved eob (slot 0) and saved dstq / r7 (slot 1)
; Pass 2 is delegated to the 16x32 function's pass-2 loop.
6368cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
6369                                          dst, stride, c, eob
6370    LEA                  r6, base
6371    test               eobd, eobd
6372    jz .dconly
6373
6374    ; remove entirely-zero iterations
6375%if ARCH_X86_32
6376    mov [rsp+5*32*16+1*gprsize], dstq
6377%elif WIN64
6378    mov [rsp+5*32*16+1*gprsize], r7
6379%endif
6380%undef cmp
    ; r5 starts at the last word of tbl_32x32_2d that is consulted (byte
    ; offset 14). If eob already reaches that entry nothing is skipped.
6381    mov                 r5d, 14
6382    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6383    jge .end_zero_loop
6384    pxor                 m0, m0
    ; For every skipped group, zero the four 16-byte destinations that
    ; pass 1 would have written in each of the four output regions.
    ; tbl_Nx32_odd_offset packs two byte-sized offsets per word:
    ; low byte -> t1, high byte -> t0.
    ; NOTE(review): termination relies on the table's first entry being
    ; <= every non-zero eob - confirm against the table definition.
6385.zero_loop:
6386    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
6387    movzx               t1d, t0b
6388    shr                 t0d, 8
6389    mova   [rsp+32*16+r5*8+0*32*16], m0
6390    mova   [rsp+40*16+r5*8+0*32*16], m0
6391    mova   [rsp+32*16+t0*8+0*32*16], m0
6392    mova   [rsp+32*16+t1*8+0*32*16], m0
6393    mova   [rsp+32*16+r5*8+1*32*16], m0
6394    mova   [rsp+40*16+r5*8+1*32*16], m0
6395    mova   [rsp+32*16+t0*8+1*32*16], m0
6396    mova   [rsp+32*16+t1*8+1*32*16], m0
6397    mova   [rsp+32*16+r5*8+2*32*16], m0
6398    mova   [rsp+40*16+r5*8+2*32*16], m0
6399    mova   [rsp+32*16+t0*8+2*32*16], m0
6400    mova   [rsp+32*16+t1*8+2*32*16], m0
6401    mova   [rsp+32*16+r5*8+3*32*16], m0
6402    mova   [rsp+40*16+r5*8+3*32*16], m0
6403    mova   [rsp+32*16+t0*8+3*32*16], m0
6404    mova   [rsp+32*16+t1*8+3*32*16], m0
6405    sub                 r5d, 2
6406    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6407    jl .zero_loop
6408.end_zero_loop:
6409
6410    ; actual first pass after skipping all-zero data
    ; eob is saved on the stack and reloaded once pass 1 is done
6411    mov [rsp+gprsize*0+5*32*16], eobd
    ; In the loads below cq+128*k addresses input k (0..31) of the 32-point
    ; transform, and r5*8 selects the current 16-byte group (four dwords).
6412.loop_pass1:
    ; inputs 1, 7, 9, 15, 17, 23, 25, 31 -> odd half, part 1
6413    mova                 m0, [cq+128* 1+r5*8]
6414    mova                 m1, [cq+128* 7+r5*8]
6415    mova                 m2, [cq+128* 9+r5*8]
6416    mova                 m3, [cq+128*15+r5*8]
6417    mova                 m4, [cq+128*17+r5*8]
6418    mova                 m5, [cq+128*23+r5*8]
6419    mova                 m6, [cq+128*25+r5*8]
6420    mova                 m7, [cq+128*31+r5*8]
6421%if ARCH_X86_64
    ; constants kept in high registers for the helpers called below
6422    mova                m11, [o(pd_2048)]
6423    mova                m12, [o(clip_18b_min)]
6424    mova                m13, [o(clip_18b_max)]
6425    mova                m14, [o(pd_2896)]
6426%endif
    ; r3 = base of the stack scratch buffer used by all helpers
6427    mov                  r3, rsp
6428    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
    ; inputs 3, 5, 11, 13, 19, 21, 27, 29 -> odd half, part 2
6429    mova                 m0, [cq+128* 3+r5*8]
6430    mova                 m1, [cq+128* 5+r5*8]
6431    mova                 m2, [cq+128*11+r5*8]
6432    mova                 m3, [cq+128*13+r5*8]
6433    mova                 m4, [cq+128*19+r5*8]
6434    mova                 m5, [cq+128*21+r5*8]
6435    mova                 m6, [cq+128*27+r5*8]
6436    mova                 m7, [cq+128*29+r5*8]
6437    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
    ; inputs 2, 6, ..., 30 -> odd half of the embedded idct16
6438    mova                 m0, [cq+128* 2+r5*8]
6439    mova                 m1, [cq+128* 6+r5*8]
6440    mova                 m2, [cq+128*10+r5*8]
6441    mova                 m3, [cq+128*14+r5*8]
6442    mova                 m4, [cq+128*18+r5*8]
6443    mova                 m5, [cq+128*22+r5*8]
6444    mova                 m6, [cq+128*26+r5*8]
6445    mova                 m7, [cq+128*30+r5*8]
    ; the idct16/idct8 stages work on a scratch window further up
6446    add                  r3, 16*(16+4*ARCH_X86_32)
6447    call m(idct_16x4_internal_16bpc).main_oddhalf
    ; inputs 0, 4, ..., 28 -> embedded idct8
6448    mova                 m0, [cq+128* 0+r5*8]
6449    mova                 m1, [cq+128* 4+r5*8]
6450    mova                 m2, [cq+128* 8+r5*8]
6451    mova                 m3, [cq+128*12+r5*8]
6452    mova                 m4, [cq+128*16+r5*8]
6453    mova                 m5, [cq+128*20+r5*8]
6454    mova                 m6, [cq+128*24+r5*8]
6455    mova                 m7, [cq+128*28+r5*8]
6456    call m(idct_8x4_internal_16bpc).main_pass1
6457    call m(idct_8x4_internal_16bpc).round
    ; back to the start of the scratch buffer for the final 32-point stage
    ; (the 32x8 function's .round_dct32, i.e. rounding shift by 2)
6458    sub                  r3, 16*(16+4*ARCH_X86_32)
6459    call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
    ; same offset decoding as in .zero_loop: low byte -> t1, high -> t0
6460    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
6461    movzx               t1d, t0b
6462    shr                 t0d, 8
6463%if ARCH_X86_64
    ; outputs 16-23 (m8-m11 after the transpose) -> region 2
6464    call m(idct_8x4_internal_16bpc).transpose4x8packed
6465    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6466    mova   [rsp+32*16+r5*8+2*32*16], m8
6467    mova   [rsp+40*16+r5*8+2*32*16], m10
6468    mova   [rsp+32*16+t1*8+2*32*16], m9
6469    mova   [rsp+32*16+t0*8+2*32*16], m11
    ; outputs 8-15 -> region 1
6470    mova                 m8, [r3+16* 9] ;  8  9
6471    mova                m10, [r3+16*11] ; 10 11
6472    mova                m12, [r3+16*13] ; 12 13
6473    mova                m14, [r3+16*15] ; 14 15
6474    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6475    mova   [rsp+32*16+r5*8+1*32*16], m8
6476    mova   [rsp+40*16+r5*8+1*32*16], m10
6477    mova   [rsp+32*16+t1*8+1*32*16], m9
6478    mova   [rsp+32*16+t0*8+1*32*16], m11
    ; outputs 24-31 -> region 3
6479    mova                 m8, [r3+16* 8] ; 24 25
6480    mova                m10, [r3+16*10] ; 26 27
6481    mova                m12, [r3+16*12] ; 28 29
6482    mova                m14, [r3+16*14] ; 30 31
6483    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6484    mova   [rsp+32*16+r5*8+3*32*16], m8
6485    mova   [rsp+40*16+r5*8+3*32*16], m10
6486    mova   [rsp+32*16+t1*8+3*32*16], m9
6487    mova   [rsp+32*16+t0*8+3*32*16], m11
6488%else
    ; x86-32: round_dct32 left unpacked dword rows in the scratch buffer and
    ; advanced r3 by 8 slots; undo that, then pack, transpose and store
    ; eight outputs at a time.
6489    sub                  r3, 8*16
    ; slots 8-15 -> region 1
6490    mova                 m0, [r3+ 8*16]
6491    mova                 m2, [r3+10*16]
6492    mova                 m4, [r3+12*16]
6493    mova                 m6, [r3+14*16]
6494    packssdw             m0, [r3+ 9*16]
6495    packssdw             m2, [r3+11*16]
6496    packssdw             m4, [r3+13*16]
6497    packssdw             m6, [r3+15*16]
6498    call m(idct_8x4_internal_16bpc).transpose4x8packed
6499    mova   [rsp+32*16+r5*8+1*32*16], m0
6500    mova   [rsp+40*16+r5*8+1*32*16], m2
6501    mova   [rsp+32*16+t1*8+1*32*16], m1
6502    mova   [rsp+32*16+t0*8+1*32*16], m3
    ; slots 16-23 -> region 2
6503    mova                 m0, [r3+16*16]
6504    mova                 m2, [r3+18*16]
6505    mova                 m4, [r3+20*16]
6506    mova                 m6, [r3+22*16]
6507    packssdw             m0, [r3+17*16]
6508    packssdw             m2, [r3+19*16]
6509    packssdw             m4, [r3+21*16]
6510    packssdw             m6, [r3+23*16]
6511    call m(idct_8x4_internal_16bpc).transpose4x8packed
6512    mova   [rsp+32*16+r5*8+2*32*16], m0
6513    mova   [rsp+40*16+r5*8+2*32*16], m2
6514    mova   [rsp+32*16+t1*8+2*32*16], m1
6515    mova   [rsp+32*16+t0*8+2*32*16], m3
    ; slots 24-31, read in descending order -> region 3
6516    mova                 m0, [r3+31*16]
6517    mova                 m2, [r3+29*16]
6518    mova                 m4, [r3+27*16]
6519    mova                 m6, [r3+25*16]
6520    packssdw             m0, [r3+30*16]
6521    packssdw             m2, [r3+28*16]
6522    packssdw             m4, [r3+26*16]
6523    packssdw             m6, [r3+24*16]
6524    call m(idct_8x4_internal_16bpc).transpose4x8packed
6525    mova   [rsp+32*16+r5*8+3*32*16], m0
6526    mova   [rsp+40*16+r5*8+3*32*16], m2
6527    mova   [rsp+32*16+t1*8+3*32*16], m1
6528    mova   [rsp+32*16+t0*8+3*32*16], m3
    ; slots 0-7; stored to region 0 after the %endif
6529    mova                 m0, [r3+ 0*16]
6530    mova                 m2, [r3+ 2*16]
6531    mova                 m4, [r3+ 4*16]
6532    mova                 m6, [r3+ 6*16]
6533    packssdw             m0, [r3+ 1*16]
6534    packssdw             m2, [r3+ 3*16]
6535    packssdw             m4, [r3+ 5*16]
6536    packssdw             m6, [r3+ 7*16]
6537    call m(idct_8x4_internal_16bpc).transpose4x8packed
6538%endif
6539    pxor                 m7, m7
6540    ; clear lower half of [cq]
    ; (zeroes the current 16-byte group in all 32 rows of cq)
6541    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
6542                                     8, 9, 10, 11, 12, 13, 14, 15, \
6543                                     16, 17, 18, 19, 20, 21, 22, 23, \
6544                                     24, 25, 26, 27, 28, 29, 30, 31
    ; common to both paths: m0-m3 hold transposed outputs 0-7 -> region 0
6545    mova   [rsp+32*16+r5*8+0*32*16], m0
6546    mova   [rsp+40*16+r5*8+0*32*16], m2
6547    mova   [rsp+32*16+t1*8+0*32*16], m1
6548    mova   [rsp+32*16+t0*8+0*32*16], m3
    ; next (lower) group; the loop ends after the group at offset 0
6549    sub                 r5d, 2
6550    jge .loop_pass1
6551
6552    ; pass=2 code starts here
6553    mov                eobd, [rsp+gprsize*0+5*32*16]
    ; presumably re-bases rsp to the frame layout expected by the shared
    ; 16x32 pass-2 loop - TODO confirm against loop_pass2_entry
6554    add                 rsp, 29*16
    ; choose the 8 bpc SSSE3 idct_8x32 routine by eob and pass it in r4:
    ;   eob < 36 -> .main_veryfast, eob < 136 -> .main_fast, else .main
6555    cmp                eobd, 36
6556    jl .load_veryfast
6557    cmp                eobd, 136
6558    jl .load_fast
6559    ; load normal
6560    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
6561    jmp .run
6562.load_fast:
6563    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
6564    jmp .run
6565.load_veryfast:
6566    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
6567    ; fall-through
6568.run:
    ; loop state consumed by loop_pass2_entry (not shown in this part of the
    ; file): x86-64 passes dstq+64 in r2 and -8 in r7; x86-32 points r2 at a
    ; stack slot initialised to 4
6569%if ARCH_X86_64
6570    lea                  r2, [dstq+64]
6571    mov                  r7, -8
6572%else
6573    lea                  r2, [rsp+(4*32+3)*16]
6574    mov dword [r2+0*gprsize], 4
6575%endif
6576    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
6577
; DC-only path for 32x32: sets the row count to 32 and reuses the 32x8
; function's .dconly1 tail for scaling and the add/clamp loop.
6578.dconly:
6579    imul                r5d, [cq], 181
6580    mov                [cq], eobd ; 0
6581    mov                 r3d, 32
    ; shrink this function's (5*32+1)*16-byte frame to (24+8*ARCH_X86_32)*16
    ; bytes, presumably the frame size that the RET at the jump target
    ; unwinds (same size as declared by the 32x16 function; the 32x8
    ; declaration is not shown here) - TODO confirm
6582    add                 rsp, (5*32+1-(24+8*ARCH_X86_32))*16
6583    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
6584
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x64_16bpc (args: dst, stride, c, eob)
;
; Two-pass inverse transform for a 16x64 block (going by the symbol name:
; DCT in both directions, result added to dst, 16 bits per component).
;   eob == 0 : branch to .dconly.
;   pass 1   : each loop iteration loads 16 coefficient vectors (row stride
;              128 bytes, 16 bytes per row selected by r5*8), runs them
;              through the 16bpc idct helpers, packs to words, transposes,
;              scatters the result into two stack buffers (rsp+12*16 and
;              rsp+76*16) and clears the consumed coefficients.
;   pass 2   : .pass2 is called twice, once per stack buffer; the 8bpc SSSE3
;              routines it calls through t0/t1 are selected by eob < 151.
; Stack frame: (12+2*64)*16 bytes of vector scratch followed by
; (4+4*ARCH_X86_32) gpr-sized slots; slot 0 holds eob, the others hold saved
; registers (and, on x86-32, loop state and the two function pointers).
; ---------------------------------------------------------------------------
6585cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
6586                                          0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
6587                                          dst, stride, c, eob
6588    LEA                  r6, base
6589    test               eobd, eobd
6590    jz .dconly
6591
        ; Save the registers that are about to be reused as temporaries.
        ; x86-32: t0-t3 = r4, r1, r2, r0, so dst (r0), stride (r1) and c (r2)
        ; go to slots 1-3.
        ; x86-64: t0-t3 = r8, r9, r4, r7; r9 (and r7/r8 on WIN64) lie outside
        ; the 7 GPRs requested in cglobal and are saved by hand.
6592%if ARCH_X86_32
6593    DECLARE_REG_TMP 4, 1, 2, 0
6594    mov [rsp+gprsize*1+(64*2+12)*16], r0
6595    mov [rsp+gprsize*2+(64*2+12)*16], r1
6596    mov [rsp+gprsize*3+(64*2+12)*16], r2
6597%else
6598    DECLARE_REG_TMP 8, 9, 4, 7
6599    mov [rsp+gprsize*1+(64*2+12)*16], r9
6600%if WIN64
6601    mov [rsp+gprsize*2+(64*2+12)*16], r7
6602    mov [rsp+gprsize*3+(64*2+12)*16], r8
6603%endif
6604%endif
6605%undef cmp
6606    ; remove entirely-zero iterations
        ; r5 starts at the last table entry (7*2 bytes) and walks down while
        ; eob is below the 16-bit threshold stored in tbl_16x32_2d at r5.
        ; If eob already reaches the last threshold nothing is zeroed.
6607    mov                 r5d, 7*2
6608    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
6609    jge .end_zero_loop
6610    pxor                 m0, m0
6611.zero_loop:
        ; tbl_Nx64_offset holds 4 bytes per iteration: the low/high bytes of
        ; the two words become t0/t1 and t2/t3. Scaled by 8 they are the byte
        ; offsets of the four 16-byte slots this iteration would have written
        ; in each of the two pass-1 output buffers; zero them instead.
6612    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6613    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6614    movzx               t0d, t1b
6615    movzx               t2d, t3b
6616    shr                 t1d, 8
6617    shr                 t3d, 8
6618    mova   [rsp+12*16+t0*8], m0
6619    mova   [rsp+12*16+t1*8], m0
6620    mova   [rsp+12*16+t2*8], m0
6621    mova   [rsp+12*16+t3*8], m0
6622    mova   [rsp+76*16+t0*8], m0
6623    mova   [rsp+76*16+t1*8], m0
6624    mova   [rsp+76*16+t2*8], m0
6625    mova   [rsp+76*16+t3*8], m0
6626    sub                 r5d, 2
6627    cmp                eobw, word [o2(tbl_16x32_2d)+r5]
6628    jl .zero_loop
6629.end_zero_loop:
6630    ; actual first pass after skipping all-zero data
        ; Slot 0 keeps eob for the pass-2 decision; r3 = rsp is handed to the
        ; idct helpers (presumably as their scratch pointer).
6631    mov [rsp+gprsize*0+(64*2+12)*16], eobd
6632    mov                  r3, rsp
6633%if ARCH_X86_32
        ; t2 moves from r2 to r6: reload c into r2 from slot 3 and park r6
        ; (the base pointer set by LEA above) in that slot instead.
6634    DECLARE_REG_TMP 4, 1, 6, 0
6635    mov                  r2, [rsp+gprsize*3+(64*2+12)*16]
6636    mov [rsp+gprsize*3+(64*2+12)*16], r6
6637%endif
6638.loop_pass1:
6639%if ARCH_X86_64
        ; Constants kept in registers on x86-64; reloaded every iteration
        ; because m11-m14 are overwritten by the packssdw block below.
6640    mova                m11, [o(pd_2048)]
6641    mova                m12, [o(clip_18b_min)]
6642    mova                m13, [o(clip_18b_max)]
6643    mova                m14, [o(pd_2896)]
6644%endif
        ; odd-numbered input rows 1,3,...,15
6645    mova                 m0, [cq+ 1*128+r5*8]
6646    mova                 m1, [cq+ 3*128+r5*8]
6647    mova                 m2, [cq+ 5*128+r5*8]
6648    mova                 m3, [cq+ 7*128+r5*8]
6649    mova                 m4, [cq+ 9*128+r5*8]
6650    mova                 m5, [cq+11*128+r5*8]
6651    mova                 m6, [cq+13*128+r5*8]
6652    mova                 m7, [cq+15*128+r5*8]
6653    call m(idct_16x4_internal_16bpc).main_oddhalf
6654
        ; even-numbered input rows 0,2,...,14
6655    mova                 m0, [cq+ 0*128+r5*8]
6656    mova                 m1, [cq+ 2*128+r5*8]
6657    mova                 m2, [cq+ 4*128+r5*8]
6658    mova                 m3, [cq+ 6*128+r5*8]
6659    mova                 m4, [cq+ 8*128+r5*8]
6660    mova                 m5, [cq+10*128+r5*8]
6661    mova                 m6, [cq+12*128+r5*8]
6662    mova                 m7, [cq+14*128+r5*8]
6663    call m(idct_8x4_internal_16bpc).main_pass1
6664    call m(idct_8x4_internal_16bpc).round
6665    call m(idct_16x16_internal_16bpc).round
6666%if ARCH_X86_64
        ; narrow the sixteen dword vectors to eight word vectors (saturating)
6667    packssdw             m0, m1
6668    packssdw             m2, m3
6669    packssdw             m4, m5
6670    packssdw             m6, m7
6671    packssdw             m8, m9
6672    packssdw            m10, m11
6673    packssdw            m12, m13
6674    packssdw            m14, m15
6675%endif
6676    call m(idct_8x4_internal_16bpc).transpose4x8packed
        ; same 4-offset lookup as in .zero_loop
6677    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6678    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6679    movzx               t0d, t1b
6680    movzx               t2d, t3b
6681    shr                 t1d, 8
6682    shr                 t3d, 8
6683%if ARCH_X86_64
        ; x86-64: m8-m11 (after the "hi" transpose) go to the second buffer,
        ; m0-m3 to the first one in the common stores below.
6684    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6685    mova   [rsp+76*16+t0*8], m8
6686    mova   [rsp+76*16+t1*8], m9
6687    mova   [rsp+76*16+t2*8], m10
6688    mova   [rsp+76*16+t3*8], m11
6689%else
        ; x86-32: the vectors just transposed belong to the second buffer;
        ; the other half is then loaded from rsp+8*16..11*16 (presumably left
        ; there, already packed, by the round helper -- confirm) and
        ; transposed for the common stores below.
6690    mova   [rsp+76*16+t0*8], m0
6691    mova   [rsp+76*16+t1*8], m1
6692    mova   [rsp+76*16+t2*8], m2
6693    mova   [rsp+76*16+t3*8], m3
6694    mova                 m0, [rsp+ 8*16]
6695    mova                 m2, [rsp+ 9*16]
6696    mova                 m4, [rsp+10*16]
6697    mova                 m6, [rsp+11*16]
6698    call m(idct_8x4_internal_16bpc).transpose4x8packed
6699%endif
6700    mova   [rsp+12*16+t0*8], m0
6701    mova   [rsp+12*16+t1*8], m1
6702    mova   [rsp+12*16+t2*8], m2
6703    mova   [rsp+12*16+t3*8], m3
6704%if ARCH_X86_32
        ; t2 aliased r6 above; restore the base pointer from slot 3
6705    mov                  r6, [rsp+gprsize*3+(64*2+12)*16]
6706%endif
        ; clear the 16 coefficient vectors consumed by this iteration
6707    pxor                 m7, m7
6708    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
6709    sub                 r5d, 2
6710    jge .loop_pass1
6711
6712    ; pass=2
        ; eob >= 151 selects the fuller 8bpc routines, smaller eob the
        ; reduced ("fast"/"veryfast") ones. t0/t1 are called from .pass2.
6713    mov                eobd, [rsp+gprsize*0+(64*2+12)*16]
6714    cmp                eobd, 151
6715    jl .fast
6716    ; fall-through
6717%if ARCH_X86_64
6718    DECLARE_REG_TMP 8, 9
6719%else
6720    DECLARE_REG_TMP 1, 5
6721%endif
6722    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
6723    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
6724    jmp .run
6725.fast:
6726    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
6727    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
6728.run:
        ; Move rsp up so the first pass-1 buffer (formerly rsp+12*16) sits at
        ; rsp+3*16, which is where .pass2 reads its input (plus gprsize for
        ; its own return address).
6729    add                 rsp, 9*16
6730
6731%if ARCH_X86_64
        ; x86-64: r7 runs -4 -> -2 -> 0 and the next dstq is r2+r7*8, i.e.
        ; two calls, the second one 16 bytes to the right of the first.
6732    lea                  r2, [dstq+32]
6733    mov                  r7, -4
6734%else
        ; x86-32: r2 = base of the gpr slots (same address as before the rsp
        ; adjustment). Slot 0 becomes the iteration counter, slot 1 still
        ; holds dst, stride is reloaded into r1 from slot 2, and slots 4/5
        ; receive the two function pointers.
6735    lea                  r2, [rsp+(64*2+3)*16]
6736    mov      [r2+4*gprsize], t0
6737    mov      [r2+5*gprsize], t1
6738    mov                  r1, [r2+2*gprsize]
6739    mov dword [r2+0*gprsize], 2
6740%endif
6741.loop_pass2:
        ; This label is also the entry point used by the 32x64 function.
6742%if ARCH_X86_32
6743    mov                dstq, [r2+1*gprsize]
6744%endif
6745    call .pass2
        ; step to the next 64-vector buffer
6746    add                 rsp, 64*16
6747%if ARCH_X86_64
        ; lea leaves the flags from the add untouched for the jl
6748    add                  r7, 2
6749    lea                dstq, [r2+r7*8]
6750    jl .loop_pass2
6751%else
6752    add dword [r2+1*gprsize], 16
6753    dec dword [r2+0*gprsize]
6754    jg .loop_pass2
6755%endif
        ; rsp has been advanced by (64*2+9)*16 bytes in total; update the
        ; assembler-side stack bookkeeping to match so the code below (and
        ; presumably RET) addresses the remaining frame correctly.
6756%assign stack_size (stack_size-(64*2+9)*16)
6757%if STACK_ALIGNMENT >= 16
6758%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
6759%assign stack_offset (stack_offset-(64*2+9)*16)
6760%else
6761%xdefine rstkm [rsp + stack_size]
6762%endif
6763%if ARCH_X86_64
        ; restore the registers saved on entry; the slots are now at
        ; rsp+3*16 because of the rsp adjustments above
6764    mov                  r9, [rsp+gprsize*1+3*16]
6765%if WIN64
6766    mov                  r7, [rsp+gprsize*2+3*16]
6767    mov                  r8, [rsp+gprsize*3+3*16]
6768%endif
6769%endif
6770    RET
6771
; ---------------------------------------------------------------------------
; .pass2: second pass for one 64-vector stack buffer, followed by the write
; to dst. Reached with `call`, so every stack offset carries +gprsize for the
; return address.
; In:  buffer starting at [rsp+gprsize+3*16]; dstq/strideq;
;      x86-64: r8/r9 = the two 8bpc routines chosen by the caller;
;      x86-32: the same pointers in [r2+4*gprsize] / [r2+5*gprsize].
; The work is done by 8bpc SSSE3 helpers defined outside this chunk. The
; final loop writes 8 groups of 8 vectors through round1_and_write_8x8.
; ---------------------------------------------------------------------------
6772.pass2:
6773%if ARCH_X86_32
        ; presumably the base address the 8bpc SSSE3 code expects in r5 --
        ; confirm against the 8bpc source
6774    lea                  r5, [o(itx8_start)]
6775%endif
        ; first four buffer vectors, upper four inputs forced to zero
6776    mova                 m0, [rsp+gprsize+16* 3]
6777    mova                 m1, [rsp+gprsize+16* 4]
6778    mova                 m2, [rsp+gprsize+16* 5]
6779    mova                 m3, [rsp+gprsize+16* 6]
6780    pxor                 m4, m4
6781    REPX       {mova x, m4}, m5, m6, m7
6782    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
6783    mova [rsp+gprsize+ 3*16], m0
6784    mova [rsp+gprsize+ 4*16], m1
6785    mova [rsp+gprsize+ 5*16], m2
6786    mova [rsp+gprsize+ 6*16], m3
6787    mova [rsp+gprsize+ 7*16], m4
6788    mova [rsp+gprsize+ 8*16], m5
6789    mova [rsp+gprsize+ 9*16], m6
6790    mova [rsp+gprsize+10*16], m7
        ; next four buffer vectors, again with zeroed upper inputs
6791    mova                 m0, [rsp+gprsize+16*11]
6792    mova                 m1, [rsp+gprsize+16*12]
6793    mova                 m2, [rsp+gprsize+16*13]
6794    mova                 m3, [rsp+gprsize+16*14]
6795    pxor                 m4, m4
6796    REPX       {mova x, m4}, m5, m6, m7
6797    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
        ; m7 is fetched from slot 0 (presumably where the helper above left
        ; its last output -- confirm) before the eight results are stored
6798    mova                 m7, [rsp+gprsize+ 0*16]
6799    mova [rsp+gprsize+11*16], m0
6800    mova [rsp+gprsize+12*16], m1
6801    mova [rsp+gprsize+13*16], m2
6802    mova [rsp+gprsize+14*16], m3
6803    mova [rsp+gprsize+15*16], m4
6804    mova [rsp+gprsize+16*16], m5
6805    mova [rsp+gprsize+17*16], m6
6806    mova [rsp+gprsize+18*16], m7
        ; first caller-selected routine (t0 in the caller)
6807%if ARCH_X86_64
6808    call                  r8
6809%else
6810    call      [r2+4*gprsize]
6811%endif
6812    mova [rsp+gprsize+ 3*16], m0
6813    mova [rsp+gprsize+ 5*16], m2
6814    mova [rsp+gprsize+ 8*16], m5
6815    mova [rsp+gprsize+10*16], m7
        ; second caller-selected routine (t1 in the caller); on x86-64 the
        ; constants for the write helper are then loaded into m8-m10
6816%if ARCH_X86_64
6817    call                 r9
6818    mova                 m8, [o(pw_2048)]
6819    pxor                 m9, m9
6820    mova                m10, [o(pixel_10bpc_max)]
6821%else
6822    call     [r2+5*gprsize]
6823%endif
        ; r3 = 3*stride, r4 = read pointer into the finished buffer,
        ; loop counter = 8 (r6d on x86-64, slot 2 on x86-32)
6824    lea                  r3, [strideq*3]
6825    lea                  r4, [rsp+gprsize+ 3*16]
6826%if ARCH_X86_64
6827    mov                 r6d, 8
6828%else
6829    mov dword [r2+2*gprsize], 8
6830%endif
6831.loop_write:
6832    mova                 m0, [r4+0*16]
6833    mova                 m1, [r4+1*16]
6834    mova                 m2, [r4+2*16]
6835    mova                 m3, [r4+3*16]
6836    mova                 m4, [r4+4*16]
6837    mova                 m5, [r4+5*16]
6838    mova                 m6, [r4+6*16]
6839    mova                 m7, [r4+7*16]
6840    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
        ; advance dst by 8 rows and the source pointer by 8 vectors
6841    lea                dstq, [dstq+strideq*8]
6842    add                  r4, 8*16
6843%if ARCH_X86_64
6844    dec                 r6d
6845%else
6846    dec dword [r2+2*gprsize]
6847%endif
6848    jg .loop_write
6849    ret
6850
; .dconly: taken when eob == 0. Scales the first coefficient, clears it, and
; hands over to the shared DC-only tail of the 16x4 function.
;   r5d = (c[0]*181 + 640) >> 10
;   r3d = 64 (presumably the row count expected by .dconly2 -- confirm)
; rsp is raised by the difference between this function's frame and an
; (8+4*ARCH_X86_32)*16-byte frame (presumably the 16x4 function's frame size,
; so that its epilogue unwinds correctly -- confirm at the jump target).
6851.dconly:
6852    imul                r5d, [cq], 181
6853    mov                [cq], eobd ; 0
6854    mov                 r3d, 64
6855    add                 r5d, 640
6856    sar                 r5d, 10
6857    add                 rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
6858    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
6859
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_32x64_16bpc (args: dst, stride, c, eob)
;
; Two-pass inverse transform for a 32x64 block (going by the symbol name:
; DCT in both directions, result added to dst, 16 bits per component).
;   eob == 0 : branch to .dconly.
;   pass 1   : each loop iteration loads 32 coefficient vectors (row stride
;              128 bytes) in four groups of eight, passes every group through
;              rect2_mul and the 16bpc idct helpers, transposes the result and
;              scatters it into four stack buffers at rsp+32*16, +96*16,
;              +160*16 and +224*16, then clears the consumed coefficients.
;   pass 2   : selects the 8bpc routines by eob < 136, then jumps into the
;              16x64 function's .loop_pass2 for four iterations.
; Stack frame: (32+4*64)*16 bytes of vector scratch followed by
; (4+4*ARCH_X86_32) gpr-sized slots; slot 0 holds eob.
; ---------------------------------------------------------------------------
6860cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
6861                                          0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
6862                                          dst, stride, c, eob
6863    LEA                  r6, base
6864    test               eobd, eobd
6865    jz .dconly
6866
        ; Save the registers that are about to be reused as temporaries.
        ; x86-32: t0-t3 = r4, r1, r2, r0, so dst/stride/c go to slots 1-3.
        ; x86-64: t0-t3 = r8, r9, r4, r7; r9 (and r7/r8 on WIN64) are saved
        ; by hand since cglobal only requested 7 GPRs.
6867%if ARCH_X86_32
6868    DECLARE_REG_TMP 4, 1, 2, 0
6869    mov [rsp+gprsize*1+(64*4+32)*16], r0
6870    mov [rsp+gprsize*2+(64*4+32)*16], r1
6871    mov [rsp+gprsize*3+(64*4+32)*16], r2
6872%else
6873    DECLARE_REG_TMP 8, 9, 4, 7
6874    mov [rsp+gprsize*1+(64*4+32)*16], r9
6875%if WIN64
6876    mov [rsp+gprsize*2+(64*4+32)*16], r7
6877    mov [rsp+gprsize*3+(64*4+32)*16], r8
6878%endif
6879%endif
6880%undef cmp
6881    ; remove entirely-zero iterations
        ; r5 walks down from 7*2 while eob is below the 16-bit threshold in
        ; tbl_32x32_2d at r5; for every skipped iteration the slots it would
        ; have written in the four pass-1 buffers are zeroed instead.
6882    mov                 r5d, 7*2
6883    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6884    jge .end_zero_loop
6885    pxor                 m0, m0
6886.zero_loop:
        ; four byte-sized offsets per iteration (low/high byte of two words),
        ; each scaled by 8 when used as an address
6887    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6888    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6889    movzx               t0d, t1b
6890    movzx               t2d, t3b
6891    shr                 t1d, 8
6892    shr                 t3d, 8
6893    mova  [rsp+ 32*16+t0*8], m0
6894    mova  [rsp+ 32*16+t1*8], m0
6895    mova  [rsp+ 32*16+t2*8], m0
6896    mova  [rsp+ 32*16+t3*8], m0
6897    mova  [rsp+ 96*16+t0*8], m0
6898    mova  [rsp+ 96*16+t1*8], m0
6899    mova  [rsp+ 96*16+t2*8], m0
6900    mova  [rsp+ 96*16+t3*8], m0
6901    mova  [rsp+160*16+t0*8], m0
6902    mova  [rsp+160*16+t1*8], m0
6903    mova  [rsp+160*16+t2*8], m0
6904    mova  [rsp+160*16+t3*8], m0
6905    mova  [rsp+224*16+t0*8], m0
6906    mova  [rsp+224*16+t1*8], m0
6907    mova  [rsp+224*16+t2*8], m0
6908    mova  [rsp+224*16+t3*8], m0
6909    sub                 r5d, 2
6910    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
6911    jl .zero_loop
6912.end_zero_loop:
6913    ; actual first pass after skipping all-zero data
        ; slot 0 keeps eob for the pass-2 decision
6914    mov [rsp+gprsize*0+(64*4+32)*16], eobd
6915    mov                  r3, rsp
6916%if ARCH_X86_32
        ; t2 moves from r2 to r6: reload c into r2 from slot 3 and park r6
        ; (the base pointer set by LEA above) in that slot instead.
6917    DECLARE_REG_TMP 4, 1, 6, 0
6918    mov                  r2, [rsp+gprsize*3+(64*4+32)*16]
6919    mov [rsp+gprsize*3+(64*4+32)*16], r6
6920%endif
6921.loop_pass1:
6922%if ARCH_X86_64
        ; constants kept in registers on x86-64, reloaded every iteration
6923    mova                m11, [o(pd_2048)]
6924    mova                m12, [o(clip_18b_min)]
6925    mova                m13, [o(clip_18b_max)]
6926    mova                m14, [o(pd_2896)]
6927%endif
        ; group 1: rows 1,7,9,15,17,23,25,31
6928    mova                 m0, [cq+128* 1+r5*8]
6929    mova                 m1, [cq+128* 7+r5*8]
6930    mova                 m2, [cq+128* 9+r5*8]
6931    mova                 m3, [cq+128*15+r5*8]
6932    mova                 m4, [cq+128*17+r5*8]
6933    mova                 m5, [cq+128*23+r5*8]
6934    mova                 m6, [cq+128*25+r5*8]
6935    mova                 m7, [cq+128*31+r5*8]
        ; r3 is re-based to rsp at the start of every iteration
6936    mov                  r3, rsp
6937    call m(idct_8x4_internal_16bpc).rect2_mul
6938    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
6939
        ; group 2: rows 3,5,11,13,19,21,27,29
6940    mova                 m0, [cq+128* 3+r5*8]
6941    mova                 m1, [cq+128* 5+r5*8]
6942    mova                 m2, [cq+128*11+r5*8]
6943    mova                 m3, [cq+128*13+r5*8]
6944    mova                 m4, [cq+128*19+r5*8]
6945    mova                 m5, [cq+128*21+r5*8]
6946    mova                 m6, [cq+128*27+r5*8]
6947    mova                 m7, [cq+128*29+r5*8]
        ; x86-32 only: r3 is moved up by 8 vectors just for this rect2_mul
        ; call (presumably so its spills do not overwrite what part1 stored --
        ; confirm) and moved back afterwards
6948%if ARCH_X86_32
6949    add                  r3, 16*8
6950%endif
6951    call m(idct_8x4_internal_16bpc).rect2_mul
6952%if ARCH_X86_32
6953    sub                  r3, 16*8
6954%endif
6955    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
        ; move r3 past the part1/part2 area for the two groups below; undone
        ; before round_dct32
6956    add                  r3, 16*(16+4*ARCH_X86_32)
6957
        ; group 3: rows 2,6,10,...,30
6958    mova                 m0, [cq+128* 2+r5*8]
6959    mova                 m1, [cq+128* 6+r5*8]
6960    mova                 m2, [cq+128*10+r5*8]
6961    mova                 m3, [cq+128*14+r5*8]
6962    mova                 m4, [cq+128*18+r5*8]
6963    mova                 m5, [cq+128*22+r5*8]
6964    mova                 m6, [cq+128*26+r5*8]
6965    mova                 m7, [cq+128*30+r5*8]
6966    call m(idct_8x4_internal_16bpc).rect2_mul
6967    call m(idct_16x4_internal_16bpc).main_oddhalf
6968
        ; group 4: rows 0,4,8,...,28
6969    mova                 m0, [cq+128* 0+r5*8]
6970    mova                 m1, [cq+128* 4+r5*8]
6971    mova                 m2, [cq+128* 8+r5*8]
6972    mova                 m3, [cq+128*12+r5*8]
6973    mova                 m4, [cq+128*16+r5*8]
6974    mova                 m5, [cq+128*20+r5*8]
6975    mova                 m6, [cq+128*24+r5*8]
6976    mova                 m7, [cq+128*28+r5*8]
6977    call m(idct_8x4_internal_16bpc).rect2_mul
6978    call m(idct_8x4_internal_16bpc).main_pass1
6979    call m(idct_8x4_internal_16bpc).round
6980    sub                  r3, 16*(16+4*ARCH_X86_32)
6981    call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
6982
        ; same 4-offset lookup as in .zero_loop
6983    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
6984    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
6985    movzx               t0d, t1b
6986    movzx               t2d, t3b
6987    shr                 t1d, 8
6988    shr                 t3d, 8
6989%if ARCH_X86_64
        ; x86-64: m0-m3 are stored to the rsp+32*16 buffer after %endif; the
        ; other three quarters pass through m8-m11 one after another.
6990    call m(idct_8x4_internal_16bpc).transpose4x8packed
6991    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
6992    mova  [rsp+160*16+t0*8], m8
6993    mova  [rsp+160*16+t1*8], m9
6994    mova  [rsp+160*16+t2*8], m10
6995    mova  [rsp+160*16+t3*8], m11
6996    mova                 m8, [r3+16* 9] ;  8  9
6997    mova                m10, [r3+16*11] ; 10 11
6998    mova                m12, [r3+16*13] ; 12 13
6999    mova                m14, [r3+16*15] ; 14 15
7000    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
7001    mova  [rsp+ 96*16+t0*8], m8
7002    mova  [rsp+ 96*16+t1*8], m9
7003    mova  [rsp+ 96*16+t2*8], m10
7004    mova  [rsp+ 96*16+t3*8], m11
7005    mova                 m8, [r3+16* 8] ; 24 25
7006    mova                m10, [r3+16*10] ; 26 27
7007    mova                m12, [r3+16*12] ; 28 29
7008    mova                m14, [r3+16*14] ; 30 31
7009    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
7010    mova  [rsp+224*16+t0*8], m8
7011    mova  [rsp+224*16+t1*8], m9
7012    mova  [rsp+224*16+t2*8], m10
7013    mova  [rsp+224*16+t3*8], m11
7014%else
        ; x86-32: the 32 dword results are read back from memory relative to
        ; r3 (after stepping it down by 8 vectors), packed to words in pairs
        ; and transposed 8 vectors at a time. Vectors 8-15 go to the 96*16
        ; buffer, 16-23 to 160*16, 31..24 (read in descending order) to
        ; 224*16, and 0-7 to 32*16 via the common stores after %endif.
7015    sub                  r3, 8*16
7016    mova                 m0, [r3+ 8*16]
7017    mova                 m2, [r3+10*16]
7018    mova                 m4, [r3+12*16]
7019    mova                 m6, [r3+14*16]
7020    packssdw             m0, [r3+ 9*16]
7021    packssdw             m2, [r3+11*16]
7022    packssdw             m4, [r3+13*16]
7023    packssdw             m6, [r3+15*16]
7024    call m(idct_8x4_internal_16bpc).transpose4x8packed
7025    mova  [rsp+ 96*16+t0*8], m0
7026    mova  [rsp+ 96*16+t1*8], m1
7027    mova  [rsp+ 96*16+t2*8], m2
7028    mova  [rsp+ 96*16+t3*8], m3
7029    mova                 m0, [r3+16*16]
7030    mova                 m2, [r3+18*16]
7031    mova                 m4, [r3+20*16]
7032    mova                 m6, [r3+22*16]
7033    packssdw             m0, [r3+17*16]
7034    packssdw             m2, [r3+19*16]
7035    packssdw             m4, [r3+21*16]
7036    packssdw             m6, [r3+23*16]
7037    call m(idct_8x4_internal_16bpc).transpose4x8packed
7038    mova  [rsp+160*16+t0*8], m0
7039    mova  [rsp+160*16+t1*8], m1
7040    mova  [rsp+160*16+t2*8], m2
7041    mova  [rsp+160*16+t3*8], m3
7042    mova                 m0, [r3+31*16]
7043    mova                 m2, [r3+29*16]
7044    mova                 m4, [r3+27*16]
7045    mova                 m6, [r3+25*16]
7046    packssdw             m0, [r3+30*16]
7047    packssdw             m2, [r3+28*16]
7048    packssdw             m4, [r3+26*16]
7049    packssdw             m6, [r3+24*16]
7050    call m(idct_8x4_internal_16bpc).transpose4x8packed
7051    mova  [rsp+224*16+t0*8], m0
7052    mova  [rsp+224*16+t1*8], m1
7053    mova  [rsp+224*16+t2*8], m2
7054    mova  [rsp+224*16+t3*8], m3
7055    mova                 m0, [r3+ 0*16]
7056    mova                 m2, [r3+ 2*16]
7057    mova                 m4, [r3+ 4*16]
7058    mova                 m6, [r3+ 6*16]
7059    packssdw             m0, [r3+ 1*16]
7060    packssdw             m2, [r3+ 3*16]
7061    packssdw             m4, [r3+ 5*16]
7062    packssdw             m6, [r3+ 7*16]
7063    call m(idct_8x4_internal_16bpc).transpose4x8packed
7064%endif
7065    mova  [rsp+ 32*16+t0*8], m0
7066    mova  [rsp+ 32*16+t1*8], m1
7067    mova  [rsp+ 32*16+t2*8], m2
7068    mova  [rsp+ 32*16+t3*8], m3
        ; clear the 32 coefficient vectors consumed by this iteration
7069    pxor                 m0, m0
7070    REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
7071                                     8, 9, 10, 11, 12, 13, 14, 15, \
7072                                     16, 17, 18, 19, 20, 21, 22, 23, \
7073                                     24, 25, 26, 27, 28, 29, 30, 31
7074%if ARCH_X86_32
        ; t2 aliased r6 above; restore the base pointer from slot 3
7075    mov                  r6, [rsp+gprsize*3+(64*4+32)*16]
7076%endif
7077    sub                 r5d, 2
7078    jge .loop_pass1
7079
7080    ; pass=2
        ; eob >= 136 selects the fuller 8bpc routines, smaller eob the
        ; reduced ("fast"/"veryfast") ones.
7081    mov                eobd, [rsp+gprsize*0+(64*4+32)*16]
7082    cmp                eobd, 136
7083    jl .fast
7084    ; fall-through
7085%if ARCH_X86_64
7086    DECLARE_REG_TMP 8, 9
7087%else
7088    DECLARE_REG_TMP 1, 5
7089%endif
7090    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
7091    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
7092    jmp .run
7093.fast:
7094    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
7095    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
7096.run:
        ; Move rsp up so the first buffer (formerly rsp+32*16) sits at
        ; rsp+3*16, the layout used by the 16x64 pass-2 code. After the four
        ; iterations (4*64*16 more bytes) the remaining frame is 3*16 bytes
        ; plus the gpr slots, the same residue the 16x64 epilogue reads its
        ; saved registers from.
7097    add                 rsp, 29*16
7098
7099%if ARCH_X86_64
        ; x86-64: r7 runs -8, -6, ..., 0 and dstq becomes r2+r7*8, i.e. four
        ; calls 16 bytes apart
7100    lea                  r2, [dstq+64]
7101    mov                  r7, -8
7102%else
        ; x86-32: r2 = base of the gpr slots; slot 0 = iteration counter (4),
        ; slots 4/5 = function pointers, stride reloaded into r1 from slot 2
7103    lea                  r2, [rsp+(64*4+3)*16]
7104    mov      [r2+4*gprsize], t0
7105    mov      [r2+5*gprsize], t1
7106    mov                  r1, [r2+2*gprsize]
7107    mov dword [r2+0*gprsize], 4
7108%endif
7109    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
7110
; .dconly: taken when eob == 0. Scales the first coefficient in two steps,
; clears it, and hands over to the shared DC-only tail of the 32x8 function.
;   r5d = (c[0]*181 + 128) >> 8
;   r5d = (r5d*181  + 384) >> 9
;   r3d = 64 (presumably the row count expected by .dconly2 -- confirm)
; rsp is raised by the difference between this function's frame and a
; (24+8*ARCH_X86_32)*16-byte frame (presumably the 32x8 function's frame
; size, so that its epilogue unwinds correctly -- confirm at the jump target).
7111.dconly:
7112    imul                r5d, [cq], 181
7113    mov                [cq], eobd ; 0
7114    mov                 r3d, 64
7115    add                 r5d, 128
7116    sar                 r5d, 8
7117    imul                r5d, 181
7118    add                 r5d, 384
7119    sar                 r5d, 9
7120    add                 rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
7121    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
7122
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x16_16bpc (args: dst, stride, c, eob)
;
; Entry point and pass-1 driver for a 64x16 block (going by the symbol name:
; DCT in both directions, result added to dst, 16 bits per component).
;   eob == 0 : branch to .dconly (defined outside this excerpt).
;   pass 1   : each loop iteration reads 32 coefficient vectors (row stride
;              64 bytes, 16 bytes per row selected by r5*8): the 16 odd rows
;              feed the idct64 helpers .main_part1/.main_part2, the 16 even
;              rows feed the "fast" 32/16/8-point helpers; .main_end_loop_start
;              and .shift_transpose (both outside this excerpt) finish the
;              iteration.
;   pass 2   : a single call to .pass2.
; Stack frame: (64+8*ARCH_X86_32)*16 bytes.
; ---------------------------------------------------------------------------
7123cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
7124                                         dst, stride, c, eob
7125    LEA                  r6, base
7126    test               eobd, eobd
7127    jz .dconly
7128
7129    ; remove entirely-zero iterations
        ; r5 is lowered in steps of 2 from 8 until eob reaches the 16-bit
        ; threshold stored in tbl_32x16_2d at r5; pass 1 then starts there.
        ; Unlike the Nx64 functions, nothing is zero-filled here.
7130%undef cmp
7131    mov                 r5d, 8
7132.zero_loop:
7133    sub                 r5d, 2
7134    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
7135    jl .zero_loop
7136
7137    ; actual first pass after skipping all-zero data
7138.loop_pass1:
7139%if ARCH_X86_64
        ; constants kept in registers on x86-64, reloaded every iteration
7140    mova                m11, [o(pd_2048)]
7141    mova                m12, [o(clip_18b_min)]
7142    mova                m13, [o(clip_18b_max)]
7143    mova                m14, [o(pd_2896)]
7144%endif
7145
        ; r3 = output cursor, r4 = multiplier table cursor; .main_part1
        ; advances r3 by 8 vectors and r4 by 12 dwords per call
7146    mov                  r3, rsp
7147    lea                  r4, [o(idct64_mul_16bpc)]
7148    mova                 m0, [cq+64* 1+r5*8]
7149    mova                 m1, [cq+64*31+r5*8]
7150    mova                 m2, [cq+64*17+r5*8]
7151    mova                 m3, [cq+64*15+r5*8]
7152    call .main_part1
7153    mova                 m0, [cq+64* 7+r5*8]
7154    mova                 m1, [cq+64*25+r5*8]
7155    mova                 m2, [cq+64*23+r5*8]
7156    mova                 m3, [cq+64* 9+r5*8]
7157    call .main_part1
7158    mova                 m0, [cq+64* 5+r5*8]
7159    mova                 m1, [cq+64*27+r5*8]
7160    mova                 m2, [cq+64*21+r5*8]
7161    mova                 m3, [cq+64*11+r5*8]
7162    call .main_part1
7163    mova                 m0, [cq+64* 3+r5*8]
7164    mova                 m1, [cq+64*29+r5*8]
7165    mova                 m2, [cq+64*19+r5*8]
7166    mova                 m3, [cq+64*13+r5*8]
7167    call .main_part1
7168    call .main_part2
7169
        ; even rows: 2,14,18,30 and 6,10,22,26 go to the two "fast" halves of
        ; the 32x8 odd-half helper
7170    mova                 m0, [cq+64* 2+r5*8]
7171    mova                 m1, [cq+64*14+r5*8]
7172    mova                 m2, [cq+64*18+r5*8]
7173    mova                 m3, [cq+64*30+r5*8]
7174    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7175
7176    mova                 m0, [cq+64* 6+r5*8]
7177    mova                 m1, [cq+64*10+r5*8]
7178    mova                 m2, [cq+64*22+r5*8]
7179    mova                 m3, [cq+64*26+r5*8]
7180    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
7181    add                  r3, 16*(24+4*ARCH_X86_32)
7182
        ; rows 4,12,20,28
7183    mova                 m0, [cq+64* 4+r5*8]
7184    mova                 m1, [cq+64*12+r5*8]
7185    mova                 m2, [cq+64*20+r5*8]
7186    mova                 m3, [cq+64*28+r5*8]
7187    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
7188
        ; rows 0,8,16,24
7189    mova                 m0, [cq+64* 0+r5*8]
7190    mova                 m1, [cq+64* 8+r5*8]
7191    mova                 m2, [cq+64*16+r5*8]
7192    mova                 m3, [cq+64*24+r5*8]
7193    call m(idct_8x4_internal_16bpc).main_pass1_fast
7194    call m(idct_8x4_internal_16bpc).round
        ; spill m1-m7 just below r3; m0 stays in its register
7195    mova [r3-(7+4*ARCH_X86_32)*16], m1
7196    mova [r3-(6+4*ARCH_X86_32)*16], m2
7197    mova [r3-(5+4*ARCH_X86_32)*16], m3
7198    mova [r3-(4+4*ARCH_X86_32)*16], m4
7199    mova [r3-(3+4*ARCH_X86_32)*16], m5
7200    mova [r3-(2+4*ARCH_X86_32)*16], m6
7201    mova [r3-(1+4*ARCH_X86_32)*16], m7
7202    sub                  r3, 16*(40+4*ARCH_X86_32-4)
7203
        ; pd_2 for .main_end_loop_start: derived from m11 (pd_2048) on
        ; x86-64, loaded from memory into m7 on x86-32
7204%if ARCH_X86_64
7205    psrld               m15, m11, 10 ; pd_2
7206%else
7207    mova                 m7, [o(pd_2)]
7208%endif
7209    call .main_end_loop_start
7210
7211    lea                  r3, [rsp+56*16]
7212    lea                  r4, [cq+r5*8+64*28]
7213    call .shift_transpose
7214    sub                 r5d, 2
7215    jge .loop_pass1
7216
7217    ; pass=2, we need to call this otherwise the stack pointer has
7218    ; the wrong offset in the 8-bit code
7219    call .pass2
7220    RET
7221
; .pass2: sets up registers and tail-jumps into the 16x16 function's pass-2
; loop. Reached with `call`, so the `ret` executed by the shared code returns
; to the caller of .pass2.
;   x86-64: m8 = pw_2048, m9 = 0, m10 = pixel_10bpc_max; r7 = dst
;           (WIN64 first saves r7 at [rsp+16*16+gprsize]).
;   x86-32: dst is stored at [rsp+2*gprsize+16*16].
;   both  : r3 = 3*stride, r4d = 8 (presumably the iteration count of the
;           shared loop -- confirm at the jump target).
7222.pass2:
7223%if ARCH_X86_64
7224    mova                 m8, [o(pw_2048)]
7225    pxor                 m9, m9
7226    mova                m10, [o(pixel_10bpc_max)]
7227%if WIN64
7228    mov [rsp+16*16+gprsize], r7
7229%endif
7230    mov                  r7, dstq
7231%else
7232    mov [rsp+2*gprsize+16*16], dstq
7233%endif
7234    lea                  r3, [strideq*3]
7235    mov                 r4d, 8
7236    jmp m(idct_16x16_internal_16bpc).loop_pass2
7237
; ---------------------------------------------------------------------------
; .main_part1
; In:  m0-m3 = four input vectors (see the in.. -> t.. table below),
;      r4    = pointer to 12 dword multipliers, each broadcast to all lanes
;              before use,
;      r3    = output pointer.
;      x86-64 also reads m11 (rounding constant), m12/m13 (clip min/max); the
;      pass-1 loop visible in this file loads them from pd_2048,
;      clip_18b_min and clip_18b_max. x86-32 loads the same constants from
;      memory and uses [r3+0*16..4*16] as spill space along the way.
; Out: eight vectors written to [r3+16*0 .. r3+16*7];
;      r3 advanced by 16*8 bytes, r4 by 4*12 bytes.
; Products are rounded with +2048 and shifted right by 12; intermediate
; sums/differences are clamped with pmaxsd/pminsd against the clip constants.
; ---------------------------------------------------------------------------
7238.main_part1: ; idct64 steps 1-5
7239    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
7240    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
7241    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
7242    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
7243%if ARCH_X86_64
        ; multipliers 0-7: each input is multiplied by two of them
7244    movd                 m7, [r4+4*0]
7245    movd                 m8, [r4+4*1]
7246    movd                 m6, [r4+4*2]
7247    movd                 m9, [r4+4*3]
7248    movd                 m5, [r4+4*4]
7249    movd                m10, [r4+4*5]
7250    movd                 m4, [r4+4*6]
7251    movd                m15, [r4+4*7]
7252    REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
7253    pmulld               m7, m0     ; t63a
7254    pmulld               m0, m8     ; t32a
7255    pmulld               m6, m1     ; t62a
7256    pmulld               m1, m9     ; t33a
7257    pmulld               m5, m2     ; t61a
7258    pmulld               m2, m10    ; t34a
7259    pmulld               m4, m3     ; t60a
7260    pmulld               m3, m15    ; t35a
        ; multipliers 8/9 for the first pair of ITX_MULSUB_2D rotations
7261    movd                m10, [r4+4*8]
7262    movd                m15, [r4+4*9]
7263    REPX {pshufd x, x, q0000}, m10, m15
7264    REPX     {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
7265    REPX     {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
7266    psubd                m8, m0, m1 ; t33
7267    paddd                m0, m1     ; t32
7268    psubd                m1, m7, m6 ; t62
7269    paddd                m7, m6     ; t63
7270    psubd                m6, m3, m2 ; t34
7271    paddd                m3, m2     ; t35
7272    psubd                m2, m4, m5 ; t61
7273    paddd                m4, m5     ; t60
7274    REPX    {pmaxsd x, m12}, m8, m1, m6, m2
7275    REPX    {pminsd x, m13}, m8, m1, m6, m2
7276    ITX_MULSUB_2D         1, 8, 5, 9, _, 11, 10, 15    ; t33a, t62a
7277    ITX_MULSUB_2D         2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
7278    REPX    {pmaxsd x, m12}, m0, m3, m7, m4
7279    REPX    {pminsd x, m13}, m0, m3, m7, m4
        ; multipliers 10/11 for the second pair of rotations
7280    movd                m10, [r4+4*10]
7281    movd                m15, [r4+4*11]
7282    REPX {pshufd x, x, q0000}, m10, m15
7283    psubd                m5, m0, m3 ; t35a
7284    paddd                m0, m3     ; t32a
7285    psubd                m3, m7, m4 ; t60a
7286    paddd                m7, m4     ; t63a
7287    psubd                m4, m1, m6 ; t34
7288    paddd                m1, m6     ; t33
7289    psubd                m6, m8, m2 ; t61
7290    paddd                m8, m2     ; t62
7291    REPX    {pmaxsd x, m12}, m5, m3, m4, m6
7292    REPX    {pminsd x, m13}, m5, m3, m4, m6
7293    ITX_MULSUB_2D         3, 5, 2, 9, _, 11, 10, 15 ; t35,  t60
7294    ITX_MULSUB_2D         6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
7295    REPX    {pmaxsd x, m12}, m0, m7, m1, m8
7296    REPX    {pminsd x, m13}, m0, m7, m1, m8
7297    add                  r4, 4*12
7298    mova          [r3+16*0], m0
7299    mova          [r3+16*7], m7
7300    mova          [r3+16*1], m1
7301    mova          [r3+16*6], m8
7302    mova          [r3+16*2], m6
7303    mova          [r3+16*5], m4
7304    mova          [r3+16*3], m3
7305    mova          [r3+16*4], m5
7306%else
        ; x86-32: only m0-m7 are available, so the same computation is done
        ; with intermediate values spilled to [r3+0*16 .. r3+4*16].
7307    movd                 m7, [r4+4*0]
7308    movd                 m6, [r4+4*2]
7309    movd                 m5, [r4+4*4]
7310    movd                 m4, [r4+4*6]
7311    REPX {pshufd x, x, q0000}, m7, m6, m5, m4
7312    pmulld               m7, m0     ; t63a
7313    pmulld               m6, m1     ; t62a
7314    pmulld               m5, m2     ; t61a
7315    pmulld               m4, m3     ; t60a
        ; park t62a/t63a so m6/m7 can hold the next multipliers
7316    mova          [r3+0*16], m6
7317    mova          [r3+1*16], m7
7318    movd                 m6, [r4+4*1]
7319    movd                 m7, [r4+4*3]
7320    REPX {pshufd x, x, q0000}, m7, m6
7321    pmulld               m0, m6     ; t32a
7322    pmulld               m1, m7     ; t33a
7323    movd                 m6, [r4+4*5]
7324    movd                 m7, [r4+4*7]
7325    REPX {pshufd x, x, q0000}, m7, m6
7326    pmulld               m2, m6     ; t34a
7327    pmulld               m3, m7     ; t35a
        ; m7 doubles as the rounding constant: it is added to seven values,
        ; then the parked t63a is added into m7 itself
7328    mova                 m6, [r3+0*16]
7329    mova                 m7, [o(pd_2048)]
7330    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
7331    paddd                m7, [r3+1*16]
7332    REPX      {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
7333    mova           [r3+0*16], m5
7334    psubd                m5, m0, m1 ; t33
7335    paddd                m0, m1     ; t32
7336    mova           [r3+1*16], m0
7337    mova                 m0, [r3+0*16]
7338    psubd                m1, m7, m6 ; t62
7339    paddd                m7, m6     ; t63
7340    psubd                m6, m3, m2 ; t34
7341    paddd                m3, m2     ; t35
7342    psubd                m2, m4, m0 ; t61
7343    paddd                m4, m0     ; t60
        ; clamp: m0 holds the bound for seven registers, then the spilled
        ; t32 is clamped into m0 itself and written back to [r3+0*16]
7344    mova                 m0, [o(clip_18b_min)]
7345    REPX     {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
7346    pmaxsd               m0, [r3+1*16]
7347    mova          [r3+0*16], m0
7348    mova                 m0, [o(clip_18b_max)]
7349    REPX     {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
7350    pminsd               m0, [r3+0*16]
7351    mova          [r3+0*16], m0
7352    mova          [r3+1*16], m3
7353    mova          [r3+2*16], m4
7354    mova          [r3+3*16], m7
        ; m0 = rounding constant, m3/m4 = multipliers 8/9 for the rotations
7355    mova                 m0, [o(pd_2048)]
7356    movd                 m3, [r4+4*8]
7357    movd                 m4, [r4+4*9]
7358    REPX {pshufd x, x, q0000}, m3, m4
7359    mova          [r3+4*16], m2
7360    ITX_MULSUB_2D         1, 5, 2, 7, _, 0, 3, 4    ; t33a, t62a
7361    mova                 m2, [r3+4*16]
7362    mova          [r3+4*16], m5
7363    ITX_MULSUB_2D         2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
7364    mova                 m0, [r3+0*16]
7365    mova                 m3, [r3+1*16]
7366    mova                 m4, [r3+2*16]
7367    mova                 m7, [r3+3*16]
7368    psubd                m5, m0, m3 ; t35a
7369    paddd                m0, m3     ; t32a
7370    mova          [r3+0*16], m5
7371    mova                 m5, [r3+4*16]
7372    psubd                m3, m7, m4 ; t60a
7373    paddd                m7, m4     ; t63a
7374    psubd                m4, m1, m6 ; t34
7375    paddd                m1, m6     ; t33
7376    psubd                m6, m5, m2 ; t61
7377    paddd                m2, m5     ; t62
        ; same clamp pattern as above, with m5 as the bound register and the
        ; spilled t35a ending up clamped in m5
7378    mova                 m5, [o(clip_18b_min)]
7379    REPX     {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
7380    pmaxsd               m5, [r3+0*16]
7381    mova          [r3+0*16], m5
7382    mova                 m5, [o(clip_18b_max)]
7383    REPX     {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
7384    pminsd               m5, [r3+0*16]
7385    mova          [r3+16*0], m0
7386    mova          [r3+16*7], m7
7387    mova          [r3+16*1], m1
7388    mova          [r3+16*6], m2
        ; m4 (t34) is only parked in slot 2 here; it is reloaded for the last
        ; rotation and the slot is overwritten with the final value below
7389    mova          [r3+16*2], m4
7390    mova                 m7, [o(pd_2048)]
7391    movd                 m0, [r4+4*10]
7392    movd                 m1, [r4+4*11]
7393    REPX {pshufd x, x, q0000}, m0, m1
7394    ITX_MULSUB_2D         3, 5, 2, 4, _, 7, 0, 1 ; t35,  t60
7395    mova          [r3+16*3], m3
7396    mova          [r3+16*4], m5
7397    mova                 m4, [r3+2*16]
7398    ITX_MULSUB_2D         6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
7399    add                  r4, 4*12
7400    mova          [r3+16*2], m6
7401    mova          [r3+16*5], m4
7402%endif
7403    add                  r3, 16*8
7404    ret
7405
; .main_part2: idct64 steps 6-9, performed in place on the scratch buffer.
;
; In:   r3 = scratch pointer; data is read and written at r3-16*32 .. r3-16*8
;            and at the mirrored r4-relative slots (r4 = r3+16*7 on entry).
;       x86-64 only: m11, m12, m13 and m14 must be preloaded by the caller.
;            The pass-1 loops of the 64x32 and 64x64 functions load pd_2048,
;            clip_18b_min, clip_18b_max and pd_2896 into them respectively.
; Out:  r3 has the same value as on entry (the loop advances it by 4*16 and
;       that amount is subtracted again before ret). r4 is clobbered.
; Loop: every iteration does r3 += 16 and r4 -= 16 and the loop stops as soon
;       as r3 >= r4, which is after four iterations.
; The x86-32 path only has m0-m7, so it borrows [r3+0*16 .. r3+3*16] as spill
; slots and reloads the constants from memory.
7406.main_part2: ; idct64 steps 6-9
7407    lea                  r4, [r3+16*7]
7408%if ARCH_X86_64
    ; constants handed to ITX_MULSUB_2D below; loaded once outside the loop
7409    mova                m10, [o(pd_1567)]
7410    mova                m15, [o(pd_3784)]
7411.main_part2_loop:
7412    mova                 m0, [r3-16*32] ; t32a
7413    mova                 m1, [r4-16*24] ; t39a
7414    mova                 m2, [r4-16*32] ; t63a
7415    mova                 m3, [r3-16*24] ; t56a
7416    mova                 m4, [r3-16*16] ; t40a
7417    mova                 m5, [r4-16* 8] ; t47a
7418    mova                 m6, [r4-16*16] ; t55a
7419    mova                 m7, [r3-16* 8] ; t48a
    ; first butterfly stage: one difference and one sum per input pair
7420    psubd                m8, m0, m1 ; t39
7421    paddd                m0, m1     ; t32
7422    psubd                m1, m2, m3 ; t56
7423    paddd                m2, m3     ; t63
7424    psubd                m3, m5, m4 ; t40
7425    paddd                m5, m4     ; t47
7426    psubd                m4, m7, m6 ; t55
7427    paddd                m7, m6     ; t48
    ; clamp the four differences to [m12, m13] before they are multiplied
7428    REPX    {pmaxsd x, m12}, m8, m1, m3, m4
7429    REPX    {pminsd x, m13}, m8, m1, m3, m4
    ; NOTE(review): ITX_MULSUB_2D is defined outside this chunk; it presumably
    ; rotates the register pair by the m10/m15 constants with m11 as the
    ; rounding term -- confirm against the macro definition.
7430    ITX_MULSUB_2D         1, 8, 6, 9, _, 11, 10, 15    ; t39a, t56a
7431    ITX_MULSUB_2D         4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
    ; the four sums are clamped only now, after the rotations
7432    REPX    {pmaxsd x, m12}, m0, m2, m5, m7
7433    REPX    {pminsd x, m13}, m0, m5, m2, m7
    ; second butterfly stage
7434    psubd                m6, m2, m7 ; t48a
7435    paddd                m2, m7     ; t63a
7436    psubd                m7, m0, m5 ; t47a
7437    paddd                m0, m5     ; t32a
7438    psubd                m5, m8, m4 ; t55
7439    paddd                m8, m4     ; t56
7440    psubd                m4, m1, m3 ; t40
7441    paddd                m1, m3     ; t39
    ; the differences are clamped and then scaled by m14
7442    REPX    {pmaxsd x, m12}, m6, m7, m5, m4
7443    REPX    {pminsd x, m13}, m6, m7, m5, m4
7444    REPX    {pmulld x, m14}, m6, m7, m5, m4
    ; the sums are final values for this step and are only clamped
7445    REPX    {pmaxsd x, m12}, m2, m0, m8, m1
7446    REPX    {pminsd x, m13}, m2, m0, m8, m1
    ; the rounding term is added to one operand of each pair only, so both the
    ; sum and the difference formed below carry it exactly once
7447    paddd                m6, m11
7448    paddd                m5, m11
7449    psubd                m3, m6, m7 ; t47
7450    paddd                m6, m7     ; t48
7451    psubd                m7, m5, m4 ; t40a
7452    paddd                m5, m4     ; t55a
7453    REPX      {psrad x, 12}, m3, m6, m7, m5
    ; write all eight results back to the slots that were loaded above
7454    mova         [r4-16* 8], m2
7455    mova         [r3-16*32], m0
7456    mova         [r3-16* 8], m8
7457    mova         [r4-16*32], m1
7458    mova         [r4-16*24], m3
7459    mova         [r3-16*16], m6
7460    mova         [r3-16*24], m7
7461    mova         [r4-16*16], m5
7462%else
7463.main_part2_loop:
7464    mova                 m0, [r3-16*32] ; t32a
7465    mova                 m1, [r4-16*24] ; t39a
7466    mova                 m2, [r4-16*32] ; t63a
7467    mova                 m3, [r3-16*24] ; t56a
7468    mova                 m4, [r3-16*16] ; t40a
7469    mova                 m5, [r4-16* 8] ; t47a
7470    mova                 m6, [r4-16*16] ; t55a
7471    psubd                m7, m0, m1 ; t39
7472    paddd                m0, m1     ; t32
    ; spill t39 to slot 0 so that m7 can receive the eighth input
7473    mova          [r3+0*16], m7
7474    mova                 m7, [r3-16* 8] ; t48a
7475    psubd                m1, m2, m3 ; t56
7476    paddd                m2, m3     ; t63
7477    psubd                m3, m5, m4 ; t40
7478    paddd                m5, m4     ; t47
7479    psubd                m4, m7, m6 ; t55
7480    paddd                m7, m6     ; t48
    ; clamp the seven values held in registers, then clamp the spilled t39 by
    ; using the constant register itself as destination: after the
    ; pmaxsd/store/pminsd sequence m6 holds the clamped t39
7481    mova                 m6, [o(clip_18b_min)]
7482    REPX     {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
7483    pmaxsd               m6, [r3+0*16]
7484    mova          [r3+0*16], m6
7485    mova                 m6, [o(clip_18b_max)]
7486    REPX     {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
7487    pminsd               m6, [r3+0*16]
    ; spill the four sums (t32, t63, t47, t48) to slots 0-3 to free registers
    ; for the two rotations
7488    mova          [r3+0*16], m0
7489    mova          [r3+1*16], m2
7490    mova          [r3+2*16], m5
7491    mova          [r3+3*16], m7
7492    mova                 m0, [o(pd_2048)]
    ; NOTE(review): the second call passes register 7 where the first call
    ; passes the literal 1567; presumably the first call leaves that
    ; coefficient in m7 for reuse -- confirm against the ITX_MULSUB_2D macro.
7493    ITX_MULSUB_2D         1, 6, 2, 5, 7, 0, 1567, 3784    ; t39a, t56a
7494    ITX_MULSUB_2D         4, 3, 2, 5, _, 0,    7, 3784, 4 ; t55a, t40a
    ; reload t63 and t48 from their spill slots
7495    mova                 m2, [r3+1*16]
7496    mova                 m7, [r3+3*16]
7497    psubd                m5, m2, m7 ; t48a
7498    paddd                m2, m7     ; t63a
    ; t48a is parked in slot 1 until after the next clamp
7499    mova          [r3+1*16], m5
    ; reload t32 and t47 from their spill slots
7500    mova                 m0, [r3+0*16]
7501    mova                 m5, [r3+2*16]
7502    psubd                m7, m0, m5 ; t47a
7503    paddd                m0, m5     ; t32a
7504    psubd                m5, m6, m4 ; t55
7505    paddd                m6, m4     ; t56
7506    psubd                m4, m1, m3 ; t40
7507    paddd                m1, m3     ; t39
    ; same clamp trick as above: the parked t48a is read from slot 1, the
    ; intermediate goes through slot 0, and m3 ends up as the clamped t48a
7508    mova                 m3, [o(clip_18b_min)]
7509    REPX     {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
7510    pmaxsd               m3, [r3+1*16]
7511    mova          [r3+0*16], m3
7512    mova                 m3, [o(clip_18b_max)]
7513    REPX     {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
7514    pminsd               m3, [r3+0*16]
    ; the four sums are final for this step; store them to free m0/m1
7515    mova         [r4-16* 8], m2
7516    mova         [r3-16*32], m0
7517    mova         [r3-16* 8], m6
7518    mova         [r4-16*32], m1
    ; scale the four differences by pd_2896; the rounding term is added to one
    ; operand of each pair only, so sum and difference both carry it once
7519    mova                 m0, [o(pd_2896)]
7520    mova                 m1, [o(pd_2048)]
7521    REPX     {pmulld x, m0}, m3, m7, m5, m4
7522    REPX     {paddd  x, m1}, m3, m5
7523    psubd                m6, m3, m7 ; t47
7524    paddd                m3, m7     ; t48
7525    psubd                m7, m5, m4 ; t40a
7526    paddd                m5, m4     ; t55a
7527    REPX      {psrad x, 12}, m6, m3, m7, m5
7528    mova         [r4-16*24], m6
7529    mova         [r3-16*16], m3
7530    mova         [r3-16*24], m7
7531    mova         [r4-16*16], m5
7532%endif
    ; walk r3 upwards and r4 downwards until the two pointers meet
7533    add                  r3, 16
7534    sub                  r4, 16
7535    cmp                  r3, r4
7536    jl .main_part2_loop
    ; undo the four 16-byte increments so the caller gets r3 back unchanged
7537    sub                  r3, 4*16
7538    ret
7539
; .main_end_loop / .main_end_loop_start: final combining stage of the idct64.
; Per the inline comments, each iteration merges one idct8, one idct16, two
; idct32 and four idct64 intermediate vectors into eight idct64 outputs
; (out n, 15-n, 16+n, 31-n, 32+n, 47-n, 48+n, 63-n). The outputs are written
; back "unshifted"; the final shift is left to the caller.
;
; Entry points:
;   .main_end_loop        loads m0 from [r3+16*28] (used for iterations 2+).
;   .main_end_loop_start  expects m0 to be provided in a register by the caller.
; In:   r3 = ascending pointer, r4 = descending pointer into the scratch
;       buffer. r4 is not initialised here; NOTE(review): presumably left in
;       place by the helper that ran before the call -- confirm.
;       x86-64: m12/m13 = clip min/max, m15 = value added to every idct32
;               output before the idct64 butterflies (the callers visible in
;               this file load pd_1 or pd_2 into it).
;       x86-32: m7 plays the role of m15; the clip constants are loaded from
;               memory into m5/m6.
; Loop: r3 += 16 and r4 -= 16 per iteration, ends once r3 >= r4.
7540.main_end_loop:
7541    mova                 m0, [r3+16*28] ; idct8  0  + n
7542.main_end_loop_start:
7543    mova                 m2, [r3+16*12] ; idct32 16 + n
7544    mova                 m3, [r4+16*12] ; idct32 31 - n
7545%if ARCH_X86_64
7546    mova                 m1, [r4+16*28] ; idct16 15 - n
7547    mova                 m4, [r4-16* 4] ; idct64 63 - n
7548    mova                 m5, [r3-16* 4] ; idct64 48 + n
7549    mova                 m6, [r4-16*20] ; idct64 47 - n
7550    mova                 m7, [r3-16*20] ; idct64 32 + n
    ; clamp the idct8 term before it enters the butterfly
7551    pmaxsd               m0, m12
7552    pminsd               m0, m13
7553    paddd                m8, m0, m1     ; idct16 out0  + n
7554    psubd                m0, m1         ; idct16 out15 - n
7555    REPX    {pmaxsd x, m12}, m8, m0
7556    REPX    {pminsd x, m13}, m8, m0
7557    paddd                m1, m8, m3     ; idct32 out0  + n
7558    psubd                m8, m3         ; idct32 out31 - n
7559    paddd                m3, m0, m2     ; idct32 out15 - n
7560    psubd                m0, m2         ; idct32 out16 + n
7561    REPX    {pmaxsd x, m12}, m1, m8, m3, m0
7562    REPX    {pminsd x, m13}, m1, m3, m8, m0
    ; adding m15 here puts it into both the sum and the difference formed
    ; from each of these four values below
7563    REPX    {paddd  x, m15}, m1, m3, m0, m8
7564    paddd                m2, m1, m4     ; idct64 out0  + n (unshifted)
7565    psubd                m1, m4         ; idct64 out63 - n (unshifted)
7566    paddd                m4, m3, m5     ; idct64 out15 - n (unshifted)
7567    psubd                m3, m5         ; idct64 out48 + n (unshifted)
7568    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
7569    psubd                m0, m6         ; idct64 out47 - n (unshifted)
7570    paddd                m6, m8, m7     ; idct64 out31 - n (unshifted)
7571    psubd                m8, m7         ; idct64 out32 + n (unshifted)
    ; store the eight outputs into slots that were consumed as inputs above
7572    mova         [r3-16*20], m2
7573    mova         [r4+16*28], m1
7574    mova         [r4-16*20], m4
7575    mova         [r3+16*28], m3
7576    mova         [r3-16* 4], m5
7577    mova         [r4+16*12], m0
7578    mova         [r4-16* 4], m6
7579    mova         [r3+16*12], m8
7580%else
    ; only m0-m7 exist here: clip constants sit in m5/m6 for the first half
    ; and those registers are then reused for idct64 inputs
7581    mova                 m5, [o(clip_18b_min)]
7582    mova                 m6, [o(clip_18b_max)]
    ; NOTE(review): this path reads the idct16 term relative to r3, whereas
    ; the x86-64 path reads it relative to r4 -- presumably the x86-32 buffer
    ; layout differs; confirm against the code that fills these slots.
7583    mova                 m1, [r3+16*44] ; idct16 15 - n
7584    pmaxsd               m0, m5
7585    pminsd               m0, m6
7586    paddd                m4, m0, m1     ; idct16 out0  + n
7587    psubd                m0, m1         ; idct16 out15 - n
7588    REPX     {pmaxsd x, m5}, m4, m0
7589    REPX     {pminsd x, m6}, m4, m0
7590    paddd                m1, m4, m3     ; idct32 out0  + n
7591    psubd                m4, m3         ; idct32 out31 - n
7592    paddd                m3, m0, m2     ; idct32 out15 - n
7593    psubd                m0, m2         ; idct32 out16 + n
7594    REPX     {pmaxsd x, m5}, m1, m4, m3, m0
7595    REPX     {pminsd x, m6}, m1, m3, m4, m0
    ; m7 is supplied by the caller and is added to all four idct32 outputs
7596    REPX     {paddd  x, m7}, m1, m3, m0, m4
    ; the idct64 inputs are handled two at a time to stay within m0-m7
7597    mova                 m5, [r4-16* 4] ; idct64 63 - n
7598    mova                 m6, [r3-16* 4] ; idct64 48 + n
7599    paddd                m2, m1, m5     ; idct64 out0  + n (unshifted)
7600    psubd                m1, m5         ; idct64 out63 - n (unshifted)
7601    paddd                m5, m3, m6     ; idct64 out15 - n (unshifted)
7602    psubd                m3, m6         ; idct64 out48 + n (unshifted)
7603    mova         [r4+16*28], m1
7604    mova         [r3+16*28], m3
    ; load the remaining inputs before their slots are overwritten below
7605    mova                 m6, [r4-16*20] ; idct64 47 - n
7606    mova                 m1, [r3-16*20] ; idct64 32 + n
7607    mova         [r3-16*20], m2
7608    mova         [r4-16*20], m5
7609    paddd                m5, m0, m6     ; idct64 out16 + n (unshifted)
7610    psubd                m0, m6         ; idct64 out47 - n (unshifted)
7611    paddd                m6, m4, m1     ; idct64 out31 - n (unshifted)
7612    psubd                m4, m1         ; idct64 out32 + n (unshifted)
7613    mova         [r3-16* 4], m5
7614    mova         [r4+16*12], m0
7615    mova         [r4-16* 4], m6
7616    mova         [r3+16*12], m4
7617%endif
    ; move both pointers towards each other; stop once they have crossed
7618    sub                  r4, 16
7619    add                  r3, 16
7620    cmp                  r3, r4
7621    jl .main_end_loop
7622    ret
7623
; .shift_transpose (64x16 variant): final pass-1 shift, narrowing and
; transpose of the scratch buffer.
;
; In:   r3 = address of the last 8x16-byte group to process (the loop walks
;            downwards and stops once r3 <= rsp, signed compare),
;       r4 = destination for that group; the four result vectors are stored
;            64 bytes apart.
; Each iteration: load eight dword vectors, arithmetic shift right by 2,
; pack to words with signed saturation, transpose through the shared
; transpose4x8packed helper, store m0-m3, then step r4 back by 4*64 and r3
; back by 8*16.
; Clobbers m0-m7, r3, r4, flags (plus whatever the helper clobbers).
7624.shift_transpose:
7625    mova                 m0, [r3+0*16]
7626    mova                 m1, [r3+1*16]
7627    mova                 m2, [r3+2*16]
7628    mova                 m3, [r3+3*16]
7629    mova                 m4, [r3+4*16]
7630    mova                 m5, [r3+5*16]
7631    mova                 m6, [r3+6*16]
7632    mova                 m7, [r3+7*16]
7633    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
    ; 8 dword vectors -> 4 word vectors (signed saturation)
7634    packssdw             m0, m1
7635    packssdw             m2, m3
7636    packssdw             m4, m5
7637    packssdw             m6, m7
7638    call m(idct_8x4_internal_16bpc).transpose4x8packed
7639    mova          [r4+0*64], m0
7640    mova          [r4+1*64], m1
7641    mova          [r4+2*64], m2
7642    mova          [r4+3*64], m3
7643    sub                  r4, 4*64
7644    sub                  r3, 8*16
    ; continue while the source pointer is still above the stack pointer
7645    cmp                  r3, rsp
7646    jg .shift_transpose
7647    ret
7648
; .dconly / .dconly1 / .dconly2: DC-only fast path. One value derived from the
; first coefficient is added to every pixel of a block that is 64 pixels
; (128 bytes) wide.
;
; Entry points (the later ones are also jumped to from the 64x32 and 64x64
; functions, which do their own initial scaling first):
;   .dconly   r5d = cq[0] * 181, cq[0] is cleared, r3d = 16 rows
;   .dconly1  r5d = (r5d + 640) >> 10
;   .dconly2  r5d = r5d * 2896 + 34816; word 1 (bits 16-31) of that dword is
;             then broadcast to all eight words of m0
; In (.dconly1/.dconly2): r5d = partially scaled DC, r3d = row count,
;             dstq/strideq = destination.
; Pixels are clamped to [0, pixel_10bpc_max] after the addition.
; Ends with RET, i.e. this is the function epilogue for every caller that
; jumps here.
7649.dconly:
7650    imul                r5d, [cq], 181
    ; the existing "; 0" note says eobd is zero here, so this store clears cq[0]
7651    mov                [cq], eobd ; 0
7652    mov                 r3d, 16
7653.dconly1:
7654    add                 r5d, 640
7655    sar                 r5d, 10
7656.dconly2:
7657    imul                r5d, 2896
7658    add                 r5d, 34816
7659    movd                 m0, r5d
    ; replicate word 1 (the upper half of the dword) into the low four words,
    ; then duplicate the low qword so all eight words hold the DC offset
7660    pshuflw              m0, m0, q1111
7661    punpcklqdq           m0, m0
7662    mova                 m6, [o(pixel_10bpc_max)]
7663    pxor                 m5, m5
7664.dconly_loop:
    ; 4 x 16 bytes = 32 pixels per pass; two passes cover one 64-pixel row
7665    paddw                m1, m0, [dstq+16*0]
7666    paddw                m2, m0, [dstq+16*1]
7667    paddw                m3, m0, [dstq+16*2]
7668    paddw                m4, m0, [dstq+16*3]
7669    REPX     {pmaxsw x, m5}, m1, m2, m3, m4
7670    REPX     {pminsw x, m6}, m1, m2, m3, m4
7671    mova        [dstq+16*0], m1
7672    mova        [dstq+16*1], m2
7673    mova        [dstq+16*2], m3
7674    mova        [dstq+16*3], m4
7675    add                dstq, 64
    ; bit 16 of r3d is a half-row toggle: btc copies the old bit into CF and
    ; flips it. CF=0 (first half just done) loops for the second half; CF=1
    ; falls through with the bit cleared again, leaving the plain row count
7676    btc                 r3d, 16
7677    jnc .dconly_loop
    ; rewind the 128 bytes just written and step down one row
7678    lea                dstq, [dstq+strideq-128]
7679    dec                 r3d
7680    jg .dconly_loop
7681    RET
7682
; inv_txfm_add_dct_dct_64x32_16bpc(dst, stride, c, eob)
;
; Pass 1 (this function): processes the coefficient buffer in 16-byte wide
; column slices (index r5 = 14, 12, ..., 0; rows are 128 bytes apart), runs
; the 64-point row transform on each slice with the helpers of the 64x16
; function, shifts/packs/transposes the result into a second stack area and
; zeroes the consumed coefficients. Every group of four loaded vectors is
; first scaled through .rect2_mul_fast.
; Pass 2: selects one of three 8bpc SSSE3 idct_8x32 entry points by eob and
; tail-jumps into the 16x32 function's pass-2 loop.
; eob == 0 takes the .dconly shortcut.
;
; Stack slots used besides the scratch data:
;   [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]  eob, reloaded before pass 2
;   x86-32: the slots 1*gprsize and 2*gprsize above (8*32+64+8)*16 hold dstq
;           and strideq
;   WIN64:  r7 and r8 are saved because they serve as t1/t2 here
7683cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
7684                                         0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
7685                                         dst, stride, c, eob
7686    LEA                  r6, base
7687    test               eobd, eobd
7688    jz .dconly
7689
7690%if ARCH_X86_32
    ; t0 = r0 and t2 = r1 alias dstq/strideq, so both are saved first
7691    DECLARE_REG_TMP 0, 4, 1
7692    mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
7693    mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
7694%else
7695    DECLARE_REG_TMP 4, 7, 8
7696%if WIN64
7697    mov [rsp+(8*32+64+1)*16+1*gprsize], r7
7698    mov [rsp+64*16+0*gprsize], r8
7699%endif
7700%endif
7701%undef cmp
7702    ; remove entirely-zero iterations
    ; r5 starts at the highest slice index; while eob is below the table entry
    ; for r5, the pass-1 output slots of that slice are zero-filled instead of
    ; being computed, and r5 moves on to the next lower slice
7703    mov                 r5d, 14
7704    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7705    jge .end_zero_loop
7706    pxor                 m0, m0
7707.zero_loop:
    ; the table word packs two byte offsets: low byte -> t1, high byte -> t0
7708    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
7709    movzx               t1d, t0b
7710    shr                 t0d, 8
7711    lea                  t2, [rsp+7*32*16]
7712.zero_loop_inner:
    ; same four destinations that .shift_transpose writes for this slice
7713    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
7714    mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
7715    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
7716    mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
7717    sub                  t2, 32*16
7718    cmp                  t2, rsp
7719    jge .zero_loop_inner
7720    sub                 r5d, 2
7721    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7722    jl .zero_loop
7723.end_zero_loop:
    ; keep eob for the pass-2 dispatch; eobd is not preserved across pass 1
7724    mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
7725    ; actual first pass after skipping all-zero data
7726.loop_pass1:
7727%if ARCH_X86_64
    ; constants expected in registers by the shared helpers on x86-64
7728    mova                m11, [o(pd_2048)]
7729    mova                m12, [o(clip_18b_min)]
7730    mova                m13, [o(clip_18b_max)]
7731    mova                m14, [o(pd_2896)]
7732%endif
7733
7734    mov                  r3, rsp
7735    lea                  r4, [o(idct64_mul_16bpc)]
    ; odd rows, four at a time, feed the idct64-specific part (main_part1 is
    ; called four times, then main_part2 once)
7736    mova                 m0, [cq+128* 1+r5*8]
7737    mova                 m1, [cq+128*31+r5*8]
7738    mova                 m2, [cq+128*17+r5*8]
7739    mova                 m3, [cq+128*15+r5*8]
7740    call .rect2_mul_fast
7741    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7742    mova                 m0, [cq+128* 7+r5*8]
7743    mova                 m1, [cq+128*25+r5*8]
7744    mova                 m2, [cq+128*23+r5*8]
7745    mova                 m3, [cq+128* 9+r5*8]
7746    call .rect2_mul_fast
7747    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7748    mova                 m0, [cq+128* 5+r5*8]
7749    mova                 m1, [cq+128*27+r5*8]
7750    mova                 m2, [cq+128*21+r5*8]
7751    mova                 m3, [cq+128*11+r5*8]
7752    call .rect2_mul_fast
7753    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7754    mova                 m0, [cq+128* 3+r5*8]
7755    mova                 m1, [cq+128*29+r5*8]
7756    mova                 m2, [cq+128*19+r5*8]
7757    mova                 m3, [cq+128*13+r5*8]
7758    call .rect2_mul_fast
7759    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7760    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
7761
    ; rows 2, 14, 18, 30 and 6, 10, 22, 26 go to the two idct32 odd-half helpers
7762    mova                 m0, [cq+128* 2+r5*8]
7763    mova                 m1, [cq+128*14+r5*8]
7764    mova                 m2, [cq+128*18+r5*8]
7765    mova                 m3, [cq+128*30+r5*8]
7766    call .rect2_mul_fast
7767    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7768
7769    mova                 m0, [cq+128* 6+r5*8]
7770    mova                 m1, [cq+128*10+r5*8]
7771    mova                 m2, [cq+128*22+r5*8]
7772    mova                 m3, [cq+128*26+r5*8]
7773    call .rect2_mul_fast
7774    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
7775    add                  r3, 16*(24+4*ARCH_X86_32)
7776
    ; rows 4, 12, 20, 28 go to the idct16 odd-half helper
7777    mova                 m0, [cq+128* 4+r5*8]
7778    mova                 m1, [cq+128*12+r5*8]
7779    mova                 m2, [cq+128*20+r5*8]
7780    mova                 m3, [cq+128*28+r5*8]
7781    call .rect2_mul_fast
7782    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
7783
    ; rows 0, 8, 16, 24 go to the idct8 helper
7784    mova                 m0, [cq+128* 0+r5*8]
7785    mova                 m1, [cq+128* 8+r5*8]
7786    mova                 m2, [cq+128*16+r5*8]
7787    mova                 m3, [cq+128*24+r5*8]
7788    call .rect2_mul_fast
7789    call m(idct_8x4_internal_16bpc).main_pass1_fast
7790    call m(idct_8x4_internal_16bpc).round
    ; m1-m7 are stored just below r3; m0 stays in its register because
    ; .main_end_loop_start consumes it directly
7791    mova [r3-(7+4*ARCH_X86_32)*16], m1
7792    mova [r3-(6+4*ARCH_X86_32)*16], m2
7793    mova [r3-(5+4*ARCH_X86_32)*16], m3
7794    mova [r3-(4+4*ARCH_X86_32)*16], m4
7795    mova [r3-(3+4*ARCH_X86_32)*16], m5
7796    mova [r3-(2+4*ARCH_X86_32)*16], m6
7797    mova [r3-(1+4*ARCH_X86_32)*16], m7
7798    sub                  r3, 16*(40+4*ARCH_X86_32-4)
7799
    ; value that .main_end_loop adds to every output before this function's
    ; .shift_transpose shifts right by 1
7800%if ARCH_X86_64
7801    psrld               m15, m11, 11 ; pd_1
7802%else
7803    mova                 m7, [o(pd_1)]
7804%endif
7805    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
7806
    ; shift, pack and transpose the slice into the pass-2 buffer; t0/t1 are
    ; the same table offsets the zero loop uses
7807    lea                  r3, [rsp+56*16]
7808    lea                  t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
7809    movzx               t0d, word [o2(tbl_Nx32_odd_offset)+r5]
7810    movzx               t1d, t0b
7811    shr                 t0d, 8
7812    call .shift_transpose
7813    ; zero cq
    ; clears this slice's 16 bytes in all 32 rows, four rows per iteration,
    ; walking from rows 28-31 down to rows 0-3
7814    pxor                 m7, m7
7815    lea                  r4, [cq+30*128+r5*8]
7816.zero_cq_loop:
7817    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
7818    sub                  r4, 4*128
7819    cmp                  r4, cq
7820    jg .zero_cq_loop
    ; next lower slice; the loop ends after slice 0 has been processed
7821    sub                 r5d, 2
7822    jge .loop_pass1
7823
7824    ; pass=2 code starts here
    ; restore what pass 1 clobbered: eob, plus strideq (x86-32) or r8 (WIN64)
7825    mov                eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
7826%if ARCH_X86_32
7827    mov             strideq, [rsp+gprsize*2+(8*32+64+8)*16]
7828%elif WIN64
7829    mov                  r8, [rsp+gprsize*0+64*16]
7830%endif
    ; NOTE(review): moves rsp up past most of the pass-1 scratch area,
    ; presumably so the transposed data sits at the offsets the 16x32 pass-2
    ; loop expects -- confirm against .loop_pass2_entry.
7831    add                 rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
    ; choose the pass-2 column transform by eob:
    ; eob < 36 -> main_veryfast, eob < 136 -> main_fast, otherwise main
7832    cmp                eobd, 36
7833    jl .load_veryfast
7834    cmp                eobd, 136
7835    jl .load_fast
7836    ; load normal
7837    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
7838    jmp .run
7839.load_fast:
7840    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
7841    jmp .run
7842.load_veryfast:
7843    lea                  r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
7844    ; fall-through
7845.run:
    ; NOTE(review): r2/r7 (x86-64) and the dword at [r2] (x86-32) look like
    ; the end pointer / loop counter of the shared pass-2 loop; on x86-32 the
    ; store overwrites the slot that held eob -- confirm against the 16x32 code.
7846%if ARCH_X86_64
7847    lea                  r2, [dstq+128]
7848    mov                  r7, -16
7849%else
7850    lea                  r2, [rsp+(8*32+3)*16]
7851    mov dword [r2+0*gprsize], 8
7852%endif
7853    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
7854
; .rect2_mul_fast: scales m0-m3 in place, x = (x * 2896 + 2048) >> 12
; (arithmetic shift), applied to each dword lane.
; x86-64: uses m14 and m11, which .loop_pass1 preloads with pd_2896 and
;         pd_2048; no other register is touched.
; x86-32: loads the two constants into m4 and m5, clobbering them.
7855.rect2_mul_fast:
7856%if ARCH_X86_64
7857    REPX    {pmulld x, m14}, m0, m1, m2, m3
7858    REPX    {paddd  x, m11}, m0, m1, m2, m3
7859%else
7860    mova                 m4, [o(pd_2896)]
7861    mova                 m5, [o(pd_2048)]
7862    REPX    {pmulld x, m4 }, m0, m1, m2, m3
7863    REPX    {paddd  x, m5 }, m0, m1, m2, m3
7864%endif
7865    REPX    {psrad  x, 12 }, m0, m1, m2, m3
7866    ret
7867
; .shift_transpose (64x32 variant): final pass-1 shift, narrowing and
; transpose for one column slice.
;
; In:   r3 = address of the last 8x16-byte source group (the loop walks down
;            in steps of 8*16 and stops once r3 <= rsp, signed compare),
;       t2 = destination base for that group, stepped back by 16*32 per
;            iteration,
;       r5 = current slice index, t0/t1 = the two byte offsets unpacked from
;            tbl_Nx32_odd_offset by the caller.
; Each iteration: arithmetic shift right by 1, pack dwords to words with
; signed saturation, transpose via transpose4x8packed, then scatter the four
; result vectors to t2+r5*8, t2+8*16+r5*8, t2+t0*8 and t2+t1*8.
; Clobbers m0-m7, r3, t2, flags (plus whatever the helper clobbers).
7868.shift_transpose:
7869    mova                 m0, [r3+0*16]
7870    mova                 m1, [r3+1*16]
7871    mova                 m2, [r3+2*16]
7872    mova                 m3, [r3+3*16]
7873    mova                 m4, [r3+4*16]
7874    mova                 m5, [r3+5*16]
7875    mova                 m6, [r3+6*16]
7876    mova                 m7, [r3+7*16]
7877    REPX       {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
7878    packssdw             m0, m1
7879    packssdw             m2, m3
7880    packssdw             m4, m5
7881    packssdw             m6, m7
7882    call m(idct_8x4_internal_16bpc).transpose4x8packed
    ; note the register order: m0, m2, m3, m1 (not sequential)
7883    mova     [t2+0*16+r5*8], m0
7884    mova     [t2+8*16+r5*8], m2
7885    mova     [t2+0*16+t0*8], m3
7886    mova     [t2+0*16+t1*8], m1
7887    sub                  t2, 16*32
7888    sub                  r3, 8*16
7889    cmp                  r3, rsp
7890    jg .shift_transpose
7891    ret
7892
; .dconly (64x32 variant): DC-only shortcut, taken when eob == 0.
; Computes r5d = ((((cq[0] * 181) + 128) >> 8) * 181 + 384) >> 9, clears
; cq[0], sets the row count r3d = 32 and continues at the 64x16 function's
; .dconly2, which does the final scaling, the pixel loop and the RET.
7893.dconly:
7894    imul                r5d, [cq], 181
    ; eobd is zero on this path (see the test/jz at function entry)
7895    mov                [cq], eobd ; 0
7896    mov                 r3d, 32
7897    add                 r5d, 128
7898    sar                 r5d, 8
7899    imul                r5d, 181
7900    add                 r5d, 384
7901    sar                 r5d, 9
    ; NOTE(review): releases part of this function's larger stack frame,
    ; presumably so that what remains matches the frame the 64x16 function's
    ; RET unwinds -- confirm against that function's cglobal line.
7902    add                 rsp, (1+8*32+1*WIN64)*16
7903    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
7904
; inv_txfm_add_dct_dct_64x64_16bpc(dst, stride, c, eob)
;
; Pass 1 (this function): same structure as the 64x32 pass 1 -- 16-byte wide
; column slices indexed by r5 = 14, 12, ..., 0, the 64-point row transform
; from the 64x16 helpers, then shift/pack/transpose into the pass-2 buffer
; and zeroing of the consumed coefficients. Unlike 64x32, the loaded
; coefficients are not pre-scaled, and the rounding value is pd_2 (this
; function's .shift_transpose shifts by 2).
; Pass 2: picks two function pointers by eob (t0: 8bpc idct_8x32 variant,
; t1: 8bpc idct_16x64 variant) and tail-jumps into the 16x64 function's
; pass-2 loop.
; eob == 0 takes the .dconly shortcut.
;
; Five temporaries are needed in pass 1, which forces reuse of argument
; registers:
;   x86-32: t0=r4, t1=r1, t2=r2 (cq), t3=r0, t4=r6 (base); r0, r1, r2 and r6
;           are saved to the stack up front, and cq/r6 are reloaded each time
;           after they have served as temporaries.
;   x86-64: t0=r8, t1=r9, t2=r4, t3=r7, t4=r0 (dst); r9 and r0 are saved, and
;           on WIN64 also r7 and r8.
7905cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
7906                                         0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
7907                                         dst, stride, c, eob
7908    LEA                  r6, base
7909    test               eobd, eobd
7910    jz .dconly
7911
7912%if ARCH_X86_32
7913    DECLARE_REG_TMP 4, 1, 2, 0, 6
7914    mov [rsp+gprsize*1+(64*9+8)*16], r0
7915    mov [rsp+gprsize*2+(64*9+8)*16], r1
7916    mov [rsp+gprsize*3+(64*9+8)*16], r2
7917    mov [rsp+gprsize*4+(64*9+8)*16], r6
7918%else
7919    DECLARE_REG_TMP 8, 9, 4, 7, 0
7920    mov [rsp+gprsize*1+(64*9+1)*16], r9
7921    mov [rsp+gprsize*0+64*16], r0
7922%if WIN64
7923    mov [rsp+gprsize*2+(64*9+1)*16], r7
7924    mov [rsp+gprsize*3+(64*9+1)*16], r8
7925%endif
7926%endif
7927%undef cmp
7928
7929    ; remove entirely-zero iterations
    ; while eob is below the table entry for slice r5, that slice's pass-1
    ; output slots are zero-filled instead of computed
7930    mov                 r5d, 14
7931    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7932    jge .end_zero_loop
7933    pxor                 m0, m0
7934.zero_loop:
    ; two table words per slice, each packing two byte offsets:
    ; first word -> t0 (low byte), t1 (high byte); second -> t2, t3
7935    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
7936    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
7937    movzx               t0d, t1b
7938    movzx               t2d, t3b
7939    shr                 t1d, 8
7940    shr                 t3d, 8
7941    lea                  t4, [rsp+7*64*16]
7942.zero_loop_inner:
7943    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
7944    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
7945    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
7946    mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
7947    sub                  t4, 64*16
7948    cmp                  t4, rsp
7949    jge .zero_loop_inner
7950%if ARCH_X86_32
    ; t4 aliases r6 here; restore the base pointer before the o2() lookup below
7951    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
7952%endif
7953    sub                 r5d, 2
7954    cmp                eobw, word [o2(tbl_32x32_2d)+r5]
7955    jl .zero_loop
7956.end_zero_loop:
    ; keep eob for the pass-2 dispatch
7957    mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
7958%if ARCH_X86_32
    ; t2 aliases cq (r2) here; restore it before reading coefficients
7959    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
7960%endif
7961    ; actual first pass after skipping all-zero data
7962.loop_pass1:
7963%if ARCH_X86_64
    ; constants expected in registers by the shared helpers on x86-64
7964    mova                m11, [o(pd_2048)]
7965    mova                m12, [o(clip_18b_min)]
7966    mova                m13, [o(clip_18b_max)]
7967    mova                m14, [o(pd_2896)]
7968%endif
7969
7970    mov                  r3, rsp
7971    lea                  r4, [o(idct64_mul_16bpc)]
    ; odd rows, four at a time, feed the idct64-specific part (main_part1 is
    ; called four times, then main_part2 once)
7972    mova                 m0, [cq+128* 1+r5*8]
7973    mova                 m1, [cq+128*31+r5*8]
7974    mova                 m2, [cq+128*17+r5*8]
7975    mova                 m3, [cq+128*15+r5*8]
7976    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7977    mova                 m0, [cq+128* 7+r5*8]
7978    mova                 m1, [cq+128*25+r5*8]
7979    mova                 m2, [cq+128*23+r5*8]
7980    mova                 m3, [cq+128* 9+r5*8]
7981    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7982    mova                 m0, [cq+128* 5+r5*8]
7983    mova                 m1, [cq+128*27+r5*8]
7984    mova                 m2, [cq+128*21+r5*8]
7985    mova                 m3, [cq+128*11+r5*8]
7986    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7987    mova                 m0, [cq+128* 3+r5*8]
7988    mova                 m1, [cq+128*29+r5*8]
7989    mova                 m2, [cq+128*19+r5*8]
7990    mova                 m3, [cq+128*13+r5*8]
7991    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
7992    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
7993
    ; rows 2, 14, 18, 30 and 6, 10, 22, 26 go to the two idct32 odd-half helpers
7994    mova                 m0, [cq+128* 2+r5*8]
7995    mova                 m1, [cq+128*14+r5*8]
7996    mova                 m2, [cq+128*18+r5*8]
7997    mova                 m3, [cq+128*30+r5*8]
7998    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
7999
8000    mova                 m0, [cq+128* 6+r5*8]
8001    mova                 m1, [cq+128*10+r5*8]
8002    mova                 m2, [cq+128*22+r5*8]
8003    mova                 m3, [cq+128*26+r5*8]
8004    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
8005    add                  r3, 16*(24+4*ARCH_X86_32)
8006
    ; rows 4, 12, 20, 28 go to the idct16 odd-half helper
8007    mova                 m0, [cq+128* 4+r5*8]
8008    mova                 m1, [cq+128*12+r5*8]
8009    mova                 m2, [cq+128*20+r5*8]
8010    mova                 m3, [cq+128*28+r5*8]
8011    call m(idct_16x4_internal_16bpc).main_oddhalf_fast
8012
    ; rows 0, 8, 16, 24 go to the idct8 helper
8013    mova                 m0, [cq+128* 0+r5*8]
8014    mova                 m1, [cq+128* 8+r5*8]
8015    mova                 m2, [cq+128*16+r5*8]
8016    mova                 m3, [cq+128*24+r5*8]
8017    call m(idct_8x4_internal_16bpc).main_pass1_fast
8018    call m(idct_8x4_internal_16bpc).round
    ; m1-m7 are stored just below r3; m0 stays in its register because
    ; .main_end_loop_start consumes it directly
8019    mova [r3-(7+4*ARCH_X86_32)*16], m1
8020    mova [r3-(6+4*ARCH_X86_32)*16], m2
8021    mova [r3-(5+4*ARCH_X86_32)*16], m3
8022    mova [r3-(4+4*ARCH_X86_32)*16], m4
8023    mova [r3-(3+4*ARCH_X86_32)*16], m5
8024    mova [r3-(2+4*ARCH_X86_32)*16], m6
8025    mova [r3-(1+4*ARCH_X86_32)*16], m7
8026    sub                  r3, 16*(40+4*ARCH_X86_32-4)
8027
    ; value that .main_end_loop adds to every output before this function's
    ; .shift_transpose shifts right by 2
8028%if ARCH_X86_64
8029    psrld               m15, m11, 10 ; pd_2
8030%else
8031    mova                 m7, [o(pd_2)]
8032%endif
8033    call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
8034
    ; shift, pack and transpose the slice into the pass-2 buffer; t0-t3 are
    ; the same table offsets the zero loop uses
8035    lea                  r3, [rsp+56*16]
8036    movzx               t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
8037    movzx               t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
8038    movzx               t0d, t1b
8039    movzx               t2d, t3b
8040    shr                 t1d, 8
8041    shr                 t3d, 8
8042    lea                  t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
8043    call .shift_transpose
8044    ; zero cq
    ; clears this slice's 16 bytes in all 32 rows, four rows per iteration,
    ; walking from rows 28-31 down to rows 0-3
8045    pxor                 m7, m7
8046%if ARCH_X86_32
    ; cq (r2) was used as t2 above; reload it
8047    mov                  cq, [rsp+gprsize*3+(64*9+8)*16]
8048%endif
8049    lea                  r4, [cq+30*128+r5*8]
8050.zero_cq_loop:
8051    REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
8052    sub                  r4, 4*128
8053    cmp                  r4, cq
8054    jg .zero_cq_loop
8055%if ARCH_X86_32
    ; r6 was used as t4 above; reload the base pointer for the next iteration
8056    mov                  r6, [rsp+gprsize*4+(64*9+8)*16]
8057%endif
    ; next lower slice; the loop ends after slice 0 has been processed
8058    sub                 r5d, 2
8059    jge .loop_pass1
8060
8061    ; pass=2 code starts here
    ; restore what pass 1 clobbered: eob, plus strideq (x86-32) or r0 (x86-64)
8062    mov                eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
8063%if ARCH_X86_32
8064    mov             strideq, [rsp+gprsize*2+(9*64+8)*16]
8065%else
8066    mov                  r0, [rsp+gprsize*0+64*16]
8067%endif
    ; NOTE(review): moves rsp up past most of the pass-1 scratch area,
    ; presumably so the transposed data sits at the offsets the 16x64 pass-2
    ; loop expects -- confirm against .loop_pass2.
8068    add                 rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
    ; eob < 151 selects the cheaper pair of column transforms
8069    cmp                eobd, 151
8070    jl .fast
8071    ; fall-through
    ; from here on only two temporaries are needed; they are re-declared so
    ; that t0/t1 name the registers used for the two function pointers
8072%if ARCH_X86_64
8073    DECLARE_REG_TMP 8, 9
8074%else
8075    DECLARE_REG_TMP 1, 5
8076%endif
8077    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
8078    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
8079    jmp .run
8080.fast:
8081    lea                  t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
8082    lea                  t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
8083.run:
8084
8085%if ARCH_X86_64
    ; NOTE(review): r2/r7 look like the end pointer / counter of the shared
    ; pass-2 loop -- confirm against the 16x64 code.
8086    lea                  r2, [dstq+128]
8087    mov                  r7, -16
8088%else
    ; the two function pointers are passed through stack slots; t0 aliases r1
    ; (strideq), so r1 is reloaded from its save slot once t0 has been stored.
    ; The final store overwrites the slot that held eob with the constant 8.
8089    lea                  r2, [rsp+(64*8+3)*16]
8090    mov      [r2+4*gprsize], t0
8091    mov      [r2+5*gprsize], t1
8092    mov                  r1, [r2+2*gprsize]
8093    mov dword [r2+0*gprsize], 8
8094%endif
8095    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
8096
8097    ; copy of pass=1 tmp-regs
    ; (the pass-2 dispatch above re-declared t0/t1; .shift_transpose below is
    ; called from pass 1 and needs the five-register mapping again)
8098%if ARCH_X86_32
8099    DECLARE_REG_TMP 4, 1, 2, 0, 6
8100%else
8101    DECLARE_REG_TMP 8, 9, 4, 7, 0
8102%endif
8103
; .shift_transpose (64x64 variant): final pass-1 shift, narrowing and
; transpose for one column slice.
;
; In:   r3 = address of the last 8x16-byte source group (the loop walks down
;            in steps of 8*16 and stops once r3 <= rsp, signed compare),
;       t4 = destination base for that group, stepped back by 16*64 per
;            iteration,
;       t0-t3 = the four byte offsets unpacked from tbl_Nx64_offset by the
;            caller.
; Each iteration: arithmetic shift right by 2, pack dwords to words with
; signed saturation, transpose via transpose4x8packed, then store m0-m3 at
; t4+t0*8, t4+t1*8, t4+t2*8 and t4+t3*8.
; Clobbers m0-m7, r3, t4, flags (plus whatever the helper clobbers).
8104.shift_transpose:
8105    mova                 m0, [r3+0*16]
8106    mova                 m1, [r3+1*16]
8107    mova                 m2, [r3+2*16]
8108    mova                 m3, [r3+3*16]
8109    mova                 m4, [r3+4*16]
8110    mova                 m5, [r3+5*16]
8111    mova                 m6, [r3+6*16]
8112    mova                 m7, [r3+7*16]
8113    REPX       {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
8114    packssdw             m0, m1
8115    packssdw             m2, m3
8116    packssdw             m4, m5
8117    packssdw             m6, m7
8118    call m(idct_8x4_internal_16bpc).transpose4x8packed
8119    mova          [t4+t0*8], m0
8120    mova          [t4+t1*8], m1
8121    mova          [t4+t2*8], m2
8122    mova          [t4+t3*8], m3
8123    sub                  t4, 16*64
8124    sub                  r3, 8*16
8125    cmp                  r3, rsp
8126    jg .shift_transpose
8127    ret
8128
; .dconly (64x64 variant): DC-only shortcut, taken when eob == 0.
; Sets r5d = cq[0] * 181, clears cq[0], sets the row count r3d = 64 and
; continues at the 64x16 function's .dconly1, which performs the remaining
; scaling, the pixel loop and the RET.
; NOTE(review): the rsp adjustment releases everything this function
; allocated except (64+8*ARCH_X86_32)*16 bytes, presumably the frame size the
; 64x16 function's RET unwinds -- confirm against that function's cglobal line.
8129.dconly:
8130    imul                r5d, [cq], 181
8131    mov                [cq], eobd ; 0
8132    mov                 r3d, 64
8133    add                 rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
8134                             (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
8135    jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1
8136