xref: /aosp_15_r20/external/libdav1d/src/x86/itx_avx2.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker; Copyright © 2018-2021, VideoLAN and dav1d authors
2*c0909341SAndroid Build Coastguard Worker; Copyright © 2018, Two Orioles, LLC
3*c0909341SAndroid Build Coastguard Worker; All rights reserved.
4*c0909341SAndroid Build Coastguard Worker;
5*c0909341SAndroid Build Coastguard Worker; Redistribution and use in source and binary forms, with or without
6*c0909341SAndroid Build Coastguard Worker; modification, are permitted provided that the following conditions are met:
7*c0909341SAndroid Build Coastguard Worker;
8*c0909341SAndroid Build Coastguard Worker; 1. Redistributions of source code must retain the above copyright notice, this
9*c0909341SAndroid Build Coastguard Worker;    list of conditions and the following disclaimer.
10*c0909341SAndroid Build Coastguard Worker;
11*c0909341SAndroid Build Coastguard Worker; 2. Redistributions in binary form must reproduce the above copyright notice,
12*c0909341SAndroid Build Coastguard Worker;    this list of conditions and the following disclaimer in the documentation
13*c0909341SAndroid Build Coastguard Worker;    and/or other materials provided with the distribution.
14*c0909341SAndroid Build Coastguard Worker;
15*c0909341SAndroid Build Coastguard Worker; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16*c0909341SAndroid Build Coastguard Worker; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17*c0909341SAndroid Build Coastguard Worker; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18*c0909341SAndroid Build Coastguard Worker; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19*c0909341SAndroid Build Coastguard Worker; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20*c0909341SAndroid Build Coastguard Worker; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21*c0909341SAndroid Build Coastguard Worker; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22*c0909341SAndroid Build Coastguard Worker; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23*c0909341SAndroid Build Coastguard Worker; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24*c0909341SAndroid Build Coastguard Worker; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25*c0909341SAndroid Build Coastguard Worker
26*c0909341SAndroid Build Coastguard Worker%include "config.asm"
27*c0909341SAndroid Build Coastguard Worker%include "ext/x86/x86inc.asm"
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard WorkerSECTION_RODATA 16
32*c0909341SAndroid Build Coastguard Worker
33*c0909341SAndroid Build Coastguard Worker; Note: The order of (at least some of) those constants matters!
34*c0909341SAndroid Build Coastguard Worker
; 16-byte table of byte indices: the even-numbered 16-bit words (bytes 0-1,
; 4-5, 8-9, 12-13) come first, followed by the odd-numbered ones.
; It also anchors o_base (deint_shuf + 128), so the constants in this section
; are addressed relative to this label through o().
35*c0909341SAndroid Build Coastguard Workerconst deint_shuf, db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
36*c0909341SAndroid Build Coastguard Worker
; COEF_PAIR a, b
; Emits two labelled pairs of 16-bit words:
;   pw_<a>_<b>  = ( a, b)
;   pw_m<b>_<a> = (-b, a)
; These are the two label shapes that ITX_MUL2X_PACK and ITX_MULSUB_2W build
; from their numeric coefficient arguments (pw_%5_%6 / pw_m%6_%5 and
; pw_%6_%7 / pw_m%7_%6 respectively).
37*c0909341SAndroid Build Coastguard Worker%macro COEF_PAIR 2
38*c0909341SAndroid Build Coastguard Workerpw_%1_%2:  dw  %1, %2
39*c0909341SAndroid Build Coastguard Workerpw_m%2_%1: dw -%2, %1
40*c0909341SAndroid Build Coastguard Worker%endmacro
41*c0909341SAndroid Build Coastguard Worker
; Coefficient tables. Each entry is a pair of 16-bit words that the transform
; macros load with vpbroadcastd, with the label name pasted together from the
; numeric macro arguments (pw_%5_%6, pw_m%6_%5, ...).
42*c0909341SAndroid Build Coastguard Worker; ADST-only
43*c0909341SAndroid Build Coastguard Workerpw_3803_1321:   dw  3803,  1321
44*c0909341SAndroid Build Coastguard Workerpw_m1321_2482:  dw -1321,  2482
45*c0909341SAndroid Build Coastguard Workerpw_2482_3344:   dw  2482,  3344
46*c0909341SAndroid Build Coastguard Workerpw_m3344_3344:  dw -3344,  3344
47*c0909341SAndroid Build Coastguard Workerpw_m3803_3344:  dw -3803,  3344
48*c0909341SAndroid Build Coastguard Workerpw_m3803_m6688: dw -3803, -6688
49*c0909341SAndroid Build Coastguard Workerpw_2896_m2896:  dw  2896, -2896
50*c0909341SAndroid Build Coastguard Worker
; Single 16-bit values stored twice, so that one vpbroadcastd fills every
; 16-bit element with the same value. ITX4_END builds the label pw_<rnd> from
; its rnd argument; pw_2896x8 is used with pmulhrsw in INV_TXFM_4X4_FN.
; pd_2048 is the 32-bit rounding term added ahead of the ">> 12" shifts.
51*c0909341SAndroid Build Coastguard Workerconst pw_5,       times 2 dw 5
52*c0909341SAndroid Build Coastguard Workerconst pw_2048,    times 2 dw 2048
53*c0909341SAndroid Build Coastguard Workerconst pw_4096,    times 2 dw 4096
54*c0909341SAndroid Build Coastguard Workerconst pw_8192,    times 2 dw 8192
55*c0909341SAndroid Build Coastguard Workerconst pw_16384,   times 2 dw 16384
56*c0909341SAndroid Build Coastguard Workerconst pw_1697x16, times 2 dw 1697*16
57*c0909341SAndroid Build Coastguard Workerconst pw_1697x8,  times 2 dw 1697*8
58*c0909341SAndroid Build Coastguard Workerconst pw_2896x8,  times 2 dw 2896*8
59*c0909341SAndroid Build Coastguard Workerconst pd_2048,    dd 2048
60*c0909341SAndroid Build Coastguard Worker
; Same (a, b) / (-b, a) layout that COEF_PAIR generates, written out through
; `const` instead.
; NOTE(review): presumably so the symbols are visible outside this file -
; `const` is not defined in this excerpt, confirm against x86inc.asm.
61*c0909341SAndroid Build Coastguard Workerconst pw_2896_2896,  dw  2896, 2896
62*c0909341SAndroid Build Coastguard Workerconst pw_m2896_2896, dw -2896, 2896
63*c0909341SAndroid Build Coastguard Workerconst pw_1567_3784,  dw  1567, 3784
64*c0909341SAndroid Build Coastguard Workerconst pw_m3784_1567, dw -3784, 1567
65*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3784, 1567
66*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  201, 4091
67*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  995, 3973
68*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 1751, 3703
69*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 2440, 3290
70*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3035, 2751
71*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3513, 2106
72*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3857, 1380
73*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 4052,  601
74*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  401, 4076
75*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 1931, 3612
76*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3166, 2598
77*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3920, 1189
78*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR  799, 4017
79*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 3406, 2276
; Pairs with both words negated. Example of how they are reached:
; ITX_MULSUB_2W invoked with the coefficients (m3784, 1567) pastes them into
; the labels pw_m1567_m3784 and pw_m3784_1567.
80*c0909341SAndroid Build Coastguard Workerpw_m799_m4017:  dw  -799, -4017
81*c0909341SAndroid Build Coastguard Workerconst pw_m1567_m3784, dw -1567, -3784
82*c0909341SAndroid Build Coastguard Workerpw_m3406_m2276: dw -3406, -2276
83*c0909341SAndroid Build Coastguard Workerpw_m401_m4076:  dw  -401, -4076
84*c0909341SAndroid Build Coastguard Workerpw_m3166_m2598: dw -3166, -2598
85*c0909341SAndroid Build Coastguard Workerpw_m1931_m3612: dw -1931, -3612
86*c0909341SAndroid Build Coastguard Workerpw_m3920_m1189: dw -3920, -1189
87*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 2276, 3406
88*c0909341SAndroid Build Coastguard WorkerCOEF_PAIR 4017,  799
89*c0909341SAndroid Build Coastguard Worker
; COEF_X8 c0 [, c1, ...]
; For every argument, in order, emits two 16-bit words that both hold the
; argument multiplied by 8 (i.e. 4 bytes per argument). No label is emitted;
; the caller supplies one in front of the invocation where needed.
90*c0909341SAndroid Build Coastguard Worker%macro COEF_X8 1-*
91*c0909341SAndroid Build Coastguard Worker%rep %0
92*c0909341SAndroid Build Coastguard Worker    dw %1*8, %1*8
93*c0909341SAndroid Build Coastguard Worker    %rotate 1
94*c0909341SAndroid Build Coastguard Worker%endrep
95*c0909341SAndroid Build Coastguard Worker%endmacro
96*c0909341SAndroid Build Coastguard Worker
; Coefficients pre-multiplied by 8, each stored as a duplicated 16-bit word.
; NOTE(review): presumably intended for pmulhrsw, whose rounded ">> 15" on an
; x8 coefficient equals a rounded ">> 12" on the unscaled one (the way
; pw_2896x8 is used in INV_TXFM_4X4_FN) - the users of these labels are
; outside this excerpt.
97*c0909341SAndroid Build Coastguard Workerpw_3703x8:  COEF_X8  3703
98*c0909341SAndroid Build Coastguard Workerpw_1751x8:  COEF_X8  1751
99*c0909341SAndroid Build Coastguard Workerpw_m1380x8: COEF_X8 -1380
100*c0909341SAndroid Build Coastguard Workerpw_3857x8:  COEF_X8  3857
101*c0909341SAndroid Build Coastguard Workerpw_3973x8:  COEF_X8  3973
102*c0909341SAndroid Build Coastguard Workerpw_995x8:   COEF_X8   995
103*c0909341SAndroid Build Coastguard Workerpw_m2106x8: COEF_X8 -2106
104*c0909341SAndroid Build Coastguard Workerpw_3513x8:  COEF_X8  3513
105*c0909341SAndroid Build Coastguard Workerpw_3290x8:  COEF_X8  3290
106*c0909341SAndroid Build Coastguard Workerpw_2440x8:  COEF_X8  2440
107*c0909341SAndroid Build Coastguard Workerpw_m601x8:  COEF_X8  -601
108*c0909341SAndroid Build Coastguard Workerpw_4052x8:  COEF_X8  4052
109*c0909341SAndroid Build Coastguard Worker
; idct64_mul: 32 duplicated-word entries (4 rows of 8 coefficients, 128 bytes).
110*c0909341SAndroid Build Coastguard Workerconst idct64_mul
111*c0909341SAndroid Build Coastguard WorkerCOEF_X8  4095,   101,  4065,   501,  2967, -2824,  3229, -2520
112*c0909341SAndroid Build Coastguard WorkerCOEF_X8  3745,  1660,  3564,  2019,  3822, -1474,  3948, -1092
113*c0909341SAndroid Build Coastguard WorkerCOEF_X8  3996,   897,  3889,  1285,  3461, -2191,  3659, -1842
114*c0909341SAndroid Build Coastguard WorkerCOEF_X8  3349,  2359,  3102,  2675,  4036,  -700,  4085,  -301
115*c0909341SAndroid Build Coastguard Worker
; Pairs of two different coefficients, both pre-multiplied by 8.
116*c0909341SAndroid Build Coastguard Workerpw_201_4091x8:   dw   201*8, 4091*8
117*c0909341SAndroid Build Coastguard Workerpw_m601_4052x8:  dw  -601*8, 4052*8
118*c0909341SAndroid Build Coastguard Workerpw_995_3973x8:   dw   995*8, 3973*8
119*c0909341SAndroid Build Coastguard Workerpw_m1380_3857x8: dw -1380*8, 3857*8
120*c0909341SAndroid Build Coastguard Workerpw_1751_3703x8:  dw  1751*8, 3703*8
121*c0909341SAndroid Build Coastguard Workerpw_m2106_3513x8: dw -2106*8, 3513*8
122*c0909341SAndroid Build Coastguard Workerpw_2440_3290x8:  dw  2440*8, 3290*8
123*c0909341SAndroid Build Coastguard Workerpw_m2751_3035x8: dw -2751*8, 3035*8
124*c0909341SAndroid Build Coastguard Worker
; Distance of idct64_mul from o_base, biased by -8 bytes. o_base is only
; %define'd further down in the file, which works because a %define body is
; expanded where the macro is used, not where it is defined.
; NOTE(review): the reason for the -8 bias is not visible in this excerpt.
125*c0909341SAndroid Build Coastguard Worker%define o_idct64_offset idct64_mul - (o_base) - 8
126*c0909341SAndroid Build Coastguard Worker
127*c0909341SAndroid Build Coastguard WorkerSECTION .text
128*c0909341SAndroid Build Coastguard Worker
129*c0909341SAndroid Build Coastguard Worker; Code size reduction trickery: Instead of using rip-relative loads with
130*c0909341SAndroid Build Coastguard Worker; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
131*c0909341SAndroid Build Coastguard Worker; single rip-relative lea and then address things relative from that with
132*c0909341SAndroid Build Coastguard Worker; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
;
; o_base: the address the base pointer is set to (128 bytes past deint_shuf).
; o(x):   address of constant x written as r6 + (x - o_base). r6 has to hold
;         o_base already; INV_TXFM_FN sets it up with `lea r6, [o_base]`.
; m(x):   symbol name of the function x, assembled from private_prefix, x and
;         SUFFIX and passed through mangle().
;         NOTE(review): mangle, private_prefix and SUFFIX are not defined in
;         this excerpt - presumably they come from x86inc.asm.
133*c0909341SAndroid Build Coastguard Worker%define o_base deint_shuf + 128
134*c0909341SAndroid Build Coastguard Worker%define o(x) (r6 - (o_base) + (x))
135*c0909341SAndroid Build Coastguard Worker%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
136*c0909341SAndroid Build Coastguard Worker
; ITX_MUL2X_PACK
; Multiplies the 16-bit word pairs held in m%1 by two coefficient pairs with
; pmaddwd, adds the rounding term and narrows the 32-bit sums back to 16 bits.
;   %1      dst/src register index
;   %2, %3  scratch register indices (%3 is not touched when flags & 4)
;   %4      index of the register holding the 32-bit rounding term
;   %5, %6  coefficients: numbers pasted into the pw_%5_%6 / pw_m%6_%5 labels,
;           or - when flags & 4 - indices of registers that already hold the
;           two coefficient vectors (m%5 is applied to the scratch copy, m%6
;           to m%1)
;   %7      flags, default 0 (see below)
; The memory-operand path addresses constants through o(), so r6 must hold
; o_base.
137*c0909341SAndroid Build Coastguard Worker; flags: 1 = swap, 2 = interleave, 4 = coef_regs
138*c0909341SAndroid Build Coastguard Worker%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
139*c0909341SAndroid Build Coastguard Worker%if %7 & 4
140*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m%5, m%1
141*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m%6
142*c0909341SAndroid Build Coastguard Worker%else
    ; swap only exchanges which of the two coefficient pairs ends up in m%2
    ; (multiplied into the scratch copy) and which in m%3 (multiplied into m%1)
143*c0909341SAndroid Build Coastguard Worker%if %7 & 1
144*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%2, [o(pw_%5_%6)]
145*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [o(pw_m%6_%5)]
146*c0909341SAndroid Build Coastguard Worker%else
147*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%2, [o(pw_m%6_%5)]
148*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [o(pw_%5_%6)]
149*c0909341SAndroid Build Coastguard Worker%endif
150*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m%1
151*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m%3
152*c0909341SAndroid Build Coastguard Worker%endif
153*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%4
154*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%4
155*c0909341SAndroid Build Coastguard Worker%if %7 & 2
    ; interleave: shifting m%2 left by 4 moves bits 12-27 of each sum into the
    ; upper word of its dword, shifting m%1 right by 12 leaves its result in
    ; the lower word; the 0xaa blend then takes the upper words from m%2
156*c0909341SAndroid Build Coastguard Worker    pslld               m%2, 4
157*c0909341SAndroid Build Coastguard Worker    psrld               m%1, 12
158*c0909341SAndroid Build Coastguard Worker    pblendw             m%1, m%2, 0xaa
159*c0909341SAndroid Build Coastguard Worker%else
    ; default: arithmetic >> 12, then a signed-saturating pack that puts the
    ; m%1 results in the low half and the m%2 results in the high half of
    ; each 128-bit lane
160*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 12
161*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 12
162*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%2
163*c0909341SAndroid Build Coastguard Worker%endif
164*c0909341SAndroid Build Coastguard Worker%endmacro
165*c0909341SAndroid Build Coastguard Worker
; ITX_MUL4X_PACK
; Like ITX_MUL2X_PACK, but with a different coefficient pair per 128-bit half:
; the low half of m%1 is multiplied by the (%6, %7) pair and the high half by
; the (%8, %9) pair.
;   %1      dst/src register index
;   %2-%4   scratch register indices (all three are overwritten)
;   %5      index of the register holding the 32-bit rounding term
;   %6-%9   numeric coefficients, pasted into pw_* label names
;   %10     flags, default 0. Bit 4 (coef_regs) is always OR'ed in for the
;           nested ITX_MUL2X_PACK call, because the coefficient vectors are
;           built in registers here.
; NOTE(review): the xm/m register mixing only makes sense with m%n being YMM
; registers - confirm the macro is only used under INIT_YMM.
166*c0909341SAndroid Build Coastguard Worker; flags: 1 = swap, 2 = interleave, 4 = coef_regs
167*c0909341SAndroid Build Coastguard Worker%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags
168*c0909341SAndroid Build Coastguard Worker%if %10 & 1
169*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [o(pw_%8_%9)]
170*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_m%9_%8)]
171*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm%2, [o(pw_%6_%7)]
172*c0909341SAndroid Build Coastguard Worker    vpblendd            m%2, m%3, 0xf0
173*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm%3, [o(pw_m%7_%6)]
174*c0909341SAndroid Build Coastguard Worker%else
175*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [o(pw_m%9_%8)]
176*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_%8_%9)]
177*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm%2, [o(pw_m%7_%6)]
178*c0909341SAndroid Build Coastguard Worker    vpblendd            m%2, m%3, 0xf0
179*c0909341SAndroid Build Coastguard Worker    vpbroadcastd       xm%3, [o(pw_%6_%7)]
180*c0909341SAndroid Build Coastguard Worker%endif
    ; 0xf0 takes the upper four dwords from the second operand, so m%2 and m%3
    ; each end up with the first pair in the low half, the second in the high
181*c0909341SAndroid Build Coastguard Worker    vpblendd            m%3, m%4, 0xf0
    ; %4 is reused as the single scratch register; the second scratch slot is
    ; a dummy ("_") because the coef_regs path never touches it
182*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
183*c0909341SAndroid Build Coastguard Worker%endmacro
184*c0909341SAndroid Build Coastguard Worker
; ITX_MULSUB_2W
; Rotation-style multiply of two vectors of 16-bit values:
185*c0909341SAndroid Build Coastguard Worker; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
186*c0909341SAndroid Build Coastguard Worker; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
;   %1, %2  src1/dst1 and src2/dst2 register indices
;   %3, %4  scratch register indices
;   %5      index of the register holding the 32-bit rounding term
;   %6, %7  coefficients. If %7 < 32 both are taken as register indices:
;           m%7 is multiplied in for dst1 and m%6 for dst2, so they must
;           already hold the vectors the other path loads from pw_m%7_%6 and
;           pw_%6_%7. Otherwise they are pasted into those two label names
;           and loaded through o() (r6 must then hold o_base).
;   %8      optional: index of the register that receives dst2 instead of m%2
;           (m%2 is clobbered either way)
; Results are packed with signed saturation.
187*c0909341SAndroid Build Coastguard Worker%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
    ; interleave so that each dword holds (src2 word, src1 word) for pmaddwd
188*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%3, m%2, m%1
189*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%2, m%1
190*c0909341SAndroid Build Coastguard Worker%if %7 < 32
191*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m%7, m%2
192*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m%7, m%3
193*c0909341SAndroid Build Coastguard Worker%else
194*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%1, [o(pw_m%7_%6)]
195*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%4, m%3, m%1
196*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%1, m%2
197*c0909341SAndroid Build Coastguard Worker%endif
198*c0909341SAndroid Build Coastguard Worker    paddd               m%4, m%5
199*c0909341SAndroid Build Coastguard Worker    paddd               m%1, m%5
200*c0909341SAndroid Build Coastguard Worker    psrad               m%4, 12
201*c0909341SAndroid Build Coastguard Worker    psrad               m%1, 12
    ; dst1 is complete here; m%4 is free again and reused below
202*c0909341SAndroid Build Coastguard Worker    packssdw            m%1, m%4
203*c0909341SAndroid Build Coastguard Worker%if %7 < 32
204*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m%6
205*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m%6
206*c0909341SAndroid Build Coastguard Worker%else
207*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%4, [o(pw_%6_%7)]
208*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%3, m%4
209*c0909341SAndroid Build Coastguard Worker    pmaddwd             m%2, m%4
210*c0909341SAndroid Build Coastguard Worker%endif
211*c0909341SAndroid Build Coastguard Worker    paddd               m%3, m%5
212*c0909341SAndroid Build Coastguard Worker    paddd               m%2, m%5
213*c0909341SAndroid Build Coastguard Worker    psrad               m%3, 12
214*c0909341SAndroid Build Coastguard Worker    psrad               m%2, 12
215*c0909341SAndroid Build Coastguard Worker%if %0 == 8
216*c0909341SAndroid Build Coastguard Worker    packssdw            m%8, m%2, m%3
217*c0909341SAndroid Build Coastguard Worker%else
218*c0909341SAndroid Build Coastguard Worker    packssdw            m%2, m%3
219*c0909341SAndroid Build Coastguard Worker%endif
220*c0909341SAndroid Build Coastguard Worker%endmacro
221*c0909341SAndroid Build Coastguard Worker
; IDCT4_1D
; One 4-point inverse DCT pass over the registers m%1-m%4, using saturating
; 16-bit adds/subtracts for the final butterflies.
;   %1-%4  input/output register indices
;   %5, %6 scratch register indices (overwritten)
;   %7     index of the register holding the rounding term (pd_2048)
; On exit, with t0-t3 as named in the comments below:
;   m%1 = t0 + t3, m%2 = t1 + t2, m%3 = t1 - t2, m%4 = t0 - t3
222*c0909341SAndroid Build Coastguard Worker%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
223*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
224*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
225*c0909341SAndroid Build Coastguard Worker    psubsw              m%3, m%1, m%2
226*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m%1
227*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%4, m%5
228*c0909341SAndroid Build Coastguard Worker    psubsw              m%4, m%5
229*c0909341SAndroid Build Coastguard Worker%endmacro
230*c0909341SAndroid Build Coastguard Worker
; IDCT8_1D
; One 8-point inverse DCT pass over the registers m%1-m%8.
;   %1-%8   input/output register indices
;   %9, %10 scratch register indices (overwritten)
;   %11     index of the register holding the rounding term (pd_2048)
; As the trailing comments show, the results land in order: out0 in m%1
; through out7 in m%8. Constants are loaded through o(), so r6 must hold
; o_base.
231*c0909341SAndroid Build Coastguard Worker%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
232*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
233*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %2, %8, %9, %10, %11,  799, 4017 ; t4a, t7a
234*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
235*c0909341SAndroid Build Coastguard Worker    paddsw              m%9, m%2, m%6  ; t4
236*c0909341SAndroid Build Coastguard Worker    psubsw              m%2, m%6       ; t5a
237*c0909341SAndroid Build Coastguard Worker    paddsw             m%10, m%8, m%4  ; t7
238*c0909341SAndroid Build Coastguard Worker    psubsw              m%8, m%4       ; t6a
    ; m%4 and m%6 are free at this point and serve as scratch for the next
    ; two multiplies, since m%9 / m%10 now hold t4 / t7
239*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
240*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
241*c0909341SAndroid Build Coastguard Worker    psubsw              m%6, m%1, m%3  ; dct4 out2
242*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m%1       ; dct4 out1
243*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%5, m%7  ; dct4 out0
244*c0909341SAndroid Build Coastguard Worker    psubsw              m%5, m%7       ; dct4 out3
245*c0909341SAndroid Build Coastguard Worker    psubsw              m%7, m%3, m%2  ; out6
246*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m%3       ; out1
247*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m%6, m%8  ; out2
248*c0909341SAndroid Build Coastguard Worker    psubsw              m%6, m%8       ; out5
249*c0909341SAndroid Build Coastguard Worker    psubsw              m%8, m%1, m%10 ; out7
250*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%10      ; out0
251*c0909341SAndroid Build Coastguard Worker    paddsw              m%4, m%5, m%9  ; out3
252*c0909341SAndroid Build Coastguard Worker    psubsw              m%5, m%9       ; out4
253*c0909341SAndroid Build Coastguard Worker%endmacro
254*c0909341SAndroid Build Coastguard Worker
; IDCT16_1D_ODDHALF
; Processes the eight odd-indexed inputs of a 16-point inverse DCT.
; Input register assignment:
255*c0909341SAndroid Build Coastguard Worker; in1 = %1, in3  = %2, in5  = %3, in7  = %4
256*c0909341SAndroid Build Coastguard Worker; in9 = %5, in11 = %6, in13 = %7, in15 = %8
;   %9, %10  scratch register indices (overwritten)
;   %11      index of the register holding the rounding term (pd_2048)
; Following the trailing comments, the registers hold on exit:
;   m%1 = t8a, m%2 = t9,   m%3 = t10a, m%4 = t11,
;   m%5 = t12, m%6 = t13a, m%7 = t14,  m%8 = t15a
; Constants are loaded through o(), so r6 must hold o_base.
257*c0909341SAndroid Build Coastguard Worker%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048
258*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %1, %8, %9, %10, %11,  401, 4076 ; t8a,  t15a
259*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %5, %4, %9, %10, %11, 3166, 2598 ; t9a,  t14a
260*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
261*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
262*c0909341SAndroid Build Coastguard Worker    psubsw              m%9, m%2, m%6 ; t13
263*c0909341SAndroid Build Coastguard Worker    paddsw              m%6, m%2      ; t12
264*c0909341SAndroid Build Coastguard Worker    psubsw              m%2, m%8, m%4 ; t14
265*c0909341SAndroid Build Coastguard Worker    paddsw              m%8, m%4      ; t15
266*c0909341SAndroid Build Coastguard Worker    psubsw              m%4, m%7, m%3 ; t10
267*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m%7      ; t11
268*c0909341SAndroid Build Coastguard Worker    psubsw              m%7, m%1, m%5 ; t9
269*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%5      ; t8
    ; m%9 holds t13 from here on, so m%5 takes over as the first scratch
    ; register for the next two multiplies; the "m3784" coefficient selects
    ; the pw_m1567_m3784 / pw_m3784_1567 constants
270*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %2, %7, %5, %10, %11,  1567, 3784 ; t9a,  t14a
271*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
272*c0909341SAndroid Build Coastguard Worker    psubsw              m%5, m%1, m%3 ; t11a
273*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%3      ; t8a
274*c0909341SAndroid Build Coastguard Worker    psubsw              m%3, m%7, m%4 ; t13
275*c0909341SAndroid Build Coastguard Worker    paddsw              m%7, m%4      ; t14
276*c0909341SAndroid Build Coastguard Worker    psubsw              m%4, m%8, m%6 ; t12a
277*c0909341SAndroid Build Coastguard Worker    paddsw              m%8, m%6      ; t15a
278*c0909341SAndroid Build Coastguard Worker    psubsw              m%6, m%2, m%9 ; t10
279*c0909341SAndroid Build Coastguard Worker    paddsw              m%2, m%9      ; t9
280*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
281*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        %4, %5, %9, %10, %11, 2896, 2896 ; t11,  t12
282*c0909341SAndroid Build Coastguard Worker%endmacro
283*c0909341SAndroid Build Coastguard Worker
; WRAP_XMM <line>
; Assembles the given line (typically a macro invocation) with the register
; aliases switched to XMM, then switches back to YMM. Note that it always
; ends in YMM mode, whatever mode was active before.
284*c0909341SAndroid Build Coastguard Worker%macro WRAP_XMM 1+
285*c0909341SAndroid Build Coastguard Worker    INIT_XMM cpuname
286*c0909341SAndroid Build Coastguard Worker    %1
287*c0909341SAndroid Build Coastguard Worker    INIT_YMM cpuname
288*c0909341SAndroid Build Coastguard Worker%endmacro
289*c0909341SAndroid Build Coastguard Worker
; ITX4_END
; Common tail of the 4x4 functions: optionally scales the residual in m0/m1,
; adds it to four 4-pixel destination rows, stores the result and returns.
;   %1-%4  destination row number (0-3) for, in this order, the low half of
;          m0, the high half of m0, the low half of m1, the high half of m1
;   %5     rounding/scale constant, default 2048: m0 and m1 are multiplied by
;          pw_<%5> with pmulhrsw; pass 0 to skip the scaling altogether
; Overwrites m2, m3 and r2. A non-zero %5 needs r6 = o_base for o().
; NOTE(review): movd/pinsrd on m2/m3 imply this is expanded in XMM mode.
290*c0909341SAndroid Build Coastguard Worker%macro ITX4_END 4-5 2048 ; row[1-4], rnd
291*c0909341SAndroid Build Coastguard Worker%if %5
292*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_%5)]
293*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
294*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
295*c0909341SAndroid Build Coastguard Worker%endif
296*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*2]
    ; Build %%row_adr1..4: bit 1 of the row number picks the base (r2 =
    ; dst + 2*stride, or dstq), bit 0 adds one more stride.
297*c0909341SAndroid Build Coastguard Worker%assign %%i 1
298*c0909341SAndroid Build Coastguard Worker%rep 4
299*c0909341SAndroid Build Coastguard Worker    %if %1 & 2
300*c0909341SAndroid Build Coastguard Worker        CAT_XDEFINE %%row_adr, %%i, r2   + strideq*(%1&1)
301*c0909341SAndroid Build Coastguard Worker    %else
302*c0909341SAndroid Build Coastguard Worker        CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
303*c0909341SAndroid Build Coastguard Worker    %endif
304*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i + 1
305*c0909341SAndroid Build Coastguard Worker    %rotate 1
306*c0909341SAndroid Build Coastguard Worker%endrep
    ; load 4 bytes per row, widen to 16 bit, add the residual, pack back to
    ; bytes with unsigned saturation and store 4 bytes per row
307*c0909341SAndroid Build Coastguard Worker    movd                 m2, [%%row_adr1]
308*c0909341SAndroid Build Coastguard Worker    pinsrd               m2, [%%row_adr2], 1
309*c0909341SAndroid Build Coastguard Worker    movd                 m3, [%%row_adr3]
310*c0909341SAndroid Build Coastguard Worker    pinsrd               m3, [%%row_adr4], 1
311*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m2, m2
312*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m3, m3
313*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2
314*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
315*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
316*c0909341SAndroid Build Coastguard Worker    movd       [%%row_adr1], m0
317*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr2], m0, 1
318*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr3], m0, 2
319*c0909341SAndroid Build Coastguard Worker    pextrd     [%%row_adr4], m0, 3
320*c0909341SAndroid Build Coastguard Worker    ret
321*c0909341SAndroid Build Coastguard Worker%endmacro
322*c0909341SAndroid Build Coastguard Worker
; IWHT4_1D_PACKED
; One pass of the 4-point inverse Walsh-Hadamard transform on packed data,
; using plain (non-saturating) 16-bit adds, subtracts and one >> 1.
; Takes no arguments and works on fixed registers:
;   in:  m0, m1 (two inputs per register; the first two instructions regroup
;        them into "in1 in3" and "in0 in2")
;   out: spread over m0, m1 and m2 as marked by the trailing comments
;        ("____" denotes a half whose content is not a result)
;   m3 is overwritten as scratch
323*c0909341SAndroid Build Coastguard Worker%macro IWHT4_1D_PACKED 0
324*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m0, m1 ; in1 in3
325*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1     ; in0 in2
326*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0, m3
327*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
328*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2     ; t2 t2
329*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m0     ; t0 t0
330*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, m2
331*c0909341SAndroid Build Coastguard Worker    psraw                m1, 1
332*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3     ; t1 t3
333*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1     ; ____ out0
334*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1     ; out3 ____
335*c0909341SAndroid Build Coastguard Worker%endmacro
336*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_wht_wht_4x4_8bpc(dst, stride, c)
; 4x4 inverse Walsh-Hadamard transform in both directions, added to the
; destination block.
;   dst, stride  destination pixels (four rows of 4 bytes are read and written)
;   c            32 bytes of 16-bit coefficients, read with aligned loads and
;                overwritten with zeros
; Uses m0-m3 and r2 (the latter via ITX4_END, which also issues the ret).
337*c0909341SAndroid Build Coastguard WorkerINIT_XMM avx2
338*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c
339*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
340*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
    ; clear the coefficient buffer once it has been loaded
341*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
342*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m2
343*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m2
344*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
345*c0909341SAndroid Build Coastguard Worker    psraw                m1, 2
346*c0909341SAndroid Build Coastguard Worker    IWHT4_1D_PACKED
    ; regroup the first-pass results spread over m0/m1/m2 into m0/m1 as input
    ; for the second pass
347*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
348*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1, m2
349*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m3
350*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m3
351*c0909341SAndroid Build Coastguard Worker    IWHT4_1D_PACKED
    ; merge: low 64 bits (out3) from m2, high 64 bits (out0) kept from m0
352*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x03
    ; rows 3, 0, 2, 1 match the resulting register layout; rnd = 0 means the
    ; residual is added without any further scaling
353*c0909341SAndroid Build Coastguard Worker    ITX4_END              3, 0, 2, 1, 0
354*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_FN type1, type2, size
; Emits the entry point inv_txfm_add_<type1>_<type2>_<size>_8bpc
; (dst, stride, c, eob; tx2 is an extra register). It
;   - loads r6 with o_base so that o() addressing works,
;   - loads tx2q with the address of the .pass2 label of the second
;     transform's internal function (i<type2>_<size>_internal_8bpc),
;   - transfers control to the first transform's internal function
;     (i<type1>_<size>_internal_8bpc).
; For dct_dct the jump is only taken when eob is non-zero; otherwise execution
; falls through to whatever the invoking macro emits right after this one
; (the fast path).
355*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_FN 3 ; type1, type2, size
356*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2
357*c0909341SAndroid Build Coastguard Worker    %define %%p1 m(i%1_%3_internal_8bpc)
358*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
359*c0909341SAndroid Build Coastguard Worker    ; Jump to the 1st txfm function if we're not taking the fast path, which
360*c0909341SAndroid Build Coastguard Worker    ; in turn performs an indirect jump to the 2nd txfm function.
361*c0909341SAndroid Build Coastguard Worker    lea                tx2q, [m(i%2_%3_internal_8bpc).pass2]
362*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
363*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
364*c0909341SAndroid Build Coastguard Worker    jnz %%p1
365*c0909341SAndroid Build Coastguard Worker%else
366*c0909341SAndroid Build Coastguard Worker    ; jump to the 1st txfm function unless it's located directly after this
    ; (%%end - %%p1) is zero when the 1st txfm function starts exactly at
    ; %%end and a small negative number when it starts further on, so bit 31
    ; of the difference makes `times` emit either no jmp or exactly one.
    ; NOTE(review): a 1st txfm function placed *before* this stub would give
    ; a positive difference and hence no jmp - confirm that layout never
    ; occurs.
367*c0909341SAndroid Build Coastguard Worker    times ((%%end - %%p1) >> 31) & 1 jmp %%p1
368*c0909341SAndroid Build Coastguard WorkerALIGN function_align
369*c0909341SAndroid Build Coastguard Worker%%end:
370*c0909341SAndroid Build Coastguard Worker%endif
371*c0909341SAndroid Build Coastguard Worker%endmacro
372*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X4_FN type1, type2
; Emits the 4x4 entry point for the given transform pair via INV_TXFM_FN.
; For dct_dct it additionally emits the fast path that INV_TXFM_FN falls
; through to when eob is zero: the first coefficient is broadcast, multiplied
; twice by pw_2896x8 with pmulhrsw, duplicated into m1, and control continues
; at the .end2 label of iadst_4x4_internal_8bpc.
373*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X4_FN 2 ; type1, type2
374*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x4
375*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
376*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, [cq]
377*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2896x8)]
378*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1
    ; eobd is known to be zero on this path (the jnz in INV_TXFM_FN was not
    ; taken), so this 32-bit store clears the start of the coefficient buffer
379*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; 0
380*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1
381*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
382*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x4_internal_8bpc).end2
383*c0909341SAndroid Build Coastguard Worker%endif
384*c0909341SAndroid Build Coastguard Worker%endmacro
385*c0909341SAndroid Build Coastguard Worker
; IDCT4_1D_PACKED
; One 4-point inverse DCT pass on packed data held in fixed registers.
;   in:  m0, m1
;   out: m0 = out0 out1, m1 = out3 out2 (note the swapped order in m1)
;   m2, m3 and m4 are overwritten (m4 is left holding pd_2048)
; Needs r6 = o_base, since the constants are loaded through o().
386*c0909341SAndroid Build Coastguard Worker%macro IDCT4_1D_PACKED 0
387*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pd_2048)]
388*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m0
389*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
    ; m0 is free after the interleave and serves as scratch for both multiplies
390*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 0, 3, 4, 1567, 3784
391*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 0, 3, 4, 2896, 2896
392*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1, m2 ; out0 out1
393*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2     ; out3 out2
394*c0909341SAndroid Build Coastguard Worker%endmacro
395*c0909341SAndroid Build Coastguard Worker
; IADST4_1D_PACKED
; One 4-point inverse ADST pass on packed data held in fixed registers.
;   in:  m0, m1 (the in0-in3 naming follows the trailing comments)
;   out: m0 = out0 out1, m1 = out2 out3
;   m2-m5 are overwritten as scratch
; Each of the four 32-bit sums receives the pd_2048 rounding term exactly
; once before the ">> 12" and the signed-saturating pack.
; Needs r6 = o_base, since the constants are loaded through o().
396*c0909341SAndroid Build Coastguard Worker%macro IADST4_1D_PACKED 0
397*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
398*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1, m0
399*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m3344_3344)]
400*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_3803_1321)]
401*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_m1321_2482)]
402*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, m2 ; 3344*in3 - 3344*in2
    ; the shift turns each (-3344, 3344) pair into (3344, 0), so the pmaddwd
    ; below keeps only the product with the low word of each pair in m3
403*c0909341SAndroid Build Coastguard Worker    psrld                m5, 16
404*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m2
405*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m4
406*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3 ; 3344*in0
407*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
408*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2482_3344)]
409*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m3803_3344)]
410*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3
411*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3
412*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
413*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_m3803_m6688)]
414*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m0
    ; m0 is reused for the rounding term; it is folded into m2 and m1
    ; directly and reaches m5 through m2
415*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pd_2048)]
416*c0909341SAndroid Build Coastguard Worker    paddd                m2, m0
417*c0909341SAndroid Build Coastguard Worker    paddd                m1, m0
418*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
419*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
420*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
421*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
422*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m2, m0, m5
423*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5 ; out0 out1
424*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m2 ; out2 out3
425*c0909341SAndroid Build Coastguard Worker%endmacro
426*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points whose first argument is dct, one per
; second-argument type. INV_TXFM_4X4_FN is defined outside this view;
; its arguments are presumably (type1, type2) as annotated on
; INV_TXFM_4X8_FN later in the file.
427*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, dct
428*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, adst
429*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, flipadst
430*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN dct, identity
431*c0909341SAndroid Build Coastguard Worker
; idct_4x4_internal_8bpc
;   First pass (function entry): loads 2 x 16 bytes of coefficients from cq
;   into m0/m1, runs IDCT4_1D_PACKED, reorders the result with shufps plus a
;   pshufb through deint_shuf, then jumps to the address held in tx2q.
;   .pass2: runs IDCT4_1D_PACKED on m0/m1 again, zeroes the same 32 bytes at
;   cq and finishes through ITX4_END (defined outside this view).
;   cglobal operands "0, 5, 6" follow the x86inc convention (args loaded,
;   GPRs used, vector registers used); named registers: dst, stride, c, eob,
;   tx2.
;   NOTE(review): the 16-byte row spacing suggests this is assembled with
;   mmsize == 16; the INIT_* directive in effect is outside this view.
432*c0909341SAndroid Build Coastguard Workercglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
433*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
434*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
435*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
    ; IDCT4_1D_PACKED leaves m0 = out0 out1 and m1 = out3 out2; the shuffles
    ; below regroup those words into the layout the second pass consumes.
436*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(deint_shuf)]
437*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, m1, q1331
438*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, q0220
439*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
440*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m2
    ; Tail-jump to the second-pass routine selected by the entry point.
441*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
442*c0909341SAndroid Build Coastguard Worker.pass2:
443*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
    ; Clear the coefficient buffer (2 x 16 bytes) after it has been consumed.
444*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
445*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m2
446*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m2
    ; Argument order 0, 1, 3, 2 matches the "out3 out2" ordering that
    ; IDCT4_1D_PACKED leaves in m1.
447*c0909341SAndroid Build Coastguard Worker    ITX4_END              0, 1, 3, 2
448*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points whose first argument is adst
; (INV_TXFM_4X4_FN is defined outside this view).
449*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, dct
450*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, adst
451*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, flipadst
452*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN adst, identity
453*c0909341SAndroid Build Coastguard Worker
; iadst_4x4_internal_8bpc
;   First pass (function entry): loads 2 x 16 bytes of coefficients from cq,
;   calls .main (IADST4_1D_PACKED), performs two rounds of word interleaving
;   on m0/m1 and jumps to the address held in tx2q.
;   .pass2: calls .main again and falls through to .end.
;   .end:   zeroes the 32 bytes at cq; also the jump target of
;           iidentity_4x4_internal_8bpc.pass2.
;   .end2:  finishes through ITX4_END 0, 1, 2, 3 (macro defined outside this
;           view).
;   .main:  callable helper (exported with cglobal_label); also called by
;           iflipadst_4x4_internal_8bpc. In: m0, m1. Out: m0 = out0 out1,
;           m1 = out2 out3. Clobbers m2-m5.
454*c0909341SAndroid Build Coastguard Workercglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
455*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
456*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
457*c0909341SAndroid Build Coastguard Worker    call .main
    ; Two rounds of punpck{l,h}wd on (m0, m1); m3 is the only scratch.
    ; NOTE(review): this looks like a 4x4 word transpose between the passes --
    ; confirm against the expected second-pass input layout.
458*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
459*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
460*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
461*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
462*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
463*c0909341SAndroid Build Coastguard Worker.pass2:
464*c0909341SAndroid Build Coastguard Worker    call .main
465*c0909341SAndroid Build Coastguard Worker.end:
    ; Clear the coefficient buffer (2 x 16 bytes).
466*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
467*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m2
468*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m2
469*c0909341SAndroid Build Coastguard Worker.end2:
470*c0909341SAndroid Build Coastguard Worker    ITX4_END              0, 1, 2, 3
471*c0909341SAndroid Build Coastguard WorkerALIGN function_align
472*c0909341SAndroid Build Coastguard Workercglobal_label .main
473*c0909341SAndroid Build Coastguard Worker    IADST4_1D_PACKED
474*c0909341SAndroid Build Coastguard Worker    ret
475*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points whose first argument is flipadst
; (INV_TXFM_4X4_FN is defined outside this view).
476*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, dct
477*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, adst
478*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, flipadst
479*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN flipadst, identity
480*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x4_internal_8bpc
;   Reuses iadst_4x4_internal_8bpc.main for the 1-D transform in both passes.
;   First pass: the word interleaves take (m1, m0) where the iadst first pass
;   takes (m0, m1), and the low/high roles are swapped.
;   Second pass: ITX4_END receives 3, 2, 1, 0 where iadst passes 0, 1, 2, 3.
;   .end / .end2 are this function's own local labels (same shape as the
;   iadst ones: zero 32 bytes at cq, then ITX4_END).
481*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
482*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
483*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
484*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_8bpc).main
    ; Interleave with the operands swapped relative to the iadst first pass.
    ; NOTE(review): presumably this yields the transposed result in flipped
    ; order -- confirm against the second-pass input layout.
485*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m0
486*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
487*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
488*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
489*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
490*c0909341SAndroid Build Coastguard Worker.pass2:
491*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x4_internal_8bpc).main
492*c0909341SAndroid Build Coastguard Worker.end:
    ; Clear the coefficient buffer (2 x 16 bytes).
493*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
494*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*0], m2
495*c0909341SAndroid Build Coastguard Worker    mova          [cq+16*1], m2
496*c0909341SAndroid Build Coastguard Worker.end2:
497*c0909341SAndroid Build Coastguard Worker    ITX4_END              3, 2, 1, 0
498*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x4 entry points whose first argument is identity
; (INV_TXFM_4X4_FN is defined outside this view).
499*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, dct
500*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, adst
501*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, flipadst
502*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X4_FN identity, identity
503*c0909341SAndroid Build Coastguard Worker
; iidentity_4x4_internal_8bpc
;   Both passes compute x + pmulhrsw(x, pw_1697x8) with a saturating add,
;   i.e. x + ((x*c + 0x4000) >> 15) per word.
;   NOTE(review): if pw_1697x8 is 1697*8 as the name suggests, this is
;   approximately x * sqrt(2) -- constant is defined outside this view.
;   First pass: scale, two rounds of word interleaving, jump via tx2q.
;   Second pass: scale, then reuse the tail of iadst_4x4_internal_8bpc (.end:
;   zero the coefficient buffer and run ITX4_END 0, 1, 2, 3).
504*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2
505*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+16*0]
506*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+16*1]
507*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
    ; m2 = pmulhrsw(m0, c), m3 = pmulhrsw(m1, c); m3 is consumed as the
    ; constant and overwritten with the second product.
508*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
509*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
510*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
511*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
    ; Same two-round word interleave as the iadst 4x4 first pass.
512*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1
513*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
514*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m2
515*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
516*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
517*c0909341SAndroid Build Coastguard Worker.pass2:
518*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
519*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
520*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
521*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
522*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
523*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x4_internal_8bpc).end
524*c0909341SAndroid Build Coastguard Worker
; WRITE_4X8 coef_a, coef_b
;   Adds two ymm registers of 16-bit residuals to a 4-pixel-wide, 8-row block
;   of bytes in memory and stores the result with unsigned saturation.
;   %1, %2:   register numbers (used as m%1 / m%2).
;   Expects:  dstq   = address of row 0, strideq = row pitch,
;             r3     = offset of row 3 from dstq (the visible caller sets
;                      r3 = strideq*3),
;             r2     = address of row 4 (the visible caller sets
;                      r2 = dstq + strideq*4).
;   Layout:   m%1 must hold rows 0,1 in its low 128-bit lane and rows 4,5 in
;             its high lane; m%2 rows 2,3 (low) and rows 6,7 (high). This
;             follows from the pinsrd order and the per-lane packuswb below.
;   Clobbers: m4, m5.
525*c0909341SAndroid Build Coastguard Worker%macro WRITE_4X8 2 ; coefs[1-2]
    ; Gather 4 bytes per row: xm4 = rows 0,1,4,5 and xm5 = rows 2,3,6,7.
526*c0909341SAndroid Build Coastguard Worker    movd                xm4, [dstq+strideq*0]
527*c0909341SAndroid Build Coastguard Worker    pinsrd              xm4, [dstq+strideq*1], 1
528*c0909341SAndroid Build Coastguard Worker    movd                xm5, [dstq+strideq*2]
529*c0909341SAndroid Build Coastguard Worker    pinsrd              xm5, [dstq+r3       ], 1
530*c0909341SAndroid Build Coastguard Worker    pinsrd              xm4, [r2  +strideq*0], 2
531*c0909341SAndroid Build Coastguard Worker    pinsrd              xm4, [r2  +strideq*1], 3
532*c0909341SAndroid Build Coastguard Worker    pinsrd              xm5, [r2  +strideq*2], 2
533*c0909341SAndroid Build Coastguard Worker    pinsrd              xm5, [r2  +r3       ], 3
    ; Zero-extend the 16 bytes of each xmm to 16 words in the full ymm.
534*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m4, xm4
535*c0909341SAndroid Build Coastguard Worker    pmovzxbw             m5, xm5
536*c0909341SAndroid Build Coastguard Worker    paddw                m4, m%1
537*c0909341SAndroid Build Coastguard Worker    paddw                m5, m%2
    ; Per-lane pack with unsigned saturation: low lane = rows 0-3,
    ; high lane = rows 4-7.
538*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m5
539*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m4, 1
    ; Scatter 4 bytes back to each row: xm4 -> rows 0-3, xm5 -> rows 4-7.
540*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], xm4
541*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*1], xm4, 1
542*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+strideq*2], xm4, 2
543*c0909341SAndroid Build Coastguard Worker    pextrd [dstq+r3       ], xm4, 3
544*c0909341SAndroid Build Coastguard Worker    movd   [r2  +strideq*0], xm5
545*c0909341SAndroid Build Coastguard Worker    pextrd [r2  +strideq*1], xm5, 1
546*c0909341SAndroid Build Coastguard Worker    pextrd [r2  +strideq*2], xm5, 2
547*c0909341SAndroid Build Coastguard Worker    pextrd [r2  +r3       ], xm5, 3
548*c0909341SAndroid Build Coastguard Worker%endmacro
549*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X8_FN type1, type2
;   Expands INV_TXFM_FN (defined outside this view) for the 4x8 block size.
;   For the dct_dct combination only, it additionally emits a shortcut that
;   derives a single value from the first coefficient, broadcasts it to every
;   word of m0 and m1, and jumps to iadst_4x8_internal_8bpc.end3, which adds
;   m0/m1 to the destination via WRITE_4X8.
;   NOTE(review): presumably INV_TXFM_FN arranges for this code to run only
;   when just the DC coefficient is present -- confirm in INV_TXFM_FN.
550*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X8_FN 2 ; type1, type2
551*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x8
552*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
553*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
    ; xm0 = pmulhrsw(pw_2896x8, [cq]); only the lowest word is used later
    ; (vpbroadcastw below).
554*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
555*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_2048)]
    ; Overwrite the first 4 bytes of the coefficient buffer with eobd.
    ; NOTE(review): presumably eob is 0 on this path, so this clears the
    ; consumed coefficient -- confirm against INV_TXFM_FN.
556*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; Two more multiplies by pw_2896x8, then one by pw_2048.
557*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
558*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
559*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
560*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
561*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
562*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end3
563*c0909341SAndroid Build Coastguard Worker%endif
564*c0909341SAndroid Build Coastguard Worker%endmacro
565*c0909341SAndroid Build Coastguard Worker
; IDCT8_1D_PACKED (no arguments): one 1-D 8-point inverse DCT pass (per the
; macro name) on packed 16-bit words.
;   In:       m0-m3 = input words; the interleaves below pair them as
;             annotated (in7 in1 / in3 in5 / in6 in2 / in4 in0).
;   Out:      m0 = out0 out1, m1 = out3 out2, m2 = out4 out5, m3 = out7 out6.
;   Clobbers: m4, m5, m6 (m6 holds the broadcast pd_2048 constant).
;   The multiply steps are delegated to ITX_MUL2X_PACK, which is defined
;   outside this view. Works for both xmm and ymm register widths (see the
;   mmsize conditional).
566*c0909341SAndroid Build Coastguard Worker%macro IDCT8_1D_PACKED 0
567*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pd_2048)]
568*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3, m0 ; in7 in1
569*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m2 ; in3 in5
570*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1     ; in6 in2
571*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0     ; in4 in0
572*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 0, 1, 6,  799, 4017, 3 ; t4a t7a
573*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
574*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 0, 1, 6, 1567, 3784    ; t3 t2
575*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m5, m4 ; t5a t6a (interleaved)
576*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5     ; t4  t7  (interleaved)
577*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 1, 5, 6, 2896, 2896    ; t0 t1
    ; m1 is preloaded with a coefficient pair for the next ITX_MUL2X_PACK,
    ; which is invoked with a different argument pattern ("_" placeholder).
578*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_m2896_2896)]
579*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 1, _, 6, 1, 5, 4 ; t6 t5
    ; De-interleave m4 (t4 t7 interleaved) with deint_shuf. For registers
    ; wider than 16 bytes the 16-byte table is first replicated to both
    ; lanes; otherwise it is used directly as a memory operand.
580*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
581*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m1, [o(deint_shuf)]
582*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m1
583*c0909341SAndroid Build Coastguard Worker%else
584*c0909341SAndroid Build Coastguard Worker    pshufb               m4, [o(deint_shuf)]
585*c0909341SAndroid Build Coastguard Worker%endif
586*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m3 ; tmp3 tmp2
587*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2     ; tmp0 tmp1
588*c0909341SAndroid Build Coastguard Worker    shufps               m2, m4, m0, q1032 ; t7 t6
589*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, 0xcc      ; t4 t5
    ; Final butterflies with signed saturation.
590*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3, m2 ; out0 out1
591*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m2     ; out7 out6
592*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m4 ; out4 out5
593*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4     ; out3 out2
594*c0909341SAndroid Build Coastguard Worker%endmacro
595*c0909341SAndroid Build Coastguard Worker
; IADST8_1D_PACKED pass
;   One 1-D 8-point inverse ADST pass (per the macro name) on packed 16-bit
;   words, with two code variants selected at assembly time by %1.
;   %1 == 1:  the last stage uses pmaddwd with +/-2896 pairs, adds pd_2048
;             and shifts right by 12 before packing.
;   otherwise: the last stage uses saturating add/sub followed by pmulhrsw
;             with pw_2896x8.
;   In:       m2, m3, m4, m5 (m0 and m1 are written before being read).
;   Out:      m0 = out0 -out1, m1 = out2 -out3, m2 = out4 -out5,
;             m3 = out6 -out7 -- i.e. every second output is negated, as the
;             per-line comments show; callers compensate afterwards.
;   Clobbers: m4, m5, m6 (m6 holds the broadcast pd_2048 constant).
;   ITX_MUL2X_PACK is defined outside this view.
596*c0909341SAndroid Build Coastguard Worker%macro IADST8_1D_PACKED 1 ; pass
597*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pd_2048)]
598*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m4, m3 ; 0 7
599*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5, m2 ; 2 5
600*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5     ; 4 3
601*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4     ; 6 1
602*c0909341SAndroid Build Coastguard Worker%if %1 == 1
603*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076, 3 ; t1a t0a
604*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
605*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
606*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
607*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m2 ; t5 t4
608*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t1 t0
609*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m3 ; t6 t7
610*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t2 t3
611*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
612*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
    ; m2 = deint_shuf, replicated to both lanes when registers are wider
    ; than 16 bytes.
613*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
614*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [o(deint_shuf)]
615*c0909341SAndroid Build Coastguard Worker%else
616*c0909341SAndroid Build Coastguard Worker    mova                 m2, [o(deint_shuf)]
617*c0909341SAndroid Build Coastguard Worker%endif
    ; Swap adjacent words of m1 (low and high halves separately).
618*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q2301
619*c0909341SAndroid Build Coastguard Worker    pshufhw              m1, m1, q2301
620*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m1        ; t3 t2
621*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1            ; -out7  out0
622*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4, m5        ; t7 t6
623*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5            ;  out6 -out1
624*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
625*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m2
    ; Last stage, variant 1: 32-bit products, bias by m6 (pd_2048), >>12,
    ; then saturating pack to words.
626*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m2896_2896)]
627*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, m3
628*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1
629*c0909341SAndroid Build Coastguard Worker    paddd                m2, m6
630*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
631*c0909341SAndroid Build Coastguard Worker    psrad                m2, 12
632*c0909341SAndroid Build Coastguard Worker    psrad                m5, 12
633*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m5            ; out4 -out5
634*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896_2896)]
635*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
636*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
637*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
638*c0909341SAndroid Build Coastguard Worker    paddd                m1, m6
639*c0909341SAndroid Build Coastguard Worker    psrad                m3, 12
640*c0909341SAndroid Build Coastguard Worker    psrad                m1, 12
641*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3            ; out2 -out3
642*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m4, m0        ; out6 -out7
643*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m4            ; out0 -out1
644*c0909341SAndroid Build Coastguard Worker%else
645*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 5, 6,  401, 4076 ; t0a t1a
646*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 5, 6, 1931, 3612 ; t2a t3a
647*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 5, 6, 3166, 2598 ; t4a t5a
648*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 5, 6, 3920, 1189 ; t6a t7a
649*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m2 ; t4 t5
650*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t0 t1
651*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m3 ; t6 t7
652*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t2 t3
    ; Re-pair the words of m4/m5 (via m2) into the operand layout the next
    ; two ITX_MUL2X_PACK invocations take.
653*c0909341SAndroid Build Coastguard Worker    shufps               m2, m5, m4, q1032
654*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2
655*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m2
656*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
657*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 3, 6, 3784, 1567    ; t7a t6a
658*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0, m1        ; t2 t3
659*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1            ; out0 -out7
660*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4, m5        ; t7 t6
661*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5            ; out6 -out1
    ; Last stage, variant 2: 16-bit add/sub then pmulhrsw by pw_2896x8.
662*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
663*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m0, m4, 0x33  ; out6 -out7
664*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m4, 0xcc      ; out0 -out1
665*c0909341SAndroid Build Coastguard Worker    shufps               m4, m2, m1, q1032 ; t3 t7
666*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m2, 0x33      ; t2 t6
667*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m4        ; t2-t3 t6-t7
668*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4            ; t2+t3 t6+t7
669*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5            ; out4 -out5
670*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
671*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5            ; out2 -out3
672*c0909341SAndroid Build Coastguard Worker%endif
673*c0909341SAndroid Build Coastguard Worker%endmacro
674*c0909341SAndroid Build Coastguard Worker
; Switch to 256-bit (ymm) register mode for the functions that follow, and
; instantiate the 4x8 entry points whose first argument is dct.
675*c0909341SAndroid Build Coastguard WorkerINIT_YMM avx2
676*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, dct
677*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, adst
678*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, flipadst
679*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN dct, identity
680*c0909341SAndroid Build Coastguard Worker
; idct_4x8_internal_8bpc
;   First pass (function entry): loads 2 x 32 bytes of coefficients from cq
;   with the qwords permuted to order 0,2,1,3, scales every word with
;   pmulhrsw by pw_2896x8, runs IDCT4_1D_PACKED on the ymm registers,
;   reorders the result and jumps to the address held in tx2q.
;   .pass2: splits m0/m1 into four xmm halves (xm0-xm3), calls .main (the
;   xmm-wrapped IDCT8_1D_PACKED), reassembles two ymm registers and jumps to
;   iadst_4x8_internal_8bpc.end2 with m4 = pw_2048 as the final multiplier.
;   .main:  callable helper (exported with cglobal_label). In: xm0-xm3.
;           Out: xm0-xm3 as documented on IDCT8_1D_PACKED. Clobbers xm4-xm6.
681*c0909341SAndroid Build Coastguard Workercglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
682*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
683*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
684*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
685*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
686*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
687*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
    ; Same regrouping as the 4x4 dct first pass, with deint_shuf replicated
    ; to both 128-bit lanes.
688*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m2, [o(deint_shuf)]
689*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, m1, q1331
690*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, q0220
691*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m2
692*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3, m2
693*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
694*c0909341SAndroid Build Coastguard Worker.pass2:
    ; High lanes of m0/m1 become the third and fourth xmm inputs.
695*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
696*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m1, 1
697*c0909341SAndroid Build Coastguard Worker    call .main
698*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
    ; Rebuild ymm registers: m0 = xm0 | xm2, m1 = xm1 | xm3.
699*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm2, 1
700*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm3, 1
    ; Swap the 64-bit halves inside each lane of m1 (IDCT8_1D_PACKED returns
    ; "out3 out2" / "out7 out6" there, i.e. in reversed pair order).
701*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
702*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end2
703*c0909341SAndroid Build Coastguard WorkerALIGN function_align
704*c0909341SAndroid Build Coastguard Workercglobal_label .main
705*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IDCT8_1D_PACKED
706*c0909341SAndroid Build Coastguard Worker    ret
707*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x8 entry points whose first argument is adst.
708*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, dct
709*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, adst
710*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, flipadst
711*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN adst, identity
712*c0909341SAndroid Build Coastguard Worker
; iadst_4x8_internal_8bpc
;   First pass (function entry): loads and pre-scales the coefficients like
;   idct_4x8_internal_8bpc, calls iadst_8x4_internal_8bpc.main (defined
;   outside this view), applies two rounds of word interleaving and jumps to
;   the address held in tx2q.
;   .pass2: prepares xm2-xm5 for IADST8_1D_PACKED, calls .main_pass2, rebuilds
;   two ymm registers and falls through the shared tail below.
;   Shared tail (also entered from other 4x8 functions in this file):
;     .end:  expects a multiplier in m4 and a second one in m5; merges them
;            so the upper 64 bits of each lane use m5 and the lower 64 bits
;            use m4.
;     .end2: expects the final multiplier in m4; pmulhrsw m0/m1 by it, then
;            zeroes the 64 bytes at cq.
;     .end3: expects final residuals in m0/m1; adds them to the destination
;            with WRITE_4X8 and returns.
;   .main_pass1 / .main_pass2: xmm-wrapped IADST8_1D_PACKED variants 1 and 2;
;   .main_pass2 is exported with cglobal_label.
713*c0909341SAndroid Build Coastguard Workercglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
714*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
715*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
716*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
717*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
718*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
719*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
720*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
721*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
722*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
723*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
724*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
725*c0909341SAndroid Build Coastguard Worker.pass2:
    ; IADST8_1D_PACKED reads m2-m5: xm2/xm3 = high lanes of m0/m1,
    ; xm4/xm5 = low lanes of m0/m1 with their 64-bit halves swapped.
726*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
727*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m1, 1
728*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm0, q1032
729*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q1032
730*c0909341SAndroid Build Coastguard Worker    call .main_pass2
731*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
    ; Rebuild ymm registers: m0 = xm0 | xm2, m1 = xm1 | xm3.
732*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm2, 1
733*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm3, 1
    ; m5 = 0 - m4, the negated multiplier.
734*c0909341SAndroid Build Coastguard Worker    pxor                 m5, m5
735*c0909341SAndroid Build Coastguard Worker    psubw                m5, m4
736*c0909341SAndroid Build Coastguard Worker.end:
    ; 0xcc takes dwords 2-3 of each 128-bit lane from m5 and keeps dwords 0-1
    ; from m4. With m5 = -m4 this flips the sign of the upper 64 bits of each
    ; lane, which is where IADST8_1D_PACKED leaves its negated outputs.
737*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m5, 0xcc
738*c0909341SAndroid Build Coastguard Worker.end2:
739*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
740*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
    ; x86inc macro; NOTE(review): presumably restores callee-saved XMM
    ; registers on Win64 before the tail that only needs m0-m5 -- confirm in
    ; x86inc.asm.
741*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
    ; Clear the coefficient buffer (2 x 32 bytes).
742*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
743*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m2
744*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m2
745*c0909341SAndroid Build Coastguard Worker.end3:
    ; r2 = address of row 4, r3 = 3 * stride, as WRITE_4X8 expects.
746*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*4]
747*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
748*c0909341SAndroid Build Coastguard Worker    WRITE_4X8             0, 1
749*c0909341SAndroid Build Coastguard Worker    RET
750*c0909341SAndroid Build Coastguard WorkerALIGN function_align
751*c0909341SAndroid Build Coastguard Worker.main_pass1:
752*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IADST8_1D_PACKED 1
753*c0909341SAndroid Build Coastguard Worker    ret
754*c0909341SAndroid Build Coastguard WorkerALIGN function_align
755*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2
756*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IADST8_1D_PACKED 2
757*c0909341SAndroid Build Coastguard Worker    ret
758*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x8 entry points whose first argument is flipadst.
759*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, dct
760*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, adst
761*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, flipadst
762*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN flipadst, identity
763*c0909341SAndroid Build Coastguard Worker
; iflipadst_4x8_internal_8bpc
;   Reuses the adst kernels: iadst_8x4_internal_8bpc.main in the first pass
;   and iadst_4x8_internal_8bpc.main_pass2 in the second.
;   First pass: the word interleaves take (m1, m0) where the iadst first pass
;   takes (m0, m1), with low/high roles swapped.
;   Second pass: the four xmm results are reassembled in reverse register
;   order with their 64-bit halves swapped, and the +/- multipliers are
;   handed to iadst_4x8_internal_8bpc.end with the opposite roles (m4 is the
;   negated one here).
764*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
765*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
766*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120
767*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2896x8)]
768*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
769*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
770*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
771*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1, m0
772*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
773*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m3
774*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3
775*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
776*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Same input preparation as iadst_4x8_internal_8bpc.pass2.
777*c0909341SAndroid Build Coastguard Worker    vextracti128        xm2, m0, 1
778*c0909341SAndroid Build Coastguard Worker    vextracti128        xm3, m1, 1
779*c0909341SAndroid Build Coastguard Worker    pshufd              xm4, xm0, q1032
780*c0909341SAndroid Build Coastguard Worker    pshufd              xm5, xm1, q1032
781*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass2
782*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
    ; Reassemble in reverse order: m3 = xm3 | xm1, m2 = xm2 | xm0.
783*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm1, 1
784*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm0, 1
    ; m4 = 0 - m5: here the lower 64 bits of each lane get the negated
    ; multiplier once .end blends m5 into the upper 64 bits.
785*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
786*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5
    ; Swap the 64-bit halves inside each lane while moving into m0/m1.
787*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q1032
788*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m2, q1032
789*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end
790*c0909341SAndroid Build Coastguard Worker
; Instantiate the 4x8 entry points whose first argument is identity.
791*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, dct
792*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, adst
793*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, flipadst
794*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X8_FN identity, identity
795*c0909341SAndroid Build Coastguard Worker
; iidentity_4x8_internal_8bpc
;   First pass (function entry): loads the coefficients (qword order
;   0,2,1,3), interleaves words, scales by pw_2896x8 with pmulhrsw,
;   interleaves again, then computes x + pmulhrsw(x, pw_1697x8) with a
;   saturating add and jumps to the address held in tx2q.
;   .pass2: performs no arithmetic of its own; it loads pw_4096 as the final
;   multiplier and reuses iadst_4x8_internal_8bpc.end2 (multiply, clear
;   coefficients, write out).
796*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
797*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*0], q3120
798*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*1], q3120
799*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2896x8)]
800*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_1697x8)]
801*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0
802*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0
    ; The pw_2896x8 scaling is applied between the two interleave rounds;
    ; being element-wise, it does not depend on the word order.
803*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
804*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
805*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
806*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
    ; m2 = pmulhrsw(m0, c), m4 = pmulhrsw(m1, c); m4 is consumed as the
    ; constant and overwritten with the second product.
807*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, m0
808*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1
809*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
810*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4
811*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
812*c0909341SAndroid Build Coastguard Worker.pass2:
813*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_4096)]
814*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x8_internal_8bpc).end2
815*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_4X16_FN type1, type2
;   Expands INV_TXFM_FN (defined outside this view) for the 4x16 block size.
;   For the dct_dct combination only, it additionally emits a shortcut that
;   derives a single value from the first coefficient, broadcasts it to every
;   word of m0-m3 and jumps to iadst_4x16_internal_8bpc.end3 (defined outside
;   this view).
;   NOTE(review): presumably INV_TXFM_FN arranges for this code to run only
;   when just the DC coefficient is present -- confirm in INV_TXFM_FN.
816*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_4X16_FN 2 ; type1, type2
817*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 4x16
818*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
819*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
    ; xm0 = pmulhrsw(pw_2896x8, [cq]); only the lowest word is used later
    ; (vpbroadcastw below).
820*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
821*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
822*c0909341SAndroid Build Coastguard Worker    movd                xm3, [o(pw_2048)]
    ; Overwrite the first 4 bytes of the coefficient buffer with eobd.
    ; NOTE(review): presumably eob is 0 on this path, so this clears the
    ; consumed coefficient -- confirm against INV_TXFM_FN.
823*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; Successive pmulhrsw by pw_16384, pw_2896x8 and pw_2048.
824*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
825*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
826*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3
827*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
828*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
829*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
830*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0
831*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end3
832*c0909341SAndroid Build Coastguard Worker%endif
833*c0909341SAndroid Build Coastguard Worker%endmacro
834*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; IDCT16_1D_PACKED
; One 1-D 16-point inverse DCT on packed 16-bit data, built from nested
; dct4/dct8/dct16 stages (see the per-line stage comments).
; In:    m0-m7  packed inputs; the comments on the punpck lines name each
;               operand pair by its input index within the stage using it
; Out:   m0 = out0  out1     m1 = out3  out2
;        m2 = out4  out5     m3 = out7  out6
;        m4 = out8  out9     m5 = out11 out10
;        m6 = out12 out13    m7 = out15 out14
; Clobb: m8, m9, m10
; m10 is loaded with pd_2048 and passed to every ITX_MUL2X_PACK call
; (presumably as the rounding term; ITX_MUL2X_PACK is defined outside this
; chunk). .main2 is a secondary entry that skips that load, so code jumping
; there must already have m10 set up.
; The mmsize test lets the macro be expanded for both 128-bit and wider
; registers.
;-----------------------------------------------------------------------------
835*c0909341SAndroid Build Coastguard Worker%macro IDCT16_1D_PACKED 0
836*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
837*c0909341SAndroid Build Coastguard Worker.main2:
    ; pair up the inputs so that each ITX_MUL2X_PACK below sees both operands
838*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m7, m0 ; dct16 in15 in1
839*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m4, m0 ; dct4  in2  in0
840*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m3, m4 ; dct16 in7  in9
841*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m1     ; dct8  in7  in1
842*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6     ; dct16 in3  in13
843*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5     ; dct8  in3  in5
844*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m2     ; dct16 in11 in5
845*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2     ; dct4  in3  in1
    ; first rotation stage; m2 and m4 are scratch here
846*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 2, 4, 10,  401, 4076, 3 ; t8a  t15a
847*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 2, 4, 10, 3166, 2598, 3 ; t9a  t14a
848*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
849*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
850*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 2, 4, 10,  799, 4017, 3 ; t4a  t7a
851*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 2, 4, 10, 3406, 2276, 3 ; t5a  t6a
852*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 2, 4, 10, 1567, 3784    ; t3   t2
853*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m8, m0 ; t9  t14
854*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m0     ; t8  t15
855*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m1, m5 ; t10 t13
856*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5     ; t11 t12
    ; the "reuse" notes below refer to a coefficient vector that is still live
    ; in m4 / m5 from the preceding ITX_MUL2X_PACK call
857*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m3784_1567)]  ; reuse pw_1567_3784
858*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, _, 10, 4, 5, 6   ; t9a  t14a
859*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
860*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 5, _, 10, 5, 4, 6   ; t10a t13a
861*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m8, m1 ; t11a t12a
862*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m1     ; t8a  t15a
863*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m7, m3 ; t5a  t6a
864*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m3     ; t4   t7
865*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2, m0 ; t9   t14
866*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0     ; t10  t13
    ; load the deint_shuf byte-shuffle mask at the width matching mmsize
867*c0909341SAndroid Build Coastguard Worker%if mmsize > 16
868*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [o(deint_shuf)]
869*c0909341SAndroid Build Coastguard Worker%else
870*c0909341SAndroid Build Coastguard Worker    mova                 m0, [o(deint_shuf)]
871*c0909341SAndroid Build Coastguard Worker%endif
872*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m0
873*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m0
874*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m0
    ; m0 is free again after the shuffles and is reused for coefficients
875*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        9, 0, 5, 10, 2896, 2896 ; t0   t1
876*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_m2896_2896)]
877*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 5, _, 10, 5, 0, 4    ; t11  t12
878*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896_2896)]
879*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
880*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(pw_m2896_2896)]
881*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
    ; regroup the packed halves so the final butterflies line up
882*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m8, m3        ; t15a t14
883*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m3            ; t8a  t9
884*c0909341SAndroid Build Coastguard Worker    shufps               m5, m4, m2, q1032 ; t12  t13a
885*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m2, 0xcc      ; t11  t10a
886*c0909341SAndroid Build Coastguard Worker    shufps               m2, m7, m1, q1032 ; t7 t6
887*c0909341SAndroid Build Coastguard Worker    vpblendd             m7, m1, 0xcc      ; t4 t5
    ; final saturating add/sub butterflies: dct4 -> dct8 -> dct16 outputs
888*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m9, m6 ; dct4 out3 out2
889*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m6     ; dct4 out0 out1
890*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m9, m2 ; dct8 out7 out6
891*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m2     ; dct8 out0 out1
892*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m7 ; dct8 out4 out5
893*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m7     ; dct8 out3 out2
894*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m9, m0 ; out15 out14
895*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m9     ; out0  out1
896*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m5 ; out12 out13
897*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5     ; out3  out2
898*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m4 ; out11 out10
899*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m4     ; out4  out5
900*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m3, m8 ; out8  out9
901*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8     ; out7  out6
902*c0909341SAndroid Build Coastguard Worker%endmacro
903*c0909341SAndroid Build Coastguard Worker
; Entry points for every 4x16 transform pair whose first type is dct.
904*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, dct
905*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, adst
906*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, flipadst
907*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN dct, identity
908*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; idct_4x16_internal_8bpc
; Named operands (from the cglobal line): dst, stride, c, eob, tx2.
; Entry (first pass): loads 4x32 bytes of coefficients from cq into m0-m3, runs
;   idct_16x4_internal_8bpc.main (outside this chunk), scales by pw_16384 via
;   pmulhrsw, rearranges with word/dword interleaves and jumps to the address
;   held in tx2q. Result is in m0-m3.
; .pass2: runs the 16-point DCT in .main on eight 128-bit registers, merges the
;   results back into m0-m3, loads pw_2048 into m5 and tail-jumps to
;   iadst_4x16_internal_8bpc.end2 for the final scaling, coefficient clear and
;   store.
; .main: IDCT16_1D_PACKED expanded through WRAP_XMM (defined outside this
;   chunk; presumably it maps the m registers to xmm registers for the
;   expansion). Exported via cglobal_label.
;-----------------------------------------------------------------------------
909*c0909341SAndroid Build Coastguard Workercglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
910*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32*0]
911*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*1]
912*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*2]
913*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*3]
914*c0909341SAndroid Build Coastguard Worker    call m(idct_16x4_internal_8bpc).main
915*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
    ; word interleave, intermediate scaling, then dword interleave
916*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m3
917*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
918*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
919*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
920*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m4, m2, m3
921*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
922*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
923*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
924*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
925*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
926*c0909341SAndroid Build Coastguard Worker.pass2:
    ; move the upper 128-bit lanes of m0-m3 into xm4-xm7 so .main sees eight
    ; 128-bit inputs in registers 0-7
927*c0909341SAndroid Build Coastguard Worker    vextracti128        xm4, m0, 1
928*c0909341SAndroid Build Coastguard Worker    vextracti128        xm5, m1, 1
929*c0909341SAndroid Build Coastguard Worker    vextracti128        xm6, m2, 1
930*c0909341SAndroid Build Coastguard Worker    vextracti128        xm7, m3, 1
931*c0909341SAndroid Build Coastguard Worker    call .main
    ; fold outputs 4-7 back into the upper lanes of m0-m3
932*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm4, 1
933*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm5, 1
934*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
935*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm6, 1
936*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm7, 1
    ; m1 and m3 hold their row pairs in reverse order (out3 out2 / out7 out6
    ; per the macro's output comments); swap the qwords in each lane
937*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
938*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q1032
939*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end2
940*c0909341SAndroid Build Coastguard WorkerALIGN function_align
941*c0909341SAndroid Build Coastguard Workercglobal_label .main
942*c0909341SAndroid Build Coastguard Worker    WRAP_XMM IDCT16_1D_PACKED
943*c0909341SAndroid Build Coastguard Worker    ret
944*c0909341SAndroid Build Coastguard Worker
; Entry points for every 4x16 transform pair whose first type is adst.
945*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, dct
946*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, adst
947*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, flipadst
948*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN adst, identity
949*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; iadst_4x16_internal_8bpc
; Named operands (from the cglobal line): dst, stride, c, eob, tx2.
; Entry (first pass): loads 4x32 bytes of coefficients from cq, runs
;   iadst_16x4_internal_8bpc.main (outside this chunk), scales by pw_16384,
;   rearranges with word/dword interleaves and jumps to the address in tx2q.
; .pass2: 16-point ADST via .main, then finishes out4-out11, reorders and falls
;   into the shared tail below.
; Shared tail labels (also jumped to by other 4x16 routines in this file):
;   .end   merges the multipliers in m5 and m6 into one vector
;   .end2  scales m0-m3 by m5, zeroes the 4x32-byte coefficient buffer at cq
;   .end3  adds m0-m3 to the 16 rows at dst through WRITE_4X8 (defined
;          outside this chunk) and returns
; Helpers:
;   .main            shuffles m0-m3 into the .main2 input layout, falls through
;   .main2           ADST core; leaves m7 = 0 and m8 = pd_2048 on return
;   .main_pass1_end  finishes out4-out11 with pmaddwd at 32-bit precision
;                    (not called from within this chunk)
;-----------------------------------------------------------------------------
950*c0909341SAndroid Build Coastguard Workercglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
951*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32*0]
952*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*1]
953*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*2]
954*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*3]
955*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
956*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
957*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m3
958*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
959*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
960*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
961*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m4, m2, m3, m0
962*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
963*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
964*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
965*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
966*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
967*c0909341SAndroid Build Coastguard Worker.pass2:
968*c0909341SAndroid Build Coastguard Worker    call .main
    ; .main returned intermediates in m2/m4; their sum and difference scaled
    ; by pw_2896x8 give the middle outputs
969*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
970*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m4
971*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4
972*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
973*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
974*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
    ; reorder the partially negated outputs into row order
975*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
976*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m1, m0, 0x33
977*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m2, 0x33
978*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x33
979*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m1, 0x33
980*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q2031
981*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m2, q1302
982*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m3, q3120
983*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m4, q0213
    ; m7 is still zero from .main2, so m6 = -m5 (negated pw_2048)
984*c0909341SAndroid Build Coastguard Worker    psubw                m6, m7, m5
985*c0909341SAndroid Build Coastguard Worker.end:
    ; dwords 2,3,6,7 come from m6, the rest stay from m5; the mixed-sign
    ; multiplier is presumably what undoes the negated outputs noted above
986*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m6, 0xcc
987*c0909341SAndroid Build Coastguard Worker.end2:
988*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
    ; WIN64_RESTORE_XMM comes from outside this chunk (presumably restores the
    ; callee-saved xmm registers on Win64)
989*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
    ; zero the coefficient buffer
990*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
991*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m4
992*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m4
993*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m4
994*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m4
995*c0909341SAndroid Build Coastguard Worker.end3:
    ; r2 = dst + 8*stride, r3 = 3*stride.
    ; NOTE(review): with c as the third named operand, r2 should be the
    ; register behind cq under x86inc naming, so cq is dead from here - confirm.
996*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+strideq*8]
997*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
998*c0909341SAndroid Build Coastguard Worker    WRITE_4X8             0, 1
    ; advance both row pointers by four rows for the second half
999*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1000*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r2  +strideq*4]
1001*c0909341SAndroid Build Coastguard Worker    WRITE_4X8             2, 3
1002*c0909341SAndroid Build Coastguard Worker    RET
1003*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1004*c0909341SAndroid Build Coastguard Worker.main:
    ; In: m0-m3 as left by the first pass. Shuffles them into the layout named
    ; in the comments below, then falls through into .main2.
1005*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m1, m0, 0xcc
1006*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m0, 0x33
1007*c0909341SAndroid Build Coastguard Worker    vpblendd             m5, m2, m3, 0xcc
1008*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x33
1009*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m5, m2, 0x31
1010*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
1011*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m1, m4, 0x31
1012*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m5, xm2, 1 ; in4  in7  in6  in5
1013*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q1032  ; in15 in12 in13 in14
1014*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m4, q1032  ; in11 in8  in9  in10
1015*c0909341SAndroid Build Coastguard Workercglobal_label .main2
    ; In:  m0-m3 in the layout produced above
    ; Out: m0, m3 = final outer outputs (signs as commented at the end);
    ;      m2, m4 = intermediates the caller still has to combine;
    ;      m7 = 0 and m8 = pd_2048 remain set for the caller
    ; Clobbers m1, m5, m6
1016*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pd_2048)]
1017*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
1018*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m0 ; in12 in3  in14 in1
1019*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3     ; in0  in15 in2  in13
1020*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m1 ; in8  in7  in10 in5
1021*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2     ; in4  in11 in6  in9
    ; ITX_MUL4X_PACK / ITX_MUL2X_PACK are defined outside this chunk
1022*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        0, 2, 5, 6, 8,  201, 4091,  995, 3973, 3
1023*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
1024*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
1025*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        4, 2, 5, 6, 8, 3857, 1380, 4052,  601, 3
1026*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m0, m3 ; t9a  t8a  t11a t10a
1027*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3     ; t1a  t0a  t3a  t2a
1028*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m1, m4 ; t13a t12a t15a t14a
1029*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4     ; t5a  t4a  t7a  t6a
1030*c0909341SAndroid Build Coastguard Worker    ITX_MUL4X_PACK        2, 4, 5, 6, 8,  799, 4017, 3406, 2276, 3
    ; m6 = 0 - m5 (m5 as left by the ITX_MUL4X_PACK call above)
1031*c0909341SAndroid Build Coastguard Worker    psubw                m6, m7, m5
1032*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 5, _, 8, 6, 4, 6
1033*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_m3784_1567)]
1034*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_1567_3784)]
1035*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m1 ; t5   t4   t7   t6
1036*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; t1   t0   t3   t2
1037*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m3 ; t13a t12a t15a t14a
1038*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m3     ; t9a  t8a  t11a t10a
    ; build a coefficient vector whose upper 128-bit lane is negated
1039*c0909341SAndroid Build Coastguard Worker    psubw                m3, m7, m6 ; pw_3784_m1567
1040*c0909341SAndroid Build Coastguard Worker    vpblendd             m6, m3, 0xf0
1041*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
1042*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
1043*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m5, [o(deint_shuf)]
1044*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m5
1045*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5
1046*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
1047*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
1048*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
1049*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
1050*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
1051*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m0, m3        ; t3a t2a t11 t10
1052*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3     ; -out15  out0   out14 -out1
1053*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
1054*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2            ; t6 t7 t14a t15a
1055*c0909341SAndroid Build Coastguard Worker    shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
1056*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
1057*c0909341SAndroid Build Coastguard Worker    ret
1058*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1059*c0909341SAndroid Build Coastguard Worker.main_pass1_end:
    ; In:  m2, m4 = intermediates from .main2; m8 = pd_2048 (set by .main2)
    ; Out: m1, m2 = middle outputs as commented on the packssdw lines
    ; Same job as the pw_2896x8 pmulhrsw pair in .pass2, but done with pmaddwd
    ; against +/-2896 pairs, adding m8 and shifting right by 12 before packing.
1060*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_m2896_2896)]
1061*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_2896_2896)]
1062*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m2
1063*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2
1064*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5, m4
1065*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6
1066*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1
1067*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6
1068*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m8}, m5, m1, m2, m4
1069*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m5, m2, m1, m4
1070*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m5     ; -out11  out8   out10 -out9
1071*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m4     ; -out7   out4   out6  -out5
1072*c0909341SAndroid Build Coastguard Worker    ret
1073*c0909341SAndroid Build Coastguard Worker
; Entry points for every 4x16 transform pair whose first type is flipadst.
1074*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, dct
1075*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, adst
1076*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, flipadst
1077*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN flipadst, identity
1078*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; iflipadst_4x16_internal_8bpc
; Named operands (from the cglobal line): dst, stride, c, eob, tx2.
; Same arithmetic as iadst_4x16_internal_8bpc; only the shuffling differs.
; Entry (first pass): loads 4x32 bytes from cq, runs
;   iadst_16x4_internal_8bpc.main (outside this chunk), then interleaves with
;   the operand order reversed relative to the adst routine, scales by
;   pw_16384 and jumps to the address held in tx2q.
; .pass2: calls iadst_4x16_internal_8bpc.main, finishes out4-out11, applies a
;   different blend/permute sequence than the adst routine and tail-jumps to
;   iadst_4x16_internal_8bpc.end. Here m6 = pw_2048 and m5 = its negation,
;   i.e. the roles are swapped relative to the adst .pass2.
;-----------------------------------------------------------------------------
1079*c0909341SAndroid Build Coastguard Workercglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1080*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32*0]
1081*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*1]
1082*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*2]
1083*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*3]
1084*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
1085*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
    ; interleave with sources swapped (m1,m0 / m3,m2) compared to the adst path
1086*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m1, m0
1087*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0
1088*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3, m2
1089*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1090*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m5}, m4, m1, m0, m3
1091*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m1
1092*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m1
1093*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m4
1094*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m4
1095*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1096*c0909341SAndroid Build Coastguard Worker.pass2:
1097*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main
    ; combine the m2/m4 intermediates returned by .main into the middle outputs
1098*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2896x8)]
1099*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m4
1100*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m4
1101*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5     ; -out7   out4   out6  -out5
1102*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5     ;  out8  -out11 -out9   out10
1103*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_2048)]
1104*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1032
1105*c0909341SAndroid Build Coastguard Worker    vpblendd             m4, m0, m2, 0x33
1106*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m1, 0xcc
1107*c0909341SAndroid Build Coastguard Worker    vpblendd             m1, m3, 0xcc
1108*c0909341SAndroid Build Coastguard Worker    vpblendd             m2, m3, 0x33
1109*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1110*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q0213
1111*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q2031
1112*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m4, q1302
    ; m7 is zero on return from .main (cleared in .main2), so m5 = -m6
1113*c0909341SAndroid Build Coastguard Worker    psubw                m5, m7, m6
1114*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end
1115*c0909341SAndroid Build Coastguard Worker
; Entry points for every 4x16 transform pair whose first type is identity.
1116*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, dct
1117*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, adst
1118*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, flipadst
1119*c0909341SAndroid Build Coastguard WorkerINV_TXFM_4X16_FN identity, identity
1120*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; iidentity_4x16_internal_8bpc
; Named operands (from the cglobal line): dst, stride, c, eob, tx2.
; Entry (first pass): loads 4x32 bytes from cq, interleaves words, then forms
;   pavgw(x, pmulhrsw(x, pw_1697x8)) per word, with inputs equal to -1 forced
;   to 0 first (see the inline explanation). Finishes with a dword interleave
;   and jumps to the address held in tx2q. Result is in m0-m3.
; .pass2: out = 2*x + pmulhrsw(x, pw_1697x16) using saturating adds, loads
;   pw_2048 into m5 and tail-jumps to iadst_4x16_internal_8bpc.end2 for the
;   final scaling, coefficient clear and store.
;-----------------------------------------------------------------------------
1121*c0909341SAndroid Build Coastguard Workercglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1122*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*0]
1123*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*1]
1124*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*2]
1125*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*3]
1126*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_1697x8)]
1127*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m0, m0 ; -1
1128*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1129*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1130*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m5
1131*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5
    ; m5-m8 = scaled copies; taken before the -1 fix-up below alters m1-m4
1132*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m8, m1
1133*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m8, m2
1134*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m8, m3
1135*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m4
1136*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m9, m0, m1 ; we want to do a signed avg, but pavgw is
1137*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m9     ; unsigned. as long as both signs are equal
1138*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m9, m0, m2 ; it still works, but if the input is -1 the
1139*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m9     ; pmulhrsw result will become 0 which causes
1140*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m9, m0, m3 ; pavgw to output -32768 instead of 0 unless
1141*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m9     ; we explicitly deal with that case here.
    ; last compare can overwrite m0 itself: the -1 constant is no longer needed
1142*c0909341SAndroid Build Coastguard Worker    pcmpeqw              m0, m4
1143*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m0
1144*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m5
1145*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m6
1146*c0909341SAndroid Build Coastguard Worker    pavgw                m3, m7
1147*c0909341SAndroid Build Coastguard Worker    pavgw                m4, m8
1148*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1149*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1150*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1151*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1152*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1153*c0909341SAndroid Build Coastguard Worker.pass2:
1154*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_1697x16)]
1155*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
    ; scaled terms are computed from the undoubled inputs
1156*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8, m0
1157*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m8, m1
1158*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m8, m2
1159*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m3
    ; x = 2*x (saturating), then add the scaled term
1160*c0909341SAndroid Build Coastguard Worker    REPX      {paddsw x, x}, m0, m1, m2, m3
1161*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
1162*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m6
1163*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m7
1164*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8
1165*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_4x16_internal_8bpc).end2
1166*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; WRITE_8X4 coef1, coef2, tmp1, tmp2 [, off1, off2, off3]
; Adds two vectors of 16-bit residuals to four 8-pixel rows at dstq and stores
; the result back as unsigned-saturated bytes.
;   %1, %2  coefficient sources: a bare number is used as register m<N>, any
;           other token is used verbatim as the paddw source operand
;   %3, %4  register numbers used as temporaries (clobbered)
;   %5-%7   offsets of rows 1-3 from dstq; default strideq*1, strideq*2, r3
; Row layout: %1 carries the rows at +0 (low lane) and +%5 (high lane), %2 the
; rows at +%6 (low lane) and +%7 (high lane).
;-----------------------------------------------------------------------------
1167*c0909341SAndroid Build Coastguard Worker%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
    ; gather two 8-byte rows per xmm register, then widen bytes to words
1168*c0909341SAndroid Build Coastguard Worker    movq               xm%3, [dstq   ]
1169*c0909341SAndroid Build Coastguard Worker    movhps             xm%3, [dstq+%5]
1170*c0909341SAndroid Build Coastguard Worker    movq               xm%4, [dstq+%6]
1171*c0909341SAndroid Build Coastguard Worker    movhps             xm%4, [dstq+%7]
1172*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%3, xm%3
1173*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, xm%4
1174*c0909341SAndroid Build Coastguard Worker%ifnum %1
1175*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%1
1176*c0909341SAndroid Build Coastguard Worker%else
1177*c0909341SAndroid Build Coastguard Worker    paddw               m%3, %1
1178*c0909341SAndroid Build Coastguard Worker%endif
1179*c0909341SAndroid Build Coastguard Worker%ifnum %2
1180*c0909341SAndroid Build Coastguard Worker    paddw               m%4, m%2
1181*c0909341SAndroid Build Coastguard Worker%else
1182*c0909341SAndroid Build Coastguard Worker    paddw               m%4, %2
1183*c0909341SAndroid Build Coastguard Worker%endif
    ; packuswb packs within each 128-bit lane: the low lane ends up with rows
    ; +0 and +%6, the high lane with rows +%5 and +%7 - hence the store order
1184*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%4
1185*c0909341SAndroid Build Coastguard Worker    vextracti128       xm%4, m%3, 1
1186*c0909341SAndroid Build Coastguard Worker    movq          [dstq   ], xm%3
1187*c0909341SAndroid Build Coastguard Worker    movhps        [dstq+%6], xm%3
1188*c0909341SAndroid Build Coastguard Worker    movq          [dstq+%5], xm%4
1189*c0909341SAndroid Build Coastguard Worker    movhps        [dstq+%7], xm%4
1190*c0909341SAndroid Build Coastguard Worker%endmacro
1191*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------------
; INV_TXFM_8X4_FN type1, type2
; Emits the 8x4 entry point for the given transform pair by expanding
; INV_TXFM_FN (defined outside this chunk). For the dct_dct pair only, it also
; emits a shortcut that multiplies the first coefficient at [cq] by pw_2896x8
; twice (pmulhrsw), stores eobd to [cq] and jumps to the .dconly2 label of
; inv_txfm_add_dct_dct_8x8_8bpc (outside this chunk) with the value in xm0.
; NOTE(review): presumably the DC-only path, with eob == 0 so that the store
;   clears the coefficient - confirm against INV_TXFM_FN.
;-----------------------------------------------------------------------------
1192*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X4_FN 2 ; type1, type2
1193*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x4
1194*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
1195*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
1196*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
    ; overwrite the coefficient only after it has been read into xm0 above
1197*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
1198*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
1199*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
1200*c0909341SAndroid Build Coastguard Worker%endif
1201*c0909341SAndroid Build Coastguard Worker%endmacro
1202*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is dct.
1203*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, dct
1204*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, adst
1205*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, flipadst
1206*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN dct, identity
1207*c0909341SAndroid Build Coastguard Worker
; idct_8x4_internal_8bpc
;
; First pass (function entry): multiplies the four 16-byte coefficient groups
; at cq+16*0..3 by pw_2896x8, runs the 4x8 IDCT .main routine on xm0-xm3,
; repacks the results into m0/m1 and jumps to the second-pass routine in tx2q.
; Second pass (.pass2): applies IDCT4_1D_PACKED to m0/m1, reorders the qwords
; and finishes in the shared tail iadst_8x4_internal_8bpc.end2.
; NOTE(review): tx2q is not assigned anywhere in this section; presumably
; INV_TXFM_FN points it at the .pass2 label of the second transform type.
1208*c0909341SAndroid Build Coastguard Workercglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    ; xm3 holds the multiplier until the last load, which overwrites it.
1209*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm3, [o(pw_2896x8)]
1210*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm3, [cq+16*0]
1211*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm1, xm3, [cq+16*1]
1212*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm3, [cq+16*2]
1213*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3,      [cq+16*3]
1214*c0909341SAndroid Build Coastguard Worker    call m(idct_4x8_internal_8bpc).main
    ; Merge the four xmm results into two ymm registers (m3 = xm1:xm3,
    ; m1 = xm0:xm2, low:high lane), then deinterleave with shufps + deint_shuf.
1215*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m4, [o(deint_shuf)]
1216*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m1, xm3, 1
1217*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m0, xm2, 1
1218*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, m3, q0220
1219*c0909341SAndroid Build Coastguard Worker    shufps               m1, m3, q1331
1220*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1221*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1222*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1223*c0909341SAndroid Build Coastguard Worker.pass2:
1224*c0909341SAndroid Build Coastguard Worker    IDCT4_1D_PACKED
    ; Put the qwords in the order the shared tail expects; .end2 applies the
    ; pw_2048 multiply, clears the coefficients and writes the 8x4 block.
1225*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1226*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q2031
1227*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end2
1228*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is adst.
1229*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, dct
1230*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, adst
1231*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, flipadst
1232*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN adst, identity
1233*c0909341SAndroid Build Coastguard Worker
; iadst_8x4_internal_8bpc
;
; First pass (function entry): loads the four 16-byte coefficient groups
; (groups 0 and 1 with their 64-bit halves swapped), multiplies them by
; pw_2896x8, runs the 4x8 ADST .main_pass1 routine, then interleaves the
; results into m0/m1 with one half negated, and jumps to tx2q.
; Second pass (.pass2): calls .main (IADST4_1D_PACKED) and falls through the
; shared tail below.
;
; Shared tail labels, also entered from the other 8x4 routines:
;   .end  - vpermq q3120 on m0/m1, then .end2
;   .end2 - pmulhrsw m0/m1 by pw_2048, WIN64_RESTORE_XMM, then .end3
;   .end3 - zero 64 bytes at cq and write m0/m1 to dst through WRITE_8X4
; NOTE(review): tx2q is not assigned in this section; presumably set by
; INV_TXFM_FN.
1234*c0909341SAndroid Build Coastguard Workercglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    ; xm0 is only the multiplier here; inputs go to xm2-xm5 for .main_pass1.
1235*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [o(pw_2896x8)]
1236*c0909341SAndroid Build Coastguard Worker    pshufd              xm4,      [cq+16*0], q1032
1237*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm0, [cq+16*3]
1238*c0909341SAndroid Build Coastguard Worker    pshufd              xm5,      [cq+16*1], q1032
1239*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm0, [cq+16*2]
1240*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm0
1241*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm0
1242*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass1
    ; Combine xm0:xm2 and xm1:xm3 into ymm registers and interleave words.
1243*c0909341SAndroid Build Coastguard Worker    vinserti128        m0, xm2, 1
1244*c0909341SAndroid Build Coastguard Worker    vinserti128        m1, xm3, 1
1245*c0909341SAndroid Build Coastguard Worker    punpckhwd          m2, m0, m1
1246*c0909341SAndroid Build Coastguard Worker    punpcklwd          m0, m1
    ; m3 = 0 - m2 with signed saturation: the high-half interleave is negated
    ; before the second interleave.
1247*c0909341SAndroid Build Coastguard Worker    pxor               m3, m3
1248*c0909341SAndroid Build Coastguard Worker    psubsw             m3, m2
1249*c0909341SAndroid Build Coastguard Worker    punpckhwd          m1, m0, m3
1250*c0909341SAndroid Build Coastguard Worker    punpcklwd          m0, m3
1251*c0909341SAndroid Build Coastguard Worker    jmp              tx2q
1252*c0909341SAndroid Build Coastguard Worker.pass2:
1253*c0909341SAndroid Build Coastguard Worker    call .main
1254*c0909341SAndroid Build Coastguard Worker.end:
1255*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1256*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q3120
1257*c0909341SAndroid Build Coastguard Worker.end2:
    ; pmulhrsw by 2048 computes (x*2048 + 0x4000) >> 15, i.e. (x + 8) >> 4
    ; (assuming pw_2048 holds words of 2048, as its name says).
1258*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_2048)]
1259*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2
1260*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
1261*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
1262*c0909341SAndroid Build Coastguard Worker.end3:
    ; Clear the 8x4 coefficient buffer (2 x 32 bytes) before writing pixels.
1263*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
1264*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m2
1265*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m2
    ; r3 = 3*stride is the default third row offset used by WRITE_8X4 here;
    ; m4/m5 are its scratch registers.
1266*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1267*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 1, 4, 5
1268*c0909341SAndroid Build Coastguard Worker    RET
1269*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: one packed 4-point ADST on m0/m1; called from this function's .pass2
; and from iflipadst_8x4_internal_8bpc.pass2.
1270*c0909341SAndroid Build Coastguard Workercglobal_label .main
1271*c0909341SAndroid Build Coastguard Worker    IADST4_1D_PACKED
1272*c0909341SAndroid Build Coastguard Worker    ret
1273*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is flipadst.
1274*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, dct
1275*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, adst
1276*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, flipadst
1277*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN flipadst, identity
1278*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x4_internal_8bpc
;
; First pass (function entry): same loads, scaling and .main_pass1 call as
; iadst_8x4_internal_8bpc, but the four results are combined in reversed
; register order (xm3:xm1 and xm2:xm0) and the negated interleave half ends
; up as the first operand of the final interleaves instead of the second.
; Second pass (.pass2): reuses iadst_8x4's .main, then swaps m0 and m1 while
; permuting their qwords, and finishes in iadst_8x4_internal_8bpc.end2.
1279*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1280*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm0, [o(pw_2896x8)]
1281*c0909341SAndroid Build Coastguard Worker    pshufd              xm4,      [cq+16*0], q1032
1282*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm3, xm0, [cq+16*3]
1283*c0909341SAndroid Build Coastguard Worker    pshufd              xm5,      [cq+16*1], q1032
1284*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm2, xm0, [cq+16*2]
1285*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm4, xm0
1286*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm5, xm0
1287*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x8_internal_8bpc).main_pass1
    ; Reversed pairing relative to iadst_8x4: m3 = xm3:xm1, m2 = xm2:xm0.
1288*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm1, 1
1289*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm0, 1
1290*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3, m2
1291*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
    ; m0 = 0 - m1 with signed saturation.
1292*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
1293*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m1
1294*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0, m3
1295*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
1296*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1297*c0909341SAndroid Build Coastguard Worker.pass2:
1298*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x4_internal_8bpc).main
    ; Exchange m0 <-> m1 (via m2) while applying the q2031 qword permutation.
1299*c0909341SAndroid Build Coastguard Worker    mova                 m2, m1
1300*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m0, q2031
1301*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m2, q2031
1302*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end2
1303*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x4 entry points whose first transform type is identity.
1304*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, dct
1305*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, adst
1306*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, flipadst
1307*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X4_FN identity, identity
1308*c0909341SAndroid Build Coastguard Worker
; iidentity_8x4_internal_8bpc
;
; First pass (function entry): loads the four 16-byte coefficient groups,
; transposes them with two rounds of word interleaves, multiplies by
; pw_2896x8 in between, doubles the result with saturating adds and jumps to
; tx2q.
; Second pass (.pass2): computes x + pmulhrsw(x, pw_1697x8) for m0/m1 and
; finishes in iadst_8x4_internal_8bpc.end.
; NOTE(review): pw_1697x8 is defined elsewhere; if it is 1697*8, the second
; pass scales by roughly 1.414 -- confirm against the constant table.
1309*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
    ; m2 = groups 0 (low lane) and 2 (high lane); m0 = groups 1 and 3.
1310*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*0]
1311*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*1]
1312*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*2], 1
1313*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*3], 1
1314*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2896x8)]
1315*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0
1316*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0
1317*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
1318*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1319*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1320*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
    ; x2 with signed saturation.
1321*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m0
1322*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m1
1323*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1324*c0909341SAndroid Build Coastguard Worker.pass2:
    ; m3 doubles as the multiplier and as the destination of the second
    ; product, so the m0 product must be formed first.
1325*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_1697x8)]
1326*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m0
1327*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
1328*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2
1329*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3
1330*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x4_internal_8bpc).end
1331*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X8_FN type1, type2
;
; Expands INV_TXFM_FN for the 8x8 size with the given transform-type pair.
; The dct_dct expansion also hosts the DC-only tail that the 8x4 and 8x16
; macros jump into:
;   .dconly      - expects xm0 = DC term, xm1 = pw_2896x8, xm2 = pw_16384 and
;                  r3d = row counter; multiplies by xm2 and continues below
;                  (jumped to by the 8x16 macro)
;   .dconly2     - multiplies xm0 by xm1 and by pw_2048, then broadcasts word 0
;                  to all of m0 (jumped to by the 8x4 macro)
;   .dconly_loop - one WRITE_8X4 per iteration with m0 as both coefficient
;                  operands; repeats while r3d stays positive after
;                  subtracting 4
; NOTE(review): the entry label, the condition for reaching this code and the
; incoming eobd/r3d value come from INV_TXFM_FN (outside this section).
; Presumably eobd aliases r3d and is 0 here, so "or r3d, 8" yields a row
; count of 8 -- confirm.
1332*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X8_FN 2 ; type1, type2
1333*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x8
1334*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; movd zeroes everything above the low dword, so only words 0-1 of the
    ; products can be nonzero; word 0 is what gets broadcast below.
1335*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
1336*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
1337*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
1338*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
1339*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
1340*c0909341SAndroid Build Coastguard Worker.dconly:
1341*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
1342*c0909341SAndroid Build Coastguard Worker.dconly2:
    ; NOTE(review): this load does not go through o(), unlike the other
    ; constant loads in this section.
1343*c0909341SAndroid Build Coastguard Worker    movd                xm2, [pw_2048]
1344*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
    ; r2 becomes the 3*stride row offset passed to WRITE_8X4 below.
    ; NOTE(review): r2 presumably aliases cq, which is not read again on
    ; this path.
1345*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
1346*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
1347*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
1348*c0909341SAndroid Build Coastguard Worker.dconly_loop:
    ; m1/m2 are scratch; explicit row offsets are given because r2, not the
    ; default r3, holds 3*stride here (r3d is the loop counter).
1349*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 0, 1, 2, strideq*1, strideq*2, r2
1350*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1351*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 4
1352*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
1353*c0909341SAndroid Build Coastguard Worker    RET
1354*c0909341SAndroid Build Coastguard Worker%endif
1355*c0909341SAndroid Build Coastguard Worker%endmacro
1356*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first transform type is dct.
1357*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, dct
1358*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, adst
1359*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, flipadst
1360*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN dct, identity
1361*c0909341SAndroid Build Coastguard Worker
; idct_8x8_internal_8bpc
;
; First pass (function entry): loads the four 32-byte coefficient vectors with
; a q3120 qword permutation, runs .main (IDCT8_1D_PACKED), deinterleaves the
; outputs, multiplies them by pw_16384, regroups the 128-bit lanes into
; m0-m3 and jumps to tx2q.
; Second pass (.pass2): runs .main again, loads the final multiplier pw_2048
; into m4, permutes the qwords and finishes in iadst_8x8_internal_8bpc.end2,
; which multiplies m0-m3 by m4 and writes the block.
; NOTE(review): tx2q is not assigned in this section; presumably set by
; INV_TXFM_FN.
1362*c0909341SAndroid Build Coastguard Workercglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1363*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120 ; 0 1
1364*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1365*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1366*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q3120 ; 2 3
1367*c0909341SAndroid Build Coastguard Worker    call .main
    ; Results are gathered in m4, m5, m1, m3; m0 and m2 are then free to hold
    ; the shuffle mask and the rounding multiplier.
1368*c0909341SAndroid Build Coastguard Worker    shufps               m4, m0, m1, q0220
1369*c0909341SAndroid Build Coastguard Worker    shufps               m5, m0, m1, q1331
1370*c0909341SAndroid Build Coastguard Worker    shufps               m1, m2, m3, q0220
1371*c0909341SAndroid Build Coastguard Worker    shufps               m3, m2, m3, q1331
1372*c0909341SAndroid Build Coastguard Worker    vbroadcasti128       m0, [o(deint_shuf)]
1373*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(pw_16384)]
1374*c0909341SAndroid Build Coastguard Worker    REPX   {pshufb   x, m0}, m4, m5, m1, m3
    ; pmulhrsw by 16384 computes (x*16384 + 0x4000) >> 15, i.e. (x + 1) >> 1
    ; (assuming pw_16384 holds words of 16384, as its name says).
1375*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m2}, m4, m5, m1, m3
    ; vinserti128 pairs the low lanes, vperm2i128 0x31 pairs the high lanes.
1376*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m4, xm1, 1
1377*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m4, m1, 0x31
1378*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m5, xm3, 1
1379*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m5, m3, 0x31
1380*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1381*c0909341SAndroid Build Coastguard Worker.pass2:
1382*c0909341SAndroid Build Coastguard Worker    call .main
    ; m4 is the multiplier consumed at .end2/.end3 of the shared tail.
1383*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
1384*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m0, q3120
1385*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m1, q2031
1386*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m2, q3120
1387*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m3, q2031
1388*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end2
1389*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: one packed 8-point IDCT on m0-m3; used by both passes above.
1390*c0909341SAndroid Build Coastguard Workercglobal_label .main
1391*c0909341SAndroid Build Coastguard Worker    IDCT8_1D_PACKED
1392*c0909341SAndroid Build Coastguard Worker    ret
1393*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first transform type is adst.
1394*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, dct
1395*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, adst
1396*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, flipadst
1397*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN adst, identity
1398*c0909341SAndroid Build Coastguard Worker
; iadst_8x8_internal_8bpc
;
; First pass (function entry): loads the four 32-byte coefficient vectors into
; m4, m5, m2, m3 (permuted as annotated), runs .main_pass1, then transposes
; with word interleaves while multiplying by +16384 / -16384 on alternating
; registers, regroups the 128-bit lanes into m0-m3 and jumps to tx2q.
; Second pass (.pass2): feeds m0/m1 with swapped 64-bit halves (m4/m5) to
; .main_pass2, builds a multiplier that is +2048 in the low lane and -2048 in
; the high lane, and falls through the shared tail.
;
; Shared tail labels, also entered from the other 8x8 routines with their own
; multiplier in m4:
;   .end  - vpermq q3120 on m0-m3, then .end2
;   .end2 - pmulhrsw m0/m1 by m4, then .end3
;   .end3 - pmulhrsw m2/m3 by m4, WIN64_RESTORE_XMM, then .end4
;   .end4 - zero 128 bytes at cq and write m0-m3 to dst as two 8x4 groups
; NOTE(review): tx2q is not assigned in this section; presumably set by
; INV_TXFM_FN.
1399*c0909341SAndroid Build Coastguard Workercglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1400*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q1302 ; 1 0
1401*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1402*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q1302 ; 3 2
1403*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1404*c0909341SAndroid Build Coastguard Worker    call .main_pass1
1405*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
1406*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m1
1407*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
1408*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m3
1409*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
    ; m3 = -m5, so the registers multiplied by m3 below are rounded and
    ; sign-flipped in a single pmulhrsw.
1410*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
1411*c0909341SAndroid Build Coastguard Worker    psubw                m3, m5 ; negate odd elements during rounding
1412*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
1413*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
1414*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
1415*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
1416*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m0
1417*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1418*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2
1419*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2
    ; vperm2i128 0x31 pairs the high lanes, vinserti128 pairs the low lanes.
1420*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m3, m0, 0x31
1421*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m3, xm0, 1
1422*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m4, m1, 0x31
1423*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m4, xm1, 1
1424*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1425*c0909341SAndroid Build Coastguard Worker.pass2:
1426*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
1427*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032
1428*c0909341SAndroid Build Coastguard Worker    call .main_pass2
    ; The xmm-sized broadcast leaves the high lane of m4 zero, so the
    ; subtraction produces different signs in the two lanes.
1429*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_2048)]
1430*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm4, [o(pw_4096)]
1431*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; lower half = 2048, upper half = -2048
1432*c0909341SAndroid Build Coastguard Worker.end:
1433*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
1434*c0909341SAndroid Build Coastguard Worker.end2:
1435*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4
1436*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4
1437*c0909341SAndroid Build Coastguard Worker.end3:
1438*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4
1439*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4
1440*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM
1441*c0909341SAndroid Build Coastguard Worker.end4:
    ; Clear the 8x8 coefficient buffer (4 x 32 bytes) before writing pixels.
1442*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
1443*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m4
1444*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m4
1445*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m4
1446*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m4
    ; r3 = 3*stride for WRITE_8X4's default row offsets; m4/m5 are scratch.
1447*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1448*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 1, 4, 5
1449*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1450*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             2, 3, 4, 5
1451*c0909341SAndroid Build Coastguard Worker    RET
1452*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass1 / .main_pass2: the packed 8-point ADST, expanded with argument
; 1 for the first pass and 2 for the second; both are also called from
; iflipadst_8x8_internal_8bpc.
1453*c0909341SAndroid Build Coastguard Worker.main_pass1:
1454*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 1
1455*c0909341SAndroid Build Coastguard Worker    ret
1456*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1457*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2
1458*c0909341SAndroid Build Coastguard Worker    IADST8_1D_PACKED 2
1459*c0909341SAndroid Build Coastguard Worker    ret
1460*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first transform type is flipadst.
1461*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, dct
1462*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, adst
1463*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, flipadst
1464*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN flipadst, identity
1465*c0909341SAndroid Build Coastguard Worker
; iflipadst_8x8_internal_8bpc
;
; First pass (function entry): same loads and .main_pass1 call as
; iadst_8x8_internal_8bpc, but the outputs are interleaved in reversed
; register order (m3/m2 first, then m1/m0) and the -16384 multiplier is
; applied to the other pair of registers.
; Second pass (.pass2): calls iadst_8x8's .main_pass2, builds a multiplier
; that is -2048 in the low lane and +2048 in the high lane, reverses the
; register order m0..m3 -> m3..m0 while permuting qwords, scales the new
; m0/m1 itself and lets iadst_8x8_internal_8bpc.end3 scale m2/m3 and write.
1466*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1467*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q1302 ; 1 0
1468*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q3120 ; 6 7
1469*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q1302 ; 3 2
1470*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q3120 ; 4 5
1471*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_8bpc).main_pass1
1472*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m5, [o(pw_16384)]
1473*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m2
1474*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2
1475*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m0
1476*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0
    ; m0 = -m5: rounding and negation are folded into one pmulhrsw for the
    ; registers multiplied by m0.
1477*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
1478*c0909341SAndroid Build Coastguard Worker    psubw                m0, m5
1479*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m0
1480*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
1481*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m0
1482*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
1483*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m4, m3
1484*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m3
1485*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2, m1
1486*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
    ; vinserti128 pairs the low lanes, vperm2i128 0x31 pairs the high lanes.
1487*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m0, xm3, 1
1488*c0909341SAndroid Build Coastguard Worker    vperm2i128           m3, m0, m3, 0x31
1489*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m4, xm2, 1
1490*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m4, m2, 0x31
1491*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1492*c0909341SAndroid Build Coastguard Worker.pass2:
1493*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m0, q1032
1494*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q1032
1495*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x8_internal_8bpc).main_pass2
    ; The xmm-sized broadcast leaves the high lane of m5 zero, giving the
    ; opposite lane signs to those used by iadst_8x8.
1496*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
1497*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm5, [o(pw_4096)]
1498*c0909341SAndroid Build Coastguard Worker    psubw                m4, m5 ; lower half = -2048, upper half = 2048
    ; Rotate through m5 so that new (m0, m1, m2, m3) = old (m3, m2, m1, m0).
1499*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m3, q2031
1500*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m0, q2031
1501*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m2, q2031
1502*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m1, q2031
    ; m0 still holds the permuted old m2 when it is read for m1; it is then
    ; overwritten from m5.
1503*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0, m4
1504*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5, m4
1505*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end3
1506*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x8 entry points whose first transform type is identity.
1507*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, dct
1508*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, adst
1509*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, flipadst
1510*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X8_FN identity, identity
1511*c0909341SAndroid Build Coastguard Worker
; iidentity_8x8_internal_8bpc
;
; First pass (function entry): pure data movement -- loads the eight 16-byte
; coefficient groups (group n in the low lane paired with group n+4 in the
; high lane), interleaves words and then dwords into m0-m3, and jumps to
; tx2q. No arithmetic is applied in this pass.
; Second pass (.pass2): only selects pw_4096 as the final multiplier in m4 and
; reuses iadst_8x8_internal_8bpc.end for the permute, scale, clear and write.
1512*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2
1513*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16*0]
1514*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*1]
1515*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+16*4], 1
1516*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*5], 1
1517*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16*2]
1518*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*3]
1519*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16*6], 1
1520*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*7], 1
    ; Word interleave of (0,1) and (2,3) pairs, then dword interleave.
1521*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1522*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1523*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
1524*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1525*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1526*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1527*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1528*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1529*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1530*c0909341SAndroid Build Coastguard Worker.pass2:
    ; pmulhrsw by 4096 computes (x*4096 + 0x4000) >> 15, i.e. (x + 4) >> 3
    ; (assuming pw_4096 holds words of 4096, as its name says).
1531*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_4096)]
1532*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_8x8_internal_8bpc).end
1533*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_8X16_FN type1, type2
;
; Expands INV_TXFM_FN for the 8x16 size with the given transform-type pair.
; For the dct_dct pair only, it also emits a DC shortcut: the coefficient at
; [cq] is multiplied by pw_2896x8 twice, eobd is stored back to [cq], r3d is
; OR-ed with 16 and execution continues at the .dconly label of the 8x8
; dct_dct function with xm1 = pw_2896x8 and xm2 = pw_16384, as that label
; expects.
; NOTE(review): the entry label, the condition for reaching this code and the
; incoming eobd/r3d value come from INV_TXFM_FN (outside this section).
; Presumably eobd aliases r3d and is 0 here, so r3d becomes a row count of
; 16 for .dconly_loop -- confirm.
1534*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_8X16_FN 2 ; type1, type2
1535*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 8x16
1536*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
    ; movd zeroes everything above the low dword, so only words 0-1 of the
    ; products can be nonzero.
1537*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
1538*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
1539*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
1540*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; Extra pw_2896x8 multiply compared with the 8x8 DC path.
1541*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
1542*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
1543*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
1544*c0909341SAndroid Build Coastguard Worker%endif
1545*c0909341SAndroid Build Coastguard Worker%endmacro
1546*c0909341SAndroid Build Coastguard Worker
; ITX_8X16_LOAD_COEFS
;
; Loads the eight consecutive 32-byte coefficient vectors starting at the
; incoming cq into m0-m7 (m<n> = vector n), each multiplied by pw_2896x8
; with pmulhrsw.
; Side effects: cq is left advanced by 32*4 bytes, so the vectors are
; afterwards addressed as [cq+32*(-4..3)]; the integer flags are modified by
; the add.
1547*c0909341SAndroid Build Coastguard Worker%macro ITX_8X16_LOAD_COEFS 0
    ; m4 carries the multiplier, so vector 4 must be loaded last.
1548*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2896x8)]
1549*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m4, [cq+32*0]
    ; Re-base cq to the middle of the buffer; the remaining displacements then
    ; stay within -96..+96.
1550*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*4
1551*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m4, [cq+32*3]
1552*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, [cq-32*3]
1553*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m4, [cq+32*2]
1554*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, [cq-32*2]
1555*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m4, [cq+32*1]
1556*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4, [cq-32*1]
1557*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4,     [cq+32*0]
1558*c0909341SAndroid Build Coastguard Worker%endmacro
1559*c0909341SAndroid Build Coastguard Worker
; Instantiate the 8x16 entry points whose first transform type is dct.
1560*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, dct
1561*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, adst
1562*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, flipadst
1563*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN dct, identity
1564*c0909341SAndroid Build Coastguard Worker
; idct_8x16_internal_8bpc
;
; First pass (function entry): loads and pre-scales the coefficients with
; ITX_8X16_LOAD_COEFS (which also advances cq by 32*4), runs the 16x8 IDCT
; .main routine, then transposes m0-m7 while multiplying by the word
; multiplier in m10, and jumps to tx2q.
; Second pass (.pass2): runs .main (IDCT16_1D_PACKED), permutes qwords and
; falls through the shared tail.
;
; Labels intended as entry points from other routines:
;   .pass1_end  - lane regrouping + transpose; expects the multiplier in m10
;                 (iadst_8x16_internal_8bpc enters here with alternating
;                 +16384/-16384 words)
;   .pass1_end2 - second half of the transpose, the m10 multiply, jmp tx2q
;   .end        - load pw_2048 into m8, then .end2
;   .end2       - pmulhrsw m0-m7 by m8, then .end3
;   .end3       - zero 256 bytes at cq-32*4 and write m0-m7 as four 8x4 groups
; NOTE(review): tx2q is not assigned in this section; presumably set by
; INV_TXFM_FN.
1565*c0909341SAndroid Build Coastguard Workercglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1566*c0909341SAndroid Build Coastguard Worker    ITX_8X16_LOAD_COEFS
1567*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
1568*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_16384)]
1569*c0909341SAndroid Build Coastguard Worker.pass1_end:
    ; For each pair (m<n>, m<n+4>): vperm2i128 0x31 collects the two high
    ; lanes (into m9, m8, m6, m5), vinserti128 collects the two low lanes
    ; (into m3, m2, m1, m0).
1570*c0909341SAndroid Build Coastguard Worker    vperm2i128           m9, m3, m7, 0x31
1571*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm7, 1
1572*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m2, m6, 0x31
1573*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm6, 1
1574*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m1, m5, 0x31
1575*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm5, 1
1576*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m0, m4, 0x31
1577*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm4, 1
1578*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m2, m3
1579*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
1580*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m1
1581*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1582*c0909341SAndroid Build Coastguard Worker.pass1_end2:
1583*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m5, m6
1584*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6
1585*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m8, m9
1586*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m9
    ; Apply the pass-1 multiplier to all eight word-interleaved registers.
1587*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8
1588*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
1589*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
1590*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1591*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1592*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5, m6
1593*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6
1594*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m8
1595*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8
1596*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
1597*c0909341SAndroid Build Coastguard Worker.pass2:
1598*c0909341SAndroid Build Coastguard Worker    call .main
1599*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
1600*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
1601*c0909341SAndroid Build Coastguard Worker.end:
1602*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
1603*c0909341SAndroid Build Coastguard Worker.end2:
1604*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
1605*c0909341SAndroid Build Coastguard Worker.end3:
    ; cq was advanced by 32*4 in ITX_8X16_LOAD_COEFS, so offsets -4..3 cover
    ; the whole 8 x 32-byte coefficient buffer.
1606*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
1607*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
    ; r3 = 3*stride for WRITE_8X4's default row offsets. The first write uses
    ; m8/m9 as scratch; the later ones reuse m0/m1, which have already been
    ; written out by then.
1608*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1609*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0, 1, 8, 9
1610*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1611*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             2, 3, 0, 1
1612*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1613*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             4, 5, 0, 1
1614*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
1615*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             6, 7, 0, 1
1616*c0909341SAndroid Build Coastguard Worker    RET
1617*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: one packed 16-point IDCT; called from .pass2 above.
1618*c0909341SAndroid Build Coastguard Workercglobal_label .main
1619*c0909341SAndroid Build Coastguard Worker    IDCT16_1D_PACKED
1620*c0909341SAndroid Build Coastguard Worker    ret
1621*c0909341SAndroid Build Coastguard Worker
; Entry points for the 8x16 transforms paired with the ADST code below.
; INV_TXFM_8X16_FN is defined outside this excerpt; presumably its first
; argument names the pass-1 transform and its second the pass-2 transform
; (TODO: confirm against the macro definition).
1622*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, dct
1623*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, adst
1624*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, flipadst
1625*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN adst, identity
1626*c0909341SAndroid Build Coastguard Worker
; First pass: load the coefficients with ITX_8X16_LOAD_COEFS, run the .main
; and .main_pass1_end helpers of iadst_16x8_internal_8bpc (both defined
; outside this excerpt), build a per-word scaling constant in m10 and
; continue in the pass-1 tail shared with idct_8x16_internal_8bpc.
1627*c0909341SAndroid Build Coastguard Workercglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1628*c0909341SAndroid Build Coastguard Worker    ITX_8X16_LOAD_COEFS
1629*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main
1630*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass1_end
; Build alternating words (+16384, -16384) in m10: shifting each dword of
; 0x40004000 left by 17 leaves words (0, 0x8000), and 16384 - 0x8000 wraps to
; -16384 in every odd word. A pmulhrsw by +/-16384 is a rounded halving with
; an optional sign flip; the visible end of the idct_8x16 pass-1 tail applies
; pmulhrsw by m10 to the whole register set.
1631*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_16384)]
1632*c0909341SAndroid Build Coastguard Worker    pslld                m9, m10, 17
1633*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9 ; 16384, -16384
1634*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).pass1_end
1635*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; Second pass: run the 16-point ADST (.main + .main_pass2_end), then hand
; over to idct_8x16 .end2, which multiplies m0-m7 by m8 with pmulhrsw, zeroes
; the coefficient buffer and stores the result through WRITE_8X4.
1636*c0909341SAndroid Build Coastguard Worker.pass2:
1637*c0909341SAndroid Build Coastguard Worker    call .main
1638*c0909341SAndroid Build Coastguard Worker    call .main_pass2_end
; m8 = pw_4096 (low 128-bit lane only; the VEX-encoded xmm write zeroes the
; upper lane) minus pw_2048, i.e. +2048 in the low lane and -2048 in the
; high lane.
1639*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2048)]
1640*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm8, [o(pw_4096)]
1641*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9
; Reorder the qwords of each register (x86inc qNNNN notation lists the source
; qword for each destination slot, highest slot first) before the shared
; scale-and-store code runs.
1642*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
1643*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
1644*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).end2
; 16-point inverse ADST core working on packed register pairs.
; Entry .main first swaps the two 64-bit halves of every 128-bit lane in m7,
; m1, m5 and m3 (pshufd q1032); entry .main2 skips that step.
; Inputs:  m0-m7, laid out so that the unpacks below produce the input pairs
;          named in the "inN" comments.
; Outputs: m0, m1, m6, m7 hold finished outputs (mixed signs, see the
;          comments on the final shuffles); m3, m4, m5 and m11 hold the
;          intermediates that .main_pass1_end / .main_pass2_end turn into the
;          remaining outputs.
; m10 (pd_2048) and m12 (deint_shuf) are loaded here and still used by those
; tail helpers; m2, m8 and m9 are scratch.
; ITX_MUL2X_PACK is defined outside this excerpt.
1645*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1646*c0909341SAndroid Build Coastguard Workercglobal_label .main
1647*c0909341SAndroid Build Coastguard Worker    REPX {pshufd x, x, q1032}, m7, m1, m5, m3
1648*c0909341SAndroid Build Coastguard Worker.main2:
1649*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)] ; 4th ITX_MUL2X_PACK argument (presumably the rounding term)
; Interleave words so that each dword holds one pair of inputs.
1650*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m7, m0 ; in14 in1
1651*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7     ; in0  in15
1652*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m6, m1 ; in12 in3
1653*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6     ; in2  in13
1654*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5, m2 ; in10 in5
1655*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5     ; in4  in11
1656*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4, m3 ; in8  in7
1657*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; in6  in9
; First rotation stage; m4 and m9 are passed as temporaries.
1658*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 4, 9, 10,  201, 4091, 3 ; t0  t1
1659*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1, 4, 9, 10,  995, 3973, 3 ; t2  t3
1660*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 4, 9, 10, 1751, 3703, 3 ; t4  t5
1661*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 4, 9, 10, 2440, 3290, 3 ; t6  t7
1662*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 4, 9, 10, 3035, 2751, 3 ; t8  t9
1663*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
1664*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
1665*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 4, 9, 10, 4052,  601, 3 ; t14 t15
; Saturating butterflies between the two halves.
1666*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m5 ; t9a  t8a
1667*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m5     ; t1a  t0a
1668*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m6 ; t11a t10a
1669*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m6     ; t3a  t2a
1670*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m2, m7 ; t13a t12a
1671*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m7     ; t5a  t4a
1672*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m3, m8 ; t15a t14a
1673*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m8     ; t7a  t6a
; Second rotation stage. From here on the coefficient pairs are passed in
; registers (m11/m12, plus negated copies built as 0 - x using m9 = 0).
1674*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m4017_799)]
1675*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_799_4017)]
1676*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9 ; zero operand for the psubw negations below
1677*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        4, 8, _, 10, 11, 12, 6 ; t8  t9
1678*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9, m11 ; pw_4017_m799
1679*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 12, _, 10, 12, 8, 6 ; t12 t13
1680*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m2276_3406)]
1681*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_3406_2276)]
1682*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 8, _, 10, 11, 12, 6 ; t10 t11
1683*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9, m11 ; pw_2276_m3406
1684*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        7, 12, _, 10, 12, 8, 6 ; t14 t15
1685*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m1, m3 ; t7   t6
1686*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m3     ; t3   t2
1687*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m2 ; t5   t4
1688*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ; t1   t0
1689*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m5, m7 ; t14a t15a
1690*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m5     ; t10a t11a
1691*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m4, m6 ; t12a t13a
1692*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m6     ; t8a  t9a
; Third rotation stage.
1693*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m3784_1567)]
1694*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_1567_3784)]
1695*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        3, 6, _, 10, 12, 11, 6 ; t5a t4a
1696*c0909341SAndroid Build Coastguard Worker    psubw                m6, m9, m11 ; pw_3784_m1567
1697*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 6, _, 10, 6, 12, 6  ; t7a t6a
1698*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m1567_3784)]
1699*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_3784_1567)]
1700*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        2, 6, _, 10, 11, 12, 6 ; t15 t14
1701*c0909341SAndroid Build Coastguard Worker    psubw                m6, m9, m11 ; pw_1567_m3784
1702*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        5, 12, _, 10, 12, 6, 6 ; t13 t12
; Final butterflies. m12 is reloaded with the deint_shuf byte shuffle, which
; .main_pass2_end also relies on finding in m12.
1703*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [o(deint_shuf)]
1704*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m4, m7        ; -out1  out14
1705*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m7            ;  t10    t11
1706*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m3, m8        ;  t7     t6
1707*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m3            ;  out12 -out3
1708*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m1        ;  t3a    t2a
1709*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1            ; -out15  out0
1710*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m2, m5        ; -out13  out2
1711*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2            ;  t15a   t14a
; Regroup the finished outputs so that each register holds one output pair.
1712*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m12
1713*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
1714*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m12
1715*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m12
1716*c0909341SAndroid Build Coastguard Worker    shufps               m7, m6, m0, q1032 ;  out14 -out15
1717*c0909341SAndroid Build Coastguard Worker    vpblendd             m0, m6, 0x33      ; -out1   out0
1718*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m8, m1        ;  out12 -out13
1719*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m8, m1        ; -out3   out2
1720*c0909341SAndroid Build Coastguard Worker    ret
; Pass-1 tail of the 16-point ADST: turns the intermediates left by .main
; (m3, m4, m5, m11) into outputs 4-11 using 32-bit pmaddwd against +/-2896
; word pairs (2896 ~= 4096/sqrt(2)), a rounding add of m10, an arithmetic
; shift right by 12 and a saturating pack back to words.
; Expects m10 to hold a dword rounding constant on entry (.main loads pd_2048
; there).
; Returns m2, m3, m4, m5 as labelled below, m10 = pw_16384 and m9 = 0.
; Clobbers m8, m11, m12.
1721*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1722*c0909341SAndroid Build Coastguard Worker.main_pass1_end:
1723*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_m2896_2896)]
1724*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
1725*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m8, m11       ; -out11
1726*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m12, m5       ; -out5
1727*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8            ;  out10
1728*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m12           ;  out4
1729*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m10}, m9, m5, m2, m11
1730*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m9, m5, m2, m11
1731*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m9            ;  out10 -out11
1732*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m11           ; -out5   out4
1733*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m8, m3        ;  out8
1734*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896_m2896)]
1735*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12           ; -out7
1736*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m4            ; -out9
1737*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12           ;  out6
1738*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m10}, m11, m3, m8, m4
1739*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m11, m3, m8, m4
1740*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4            ; -out7   out6
1741*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m11, m8       ;  out8  -out9
; Hand back the constants the callers use next: m10 = pw_16384, m9 = 0.
1742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_16384)]
1743*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
1744*c0909341SAndroid Build Coastguard Worker    ret
; Pass-2 tail of the 16-point ADST. It produces the same output layout as
; .main_pass1_end (m2, m3, m4, m5) but stays in 16-bit arithmetic: the
; intermediates are regrouped with the byte shuffle held in m12, combined
; with saturating adds/subtracts and then scaled by pw_2896x8 with pmulhrsw.
; Inputs:  m3, m4, m5, m11 (intermediates from .main) and m12, which must
;          still hold the deint_shuf mask loaded by .main.
; Clobbers m8 and m11.
1745*c0909341SAndroid Build Coastguard WorkerALIGN function_align
1746*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2_end
1747*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
1748*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m11, m12
1749*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m12
1750*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m12
1751*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m12
1752*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m11, m5, m2        ;  t15a   t7
1753*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m2            ;  t14a   t6
1754*c0909341SAndroid Build Coastguard Worker    shufps               m2, m3, m4, q1032 ;  t2a    t10
1755*c0909341SAndroid Build Coastguard Worker    vpblendd             m3, m4, 0xcc      ;  t3a    t11
1756*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2, m3        ;  out8  -out9
1757*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2            ; -out7   out6
1758*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5, m11       ; -out5   out4
1759*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m11           ;  out10 -out11
1760*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m2, m3, m4, m5
1761*c0909341SAndroid Build Coastguard Worker    ret
1762*c0909341SAndroid Build Coastguard Worker
; Entry points for the 8x16 transforms paired with the flipped-ADST code
; below. INV_TXFM_8X16_FN is defined outside this excerpt; presumably the
; first argument names the pass-1 transform (TODO: confirm).
1763*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, dct
1764*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, adst
1765*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, flipadst
1766*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN flipadst, identity
1767*c0909341SAndroid Build Coastguard Worker
; First pass: same helper calls as iadst_8x16_internal_8bpc, but with the
; opposite sign pattern in the scaling constant and an explicit regrouping of
; the result registers before joining the shared idct_8x16 tail at
; .pass1_end2 (that label lies outside this excerpt).
1768*c0909341SAndroid Build Coastguard Workercglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1769*c0909341SAndroid Build Coastguard Worker    ITX_8X16_LOAD_COEFS
1770*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main
1771*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass1_end
; m10 = alternating words (-16384, +16384): the dword shift by 17 leaves
; words (0, 0x8000), and subtracting 16384 from each word gives -16384 in the
; even words and 0x8000 - 0x4000 = +16384 in the odd ones.
1772*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_16384)]
1773*c0909341SAndroid Build Coastguard Worker    pslld               m10, m9, 17
1774*c0909341SAndroid Build Coastguard Worker    psubw               m10, m9 ; -16384, 16384
; vperm2i128 with 0x31 gathers the high 128-bit lanes of both sources, while
; vinserti128 ..., 1 pairs their low lanes. The pairing below takes its low
; half from the higher-numbered register; presumably this is what reverses
; the output order for the flipped transform.
1775*c0909341SAndroid Build Coastguard Worker    vperm2i128           m9, m4, m0, 0x31
1776*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m4, xm0, 1
1777*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m5, m1, 0x31
1778*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, m5, xm1, 1
1779*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m7, m3, 0x31
1780*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m7, xm3, 1
1781*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m6, xm2, 1
1782*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m6, m2, 0x31
1783*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
1784*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1785*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3, m1
1786*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m1
1787*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).pass1_end2
; Second pass: run the 16-point ADST from iadst_8x16, then reverse the
; register order (m0<->m7, m1<->m6, m2<->m5, m3<->m4) while permuting qwords
; and scaling, and finally jump to idct_8x16 .end3, which only clears the
; coefficient buffer and stores (the pmulhrsw scaling is done here instead).
1788*c0909341SAndroid Build Coastguard Worker.pass2:
1789*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main
1790*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass2_end
; m8 = pw_2048 minus pw_4096 in the low lane only (the xmm write zeroes the
; upper lane): -2048 in the low lane, +2048 in the high lane.
1791*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
1792*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        xm9, [o(pw_4096)]
1793*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9
; Each source is parked one register "ahead" (old m0 in m9, old m1 in m7, ...)
; so the swap needs only one spare register; the pmulhrsw block below moves
; the parked values to their final slots.
1794*c0909341SAndroid Build Coastguard Worker    vpermq               m9, m0, q3120
1795*c0909341SAndroid Build Coastguard Worker    vpermq               m0, m7, q2031
1796*c0909341SAndroid Build Coastguard Worker    vpermq               m7, m1, q3120
1797*c0909341SAndroid Build Coastguard Worker    vpermq               m1, m6, q2031
1798*c0909341SAndroid Build Coastguard Worker    vpermq               m6, m2, q3120
1799*c0909341SAndroid Build Coastguard Worker    vpermq               m2, m5, q2031
1800*c0909341SAndroid Build Coastguard Worker    vpermq               m5, m3, q3120
1801*c0909341SAndroid Build Coastguard Worker    vpermq               m3, m4, q2031
1802*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8
1803*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m8
1804*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8
1805*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m8
1806*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5, m8 ; old m3
1807*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m6, m8 ; old m2
1808*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m8 ; old m1
1809*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, m8 ; old m0
1810*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).end3
1811*c0909341SAndroid Build Coastguard Worker
; Entry points for the 8x16 transforms paired with the identity code below.
; INV_TXFM_8X16_FN is defined outside this excerpt; presumably the first
; argument names the pass-1 transform and the second the pass-2 transform
; (TODO: confirm against the macro definition).
1812*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, dct
1813*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, adst
1814*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, flipadst
1815*c0909341SAndroid Build Coastguard WorkerINV_TXFM_8X16_FN identity, identity
1816*c0909341SAndroid Build Coastguard Worker
; Identity-transform scaling of the packed words in one register.
; Arguments are register numbers: src/dst, a temporary, the register holding
; pw_1697x16 and, optionally, the register holding pw_16384.
;   3 arguments: dst = 2*src + pmulhrsw(src, pw_1697x16), saturating.
;   4 arguments: dst = src + pmulhrsw(pmulhrsw(src, pw_1697x16), pw_16384),
;                i.e. the correction term is halved (with rounding) and src
;                is not doubled, giving half the 3-argument result.
; Assuming pw_1697x16 holds 1697*16 as its name suggests, the 3-argument form
; multiplies by roughly 2*sqrt(2) and the 4-argument form by roughly sqrt(2).
; The temporary register is clobbered.
1817*c0909341SAndroid Build Coastguard Worker%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
1818*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m%3, m%1
1819*c0909341SAndroid Build Coastguard Worker%if %0 == 4 ; if downshifting by 1
1820*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m%4
1821*c0909341SAndroid Build Coastguard Worker%else
1822*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%1
1823*c0909341SAndroid Build Coastguard Worker%endif
1824*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%2
1825*c0909341SAndroid Build Coastguard Worker%endmacro
1826*c0909341SAndroid Build Coastguard Worker
; Identity transform for 8x16 blocks.
; First pass: load the sixteen 16-byte coefficient chunks so that each ymm
; register holds chunk k in its low lane and chunk k+8 in its high lane,
; interleave words, multiply everything by pw_2896x8 with pmulhrsw,
; interleave dwords and jump to the second-pass routine whose address is in
; tx2q.
1827*c0909341SAndroid Build Coastguard Workercglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
1828*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16*0]
1829*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*2]
1830*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*8 ; cq now points at the middle of the buffer, so the
                                  ; remaining chunks are addressed with offsets -16*7..+16*7
1831*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+16*0], 1
1832*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*2], 1
1833*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2896x8)]
1834*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq-16*4]
1835*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq-16*2]
1836*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16*4], 1
1837*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+16*6], 1
1838*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq-16*7]
1839*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq-16*5]
1840*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+16*1], 1
1841*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+16*3], 1
1842*c0909341SAndroid Build Coastguard Worker    mova                xm8, [cq-16*3]
1843*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq-16*1]
1844*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [cq+16*5], 1
1845*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*7], 1
; Word interleave (first transpose step); results land in m1-m8.
1846*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1847*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1848*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m5
1849*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5
1850*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m7, m6
1851*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m6
1852*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m8, m0
1853*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0
1854*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8
; Dword interleave (second transpose step); results land in m0-m7.
1855*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1856*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1857*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1858*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1859*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m5, m6
1860*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m6
1861*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m8
1862*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m8
1863*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Second pass: swap the two middle qwords of m0-m7 (vpermq q3120), apply the
; 3-argument IDTX16 scaling with m9 as scratch and m8 = pw_1697x16, then let
; idct_8x16 .end do the final pw_2048 scaling, clear the coefficient buffer
; and store the result.
1864*c0909341SAndroid Build Coastguard Worker.pass2:
1865*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_1697x16)]
1866*c0909341SAndroid Build Coastguard Worker    REPX {vpermq   x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
1867*c0909341SAndroid Build Coastguard Worker    REPX {IDTX16   x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
1868*c0909341SAndroid Build Coastguard Worker    jmp m(idct_8x16_internal_8bpc).end
1869*c0909341SAndroid Build Coastguard Worker
; Add two rows of 16 residual words to two rows of 16 destination pixels and
; store the result back with unsigned saturation.
; Arguments:
;   1-2  residuals for the first and second row: a register number, or any
;        other operand (used verbatim as the paddw source) when the argument
;        is not a plain number
;   3-4  numbers of two scratch registers (both clobbered)
;   5-6  offsets added to dstq to address the first and second row
1870*c0909341SAndroid Build Coastguard Worker%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
1871*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%3, [dstq+%5] ; 16 pixels of the first row, widened to words
1872*c0909341SAndroid Build Coastguard Worker%ifnum %1
1873*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%1
1874*c0909341SAndroid Build Coastguard Worker%else
1875*c0909341SAndroid Build Coastguard Worker    paddw               m%3, %1
1876*c0909341SAndroid Build Coastguard Worker%endif
1877*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, [dstq+%6] ; 16 pixels of the second row, widened to words
1878*c0909341SAndroid Build Coastguard Worker%ifnum %2
1879*c0909341SAndroid Build Coastguard Worker    paddw               m%4, m%2
1880*c0909341SAndroid Build Coastguard Worker%else
1881*c0909341SAndroid Build Coastguard Worker    paddw               m%4, %2
1882*c0909341SAndroid Build Coastguard Worker%endif
; packuswb works per 128-bit lane, so the packed qwords come out in the order
; row0[0-7], row1[0-7], row0[8-15], row1[8-15]; the vpermq swaps the two
; middle qwords so that each lane holds one complete row.
1883*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%4
1884*c0909341SAndroid Build Coastguard Worker    vpermq              m%3, m%3, q3120
1885*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%5], xm%3
1886*c0909341SAndroid Build Coastguard Worker    vextracti128  [dstq+%6], m%3, 1
1887*c0909341SAndroid Build Coastguard Worker%endmacro
1888*c0909341SAndroid Build Coastguard Worker
; Declares one 16x4 inverse-transform entry point through INV_TXFM_FN
; (defined outside this excerpt). For the dct_dct combination it also emits
; the DC-only path: the first coefficient is scaled by a chain of pmulhrsw
; multiplications, broadcast, and added to every destination pixel, two
; 16-pixel rows per loop iteration.
; NOTE(review): how control reaches this code (presumably only when eob is 0)
; is decided inside INV_TXFM_FN -- confirm there.
1889*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X4_FN 2 ; type1, type2
1890*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x4
1891*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
1892*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
1893*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
1894*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
1895*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; presumably eob == 0 here, so this clears the DC coefficient
1896*c0909341SAndroid Build Coastguard Worker    or                  r3d, 4 ; row counter consumed by .dconly_loop
; .dconly expects xm0 = partially scaled DC, xm1 = pw_2896x8, xm2 = first
; pmulhrsw factor and r3d = number of rows to process.
1897*c0909341SAndroid Build Coastguard Worker.dconly:
1898*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
1899*c0909341SAndroid Build Coastguard Worker    movd                xm2, [pw_2048] ; intentionally rip-relative
1900*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
1901*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
1902*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0 ; final DC value in every word
1903*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3 ; zero, used to widen bytes to words
; Each iteration handles two rows: row 0 in the low lane, row 1 in the high
; lane. Bytes are widened against zero, the DC value is added, and packuswb
; clamps the sums back to 8 bits in the original byte order.
1904*c0909341SAndroid Build Coastguard Worker.dconly_loop:
1905*c0909341SAndroid Build Coastguard Worker    mova                xm1, [dstq+strideq*0]
1906*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [dstq+strideq*1], 1
1907*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m1, m3
1908*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3
1909*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
1910*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
1911*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
1912*c0909341SAndroid Build Coastguard Worker    mova         [dstq+strideq*0], xm1
1913*c0909341SAndroid Build Coastguard Worker    vextracti128 [dstq+strideq*1], m1, 1
1914*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
1915*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 2
1916*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
1917*c0909341SAndroid Build Coastguard Worker    RET
1918*c0909341SAndroid Build Coastguard Worker%endif
1919*c0909341SAndroid Build Coastguard Worker%endmacro
1920*c0909341SAndroid Build Coastguard Worker
; Entry points for the 16x4 transforms paired with the DCT code below
; (see INV_TXFM_16X4_FN for what each invocation emits).
1921*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, dct
1922*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, adst
1923*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, flipadst
1924*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN dct, identity
1925*c0909341SAndroid Build Coastguard Worker
; First pass: load eight 16-byte coefficient chunks into xm0-xm7, run the
; .main helper of idct_4x16_internal_8bpc (defined outside this excerpt),
; merge the eight xmm results into four ymm registers, interleave words and
; continue at iadst_16x4 .pass1_end with both scale factors (m1 and m6) set
; to pw_16384.
1926*c0909341SAndroid Build Coastguard Workercglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1927*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*0]
1928*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+16*1]
1929*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*2]
1930*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16*3]
1931*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16*4]
1932*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+16*5]
1933*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+16*6]
1934*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+16*7]
1935*c0909341SAndroid Build Coastguard Worker    call m(idct_4x16_internal_8bpc).main
1936*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, m2, xm6, 1
1937*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m0, xm4, 1
1938*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m1, xm5, 1
1939*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m3, xm7, 1
1940*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2, m6
1941*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6
1942*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_16384)]
1943*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0, m1
1944*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
1945*c0909341SAndroid Build Coastguard Worker    mova                 m1, m6
1946*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).pass1_end
; Second pass: 4-point DCT on m0-m3, then the scale/clear/store tail shared
; with iadst_16x4 (.end).
1947*c0909341SAndroid Build Coastguard Worker.pass2:
1948*c0909341SAndroid Build Coastguard Worker    call .main
1949*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).end
1950*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: loads pd_2048 into m6 and expands IDCT4_1D (defined outside this
; excerpt) on m0-m3, passing m4-m6 as its remaining register arguments.
1951*c0909341SAndroid Build Coastguard Workercglobal_label .main
1952*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pd_2048)]
1953*c0909341SAndroid Build Coastguard Worker    IDCT4_1D              0, 1, 2, 3, 4, 5, 6
1954*c0909341SAndroid Build Coastguard Worker    ret
1955*c0909341SAndroid Build Coastguard Worker
; Entry points for the 16x4 transforms paired with the ADST code below
; (see INV_TXFM_16X4_FN for what each invocation emits).
1956*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, dct
1957*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, adst
1958*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, flipadst
1959*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN adst, identity
1960*c0909341SAndroid Build Coastguard Worker
; First pass: load the four 32-byte coefficient rows with a qword permutation
; applied on load, run the .main2 / .main_pass1_end helpers of
; iadst_4x16_internal_8bpc (defined outside this excerpt), regroup the
; results and fall into .pass1_end.
1961*c0909341SAndroid Build Coastguard Workercglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
1962*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q1230
1963*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q2103
1964*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q1230
1965*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q2103
1966*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main2
1967*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main_pass1_end
1968*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m3, m1
1969*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m2, m0
1970*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
1971*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3
1972*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_16384)]
1973*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m0, xm2, 1
1974*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m2, 0x31
1975*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m4, xm5, 1
1976*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m4, m5, 0x31
1977*c0909341SAndroid Build Coastguard Worker    psubw                m6, m7, m1 ; NOTE(review): -16384 if m7 is zero on return from the helper -- confirm
; Shared pass-1 tail (idct_16x4 also jumps here): scale m3/m4 by m1 and m2/m0
; by m6 with pmulhrsw, interleave words and then dwords so that m0-m3 hold
; the data for the second pass, and jump to the second-pass routine in tx2q.
1978*c0909341SAndroid Build Coastguard Worker.pass1_end:
1979*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m1
1980*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1981*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1
1982*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6
1983*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
1984*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
1985*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
1986*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
1987*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1, m2
1988*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m2
1989*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m4
1990*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m4
1991*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
; Second pass: run this function's .main helper (defined after the RET
; below), then the store tail.
1992*c0909341SAndroid Build Coastguard Worker.pass2:
1993*c0909341SAndroid Build Coastguard Worker    call .main
; .end:  scale m0-m3 by pw_2048 (pmulhrsw), then fall through.
; .end2: zero the 4*32 bytes of coefficients at cq, then fall through.
; .end3: add m0-m3 to four rows of 16 destination pixels and return.
1994*c0909341SAndroid Build Coastguard Worker.end:
1995*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
1996*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
1997*c0909341SAndroid Build Coastguard Worker    WIN64_RESTORE_XMM ; x86inc macro; presumably restores the callee-saved XMM registers early on Win64
1998*c0909341SAndroid Build Coastguard Worker.end2:
1999*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
2000*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m4
2001*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m4
2002*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m4
2003*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m4
2004*c0909341SAndroid Build Coastguard Worker.end3:
2005*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0, 1, 4, 5, strideq*0, strideq*1
2006*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2007*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2, 3, 4, 5, strideq*0, strideq*1
2008*c0909341SAndroid Build Coastguard Worker    RET
; .main -- arithmetic kernel shared by the 16x4 ADST second pass. It is reached
; by `call` from .pass2 of this function and from
; iflipadst_16x4_internal_8bpc.pass2.
; (The constants 1321/2482/3344/3803 suggest the 4-point inverse ADST --
;  NOTE(review): confirm against the constant table, which is not visible here.)
;
; In:  m0..m3 = in0..in3 as packed 16-bit words (see the "inN" comments below).
; Out: m0..m3 = results, each formed as 32-bit pmaddwd sums plus pd_2048,
;      arithmetically shifted right by 12 and packed back to words with signed
;      saturation (packssdw).
; Clobbers: m4..m10.
2009*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2010*c0909341SAndroid Build Coastguard Workercglobal_label .main
2011*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_m3344_3344)]
2012*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_3803_1321)]
2013*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_m1321_2482)]
2014*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2482_3344)]
2015*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m2, m0 ; in2 in0 l
2016*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0     ; in2 in0 h
; m5 = every dword of m6 shifted right by 16: the upper word of each constant
; pair lands in the lower word and the upper word becomes zero, so the pmaddwd
; with m5 below only picks up the lower word of each interleaved (in3, in1) pair.
2017*c0909341SAndroid Build Coastguard Worker    psrld                m5, m6, 16
2018*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m6, m4 ; t2:02 l
2019*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2     ; t2:02 h
2020*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m7, m4 ; t0:02 l
2021*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2     ; t0:02 h
2022*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8     ; t1:02 l
2023*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m2     ; t1:02 h
2024*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3, m1 ; in3 in1 h
2025*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1     ; in3 in1 l
2026*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, m2 ; t2:3 h
2027*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3     ; t2:3 l
2028*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1
; From here on m1 holds the rounding constant pd_2048 added before the >>12.
2029*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pd_2048)]
2030*c0909341SAndroid Build Coastguard Worker    paddd               m10, m5
2031*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m9, m3
2032*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2
2033*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
2034*c0909341SAndroid Build Coastguard Worker    paddd                m7, m1
2035*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5     ; t0 + t3 + 2048 l
2036*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9     ; t0 + t3 + 2048 h
2037*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m3803_3344)]
2038*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m9, m2
2039*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m3
2040*c0909341SAndroid Build Coastguard Worker    paddd               m10, m1     ; t2 + 2048 l
2041*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1     ; t2 + 2048 h
2042*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1     ; t1:13 + 2048 h
2043*c0909341SAndroid Build Coastguard Worker    paddd                m1, m9     ; t1:13 + 2048 l
2044*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m3803_m6688)]
2045*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9
2046*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9
2047*c0909341SAndroid Build Coastguard Worker    paddd                m5, m8     ; t1 + t3 + 2048 h
2048*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4     ; t1 + t3 + 2048 l
; m8/m4 become t0:02 + t1:02 plus the rounding term already folded into m7/m0.
2049*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7
2050*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0
2051*c0909341SAndroid Build Coastguard Worker    paddd                m2, m8     ; t0 + t1 - t3 + 2048 h
2052*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4     ; t0 + t1 - t3 + 2048 l
2053*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
; Re-pack the low/high dword halves into words; results end up in m0..m3.
2054*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m7
2055*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m5
2056*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m2
2057*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m10, m6
2058*c0909341SAndroid Build Coastguard Worker    ret
2059*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X4_FN (defined outside this excerpt) for first
; transform type "flipadst" paired with each of the four second types.
2060*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, dct
2061*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, adst
2062*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, flipadst
2063*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN flipadst, identity
2064*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x4_internal_8bpc
; Pass 1 (entry): loads four 32-byte coefficient rows from cq, runs the
;   iadst_4x16 .main2 / .main_pass1_end helpers (defined elsewhere), interleaves
;   and recombines the 128-bit lanes, then tail-jumps into
;   iadst_16x4_internal_8bpc.pass1_end for scaling and transposition.
; Pass 2 (.pass2): runs iadst_16x4 .main, scales, clears the coefficient
;   buffer and writes the four result registers in the order m3, m2, m1, m0,
;   i.e. the reverse of the 0, 1, 2, 3 order used by iadst_16x4 .end3.
; cglobal operands follow the x86inc convention (NOTE(review): x86inc is not
; part of this excerpt -- see its documentation for the exact meaning).
2065*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
2066*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q1230
2067*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq+32*3], q2103
2068*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq+32*1], q1230
2069*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq+32*2], q2103
2070*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main2
2071*c0909341SAndroid Build Coastguard Worker    call m(iadst_4x16_internal_8bpc).main_pass1_end
2072*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m2
2073*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1, m0
2074*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
2075*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3
; Here m6 = pw_16384 and m1 = m7 - m6, the opposite register assignment to the
; `psubw m6, m7, m1` that precedes iadst_16x4 .pass1_end, so the two multipliers
; applied at .pass1_end are exchanged. m7 is whatever the iadst_4x16 helpers
; left in it (not visible in this excerpt).
2076*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_16384)]
2077*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m0, xm1, 1
2078*c0909341SAndroid Build Coastguard Worker    vperm2i128           m2, m0, m1, 0x31
2079*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m4, xm5, 1
2080*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m4, m5, 0x31
2081*c0909341SAndroid Build Coastguard Worker    psubw                m1, m7, m6
2082*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).pass1_end
2083*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2084*c0909341SAndroid Build Coastguard Worker.pass2:
2085*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x4_internal_8bpc).main
; Scale all four result registers with a rounding high multiply by pw_2048.
2086*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m4, [o(pw_2048)]
2087*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m4}, m3, m2, m1, m0
; Zero the 4*32 bytes of coefficients at cq.
2088*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
2089*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m4
2090*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m4
2091*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m4
2092*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m4
; Registers are handed to WRITE_16X2 (defined elsewhere) as 3, 2 and then 1, 0.
2093*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            3, 2, 4, 5, strideq*0, strideq*1
2094*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
2095*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
2096*c0909341SAndroid Build Coastguard Worker    RET
2097*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X4_FN (defined outside this excerpt) for first
; transform type "identity" paired with each of the four second types.
2098*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, dct
2099*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, adst
2100*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, flipadst
2101*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X4_FN identity, identity
2102*c0909341SAndroid Build Coastguard Worker
; iidentity_16x4_internal_8bpc
; Pass 1 (entry): loads eight 16-byte chunks from cq (chunk k in the low lane,
;   chunk k+4 in the high lane), interleaves the words twice, then computes
;   x + pmulhrsw(pmulhrsw(x, pw_1697x16), pw_16384) with a saturating add,
;   regroups qwords into m0..m3 and jumps to the second pass through tx2q.
; Pass 2 (.pass2): computes x + pmulhrsw(x, pw_1697x8) with a saturating add
;   on m0..m3 and tail-jumps to iadst_16x4_internal_8bpc.end, which scales by
;   pw_2048, clears the coefficients and writes the rows.
2103*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2
2104*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*0]
2105*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16*1]
2106*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*4], 1
2107*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16*5], 1
2108*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16*2]
2109*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+16*3]
2110*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*6], 1
2111*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+16*7], 1
2112*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1697x16)]
2113*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_16384)]
; Two rounds of word interleaving; the interleaved data ends up in m1..m4.
2114*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2, m4
2115*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4
2116*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m1
2117*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
2118*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m2
2119*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m2
2120*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0
2121*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0
; m0/m5/m6/m7 = pw_1697x16 products of m1..m4 (the last one overwrites the
; constant in m7), then scaled again by pw_16384 and added back.
2122*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, m1
2123*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, m2
2124*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m3
2125*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m4
2126*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m5, m6, m7
2127*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m0
2128*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5
2129*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m6
2130*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m7
2131*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2
2132*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2
2133*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
2134*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
2135*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2136*c0909341SAndroid Build Coastguard Worker.pass2:
2137*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1697x8)]
2138*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, m0
2139*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, m1
2140*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, m2
2141*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m3
2142*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4
2143*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m5
2144*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6
2145*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m7
2146*c0909341SAndroid Build Coastguard Worker    jmp m(iadst_16x4_internal_8bpc).end
2147*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X8_FN type1, type2
; Expands INV_TXFM_FN (defined elsewhere) for the 16x8 block size. For the
; dct_dct pair only, it additionally emits a shortcut: the first coefficient
; at [cq] is multiplied twice by pw_2896x8 (pmulhrsw) into xm0, xm2 is loaded
; with pw_16384, eobd is stored to [cq], 8 is OR-ed into r3d, and control
; jumps to the .dconly label of the 16x4 dct_dct function.
; NOTE(review): presumably eob is zero on this path (so the store clears the
; coefficient) and r3d carries the row count for .dconly -- neither is
; visible in this excerpt; confirm against INV_TXFM_FN and .dconly.
2148*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X8_FN 2 ; type1, type2
2149*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x8
2150*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
2151*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
2152*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
2153*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
2154*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
2155*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
2156*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
2157*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2158*c0909341SAndroid Build Coastguard Worker%endif
2159*c0909341SAndroid Build Coastguard Worker%endmacro
2160*c0909341SAndroid Build Coastguard Worker
; ITX_16X8_LOAD_COEFS shuf_odd
; Loads the eight 32-byte coefficient rows at cq into m0..m7 (row N -> mN).
; Even rows (0, 2, 4, 6) are permuted with q3120, odd rows (1, 3, 5, 7) with
; the caller-supplied q<shuf_odd> pattern. Every row is then multiplied by
; pw_2896x8 with pmulhrsw.
; Side effects: cq is advanced by 32*4 and left there, so the rows are
; afterwards addressed as [cq+32*(-4..3)]; m8 is clobbered (holds pw_2896x8).
2161*c0909341SAndroid Build Coastguard Worker%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
2162*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
2163*c0909341SAndroid Build Coastguard Worker    vpermq               m0, [cq+32*0], q3120
2164*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*4
2165*c0909341SAndroid Build Coastguard Worker    vpermq               m7, [cq+32*3], q%1
2166*c0909341SAndroid Build Coastguard Worker    vpermq               m1, [cq-32*3], q%1
2167*c0909341SAndroid Build Coastguard Worker    vpermq               m6, [cq+32*2], q3120
2168*c0909341SAndroid Build Coastguard Worker    vpermq               m2, [cq-32*2], q3120
2169*c0909341SAndroid Build Coastguard Worker    vpermq               m5, [cq+32*1], q%1
2170*c0909341SAndroid Build Coastguard Worker    vpermq               m3, [cq-32*1], q%1
2171*c0909341SAndroid Build Coastguard Worker    vpermq               m4, [cq+32*0], q3120
2172*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
2173*c0909341SAndroid Build Coastguard Worker%endmacro
2174*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN for first transform type "dct" paired with each
; second type; the dct_dct pair also gets the macro's dc-only shortcut.
2175*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, dct
2176*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, adst
2177*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, flipadst
2178*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN dct, identity
2179*c0909341SAndroid Build Coastguard Worker
; idct_16x8_internal_8bpc
; Pass 1 (entry): loads and pre-scales the coefficients, calls the idct_8x16
;   .main helper (defined elsewhere), scales by pw_16384 and transposes into
;   m0..m7, then jumps to the second pass through tx2q.
; Pass 2 (.pass2): runs .main on m0..m7, scales by pw_2048, writes eight rows
;   through WRITE_16X2 and zeroes the coefficient buffer.
; Shared entry points used by other functions in this file:
;   .pass1_end  - second half of the pass-1 scaling plus the transpose
;   .end/.end2  - final scaling of the even / odd numbered registers
;   .end3/.end4 - coefficient clearing / writing of the last four rows
;   .main/.main2 - the IDCT8_1D kernel, with / without loading pd_2048
2180*c0909341SAndroid Build Coastguard Workercglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2181*c0909341SAndroid Build Coastguard Worker    ITX_16X8_LOAD_COEFS 3120
2182*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
2183*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_16384)]
2184*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m0, m2
2185*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
2186*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m3
2187*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3
2188*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m4, m6
2189*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
2190*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m7
2191*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7
; Only m8, m1, m4, m6 are scaled before the label; iadst_16x8 jumps to
; .pass1_end after scaling these four with its own multiplier (m11).
2192*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m8, m1, m4, m6
2193*c0909341SAndroid Build Coastguard Worker.pass1_end:
2194*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m2, m9, m5
2195*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m8
2196*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m8
2197*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m2, m1
2198*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
2199*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m9, m4
2200*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m4
2201*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6
2202*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6
2203*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
2204*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
2205*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m8
2206*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m8
2207*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m4
2208*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m4
2209*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m9, m5
2210*c0909341SAndroid Build Coastguard Worker    punpckhdq            m9, m5
; Lane recombination: m4..m7 receive the high 128-bit lanes of each register
; pair, m0..m3 keep their own low lane and take the partner's low lane.
2211*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m0, m6, 0x31
2212*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm6, 1
2213*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m1, m7, 0x31
2214*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm7, 1
2215*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m2, m8, 0x31
2216*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm8, 1
2217*c0909341SAndroid Build Coastguard Worker    vperm2i128           m7, m3, m9, 0x31
2218*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm9, 1
2219*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2220*c0909341SAndroid Build Coastguard Worker.pass2:
2221*c0909341SAndroid Build Coastguard Worker    call .main
2222*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
2223*c0909341SAndroid Build Coastguard Worker.end:
2224*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m2, m4, m6
; .end2: entered by iadst_16x8 .pass2 with m0/m2/m4/m6 already scaled and its
; own multiplier in m8 for the odd-numbered registers.
2225*c0909341SAndroid Build Coastguard Worker.end2:
2226*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m1, m3, m5, m7
2227*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2228*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
2229*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
; .end3: zero all eight 32-byte coefficient rows. The offsets run from -4 to 3
; because ITX_16X8_LOAD_COEFS left cq advanced by 32*4.
2230*c0909341SAndroid Build Coastguard Worker.end3:
2231*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2232*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
; .end4: step dst down four rows and write the registers m4..m7.
2233*c0909341SAndroid Build Coastguard Worker.end4:
2234*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2235*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4, 5, 0, 1, strideq*0, strideq*1
2236*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6, 7, 0, 1, strideq*2, r3
2237*c0909341SAndroid Build Coastguard Worker    RET
; .main: loads pd_2048 into m10 and runs IDCT8_1D (macro defined elsewhere) on
; m0..m7, passing 8, 9 and 10 as its remaining register arguments.
; .main2 skips the constant load, so m10 must already be set by the caller.
2238*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2239*c0909341SAndroid Build Coastguard Workercglobal_label .main
2240*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
2241*c0909341SAndroid Build Coastguard Worker.main2:
2242*c0909341SAndroid Build Coastguard Worker    IDCT8_1D              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
2243*c0909341SAndroid Build Coastguard Worker    ret
2244*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN for first transform type "adst" paired with each
; second type; none of these is dct_dct, so no dc-only shortcut is emitted.
2245*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, dct
2246*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, adst
2247*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, flipadst
2248*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN adst, identity
2249*c0909341SAndroid Build Coastguard Worker
; iadst_16x8_internal_8bpc
; Pass 1 (entry): loads the coefficients (odd rows permuted with q1302), calls
;   the iadst_8x16 .main2 / .main_pass1_end helpers (defined elsewhere),
;   interleaves, scales four registers by m11 = m9 - m10 and tail-jumps to
;   idct_16x8 .pass1_end, which scales the other four by m10 and transposes.
;   The values of m9 and m10 come from the iadst_8x16 helpers and are not
;   visible in this excerpt.
; Pass 2 (.pass2): runs .main and .main_pass2_end, scales the even-numbered
;   registers by m9 (pw_2048) and lets idct_16x8 .end2 scale the odd-numbered
;   ones by m8 = -m9; the odd registers hold negated outputs (see the
;   "-outN" comments), so the negative multiplier restores their sign.
2250*c0909341SAndroid Build Coastguard Workercglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2251*c0909341SAndroid Build Coastguard Worker    ITX_16X8_LOAD_COEFS 1302
2252*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main2
2253*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass1_end
; m11 must be computed before m9 is overwritten by the interleave below.
2254*c0909341SAndroid Build Coastguard Worker    psubw               m11, m9, m10
2255*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0, m2
2256*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2
2257*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m3
2258*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3
2259*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m4, m6
2260*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
2261*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5, m7
2262*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m7
2263*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m8, m1, m4, m6
2264*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).pass1_end
2265*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2266*c0909341SAndroid Build Coastguard Worker.pass2:
2267*c0909341SAndroid Build Coastguard Worker    call .main
2268*c0909341SAndroid Build Coastguard Worker    call .main_pass2_end
; m9 = pw_2048 (set by .main_pass2_end); m8 = 0 - m9.
2269*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
2270*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9
2271*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m0, m2, m4, m6
2272*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end2
; .main: first stages of the ADST kernel on m0..m7.
; Loads pd_2048 into m10 and passes m8/m9 (and later m3/m9) to ITX_MULSUB_2W
; (macro defined elsewhere) as its extra register arguments.
; On return, per the comments below: m0 = out0, m1 = -out1, m6 = out6,
; m7 = -out7, while m9 (t7), m3 (t3), m5 (t2) and m2 (t6) are intermediates
; still to be finished by .main_pass1_end or .main_pass2_end.
2273*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2274*c0909341SAndroid Build Coastguard Workercglobal_label .main
2275*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pd_2048)]
2276*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         7, 0, 8, 9, 10,  401, 4076 ; t1a, t0a
2277*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
2278*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
2279*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
2280*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m2, m6 ; t6
2281*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6     ; t2
2282*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m0, m4 ; t4
2283*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4     ; t0
2284*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m5, m1 ; t7
2285*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m1     ; t3
2286*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m7, m3 ; t5
2287*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m3     ; t1
2288*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
2289*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
2290*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m6, m8 ;  t7
2291*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m8     ;  out6
2292*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m7, m5 ;  t3
2293*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m5     ; -out7
2294*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m0, m2 ;  t2
2295*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m2     ;  out0
2296*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m4 ;  t6
2297*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m4     ; -out1
2298*c0909341SAndroid Build Coastguard Worker    ret
; .main_pass1_end: finishes out2..out5 in 32-bit precision: pmaddwd with
; pw_m2896_2896 / pw_2896_2896, plus m10 (which must still hold the rounding
; constant loaded by .main), arithmetic shift right by 12, then packssdw.
; Clobbers m8, m9, m11, m12.
2299*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2300*c0909341SAndroid Build Coastguard Worker.main_pass1_end:
2301*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_m2896_2896)]
2302*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
2303*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m3, m5
2304*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5
2305*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m11, m4
2306*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12
2307*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m11, m3
2308*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m12
2309*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m10}, m5, m4, m8, m3
2310*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m5, m8, m4, m3
2311*c0909341SAndroid Build Coastguard Worker    packssdw             m3, m4     ; -out3
2312*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m8, m5 ;  out4
2313*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m9, m2
2314*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m2
2315*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m12, m5
2316*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m11
2317*c0909341SAndroid Build Coastguard Worker    pmaddwd             m12, m9
2318*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m9
2319*c0909341SAndroid Build Coastguard Worker    REPX     {paddd x, m10}, m2, m5, m12, m11
2320*c0909341SAndroid Build Coastguard Worker    REPX     {psrad x, 12 }, m2, m12, m5, m11
2321*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m12    ;  out2
2322*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m11    ; -out5
2323*c0909341SAndroid Build Coastguard Worker    ret
; .main_pass2_end: 16-bit variant of the step above, using saturating word
; add/sub followed by pmulhrsw with pw_2896x8. Clobbers m8 and returns with
; m9 = pw_2048 for the caller's final scaling.
2324*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2325*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2_end
2326*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2896x8)]
2327*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m5, m3
2328*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m5
2329*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m9
2330*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m9
2331*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m8     ;  out2
2332*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m8     ; -out3
2333*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8     ;  out4
2334*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m8     ; -out5
2335*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2048)]
2336*c0909341SAndroid Build Coastguard Worker    ret
2337*c0909341SAndroid Build Coastguard Worker
; Instantiate INV_TXFM_16X8_FN for first transform type "flipadst" paired with
; each second type; none of these is dct_dct, so no dc-only shortcut is emitted.
2338*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, dct
2339*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, adst
2340*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, flipadst
2341*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN flipadst, identity
2342*c0909341SAndroid Build Coastguard Worker
; iflipadst_16x8_internal_8bpc
; Pass 1 (entry): same coefficient load and iadst_8x16 helper calls as
;   iadst_16x8, but the interleave consumes the registers in the opposite
;   order (m6/m4, m7/m5, m3/m1, m2/m0) and the transpose is done inline here
;   instead of through idct_16x8 .pass1_end.
; Pass 2 (.pass2): runs the iadst_16x8 .main / .main_pass2_end helpers, then
;   scales and renames the registers so that the rows are written in the
;   order out7, out6, ..., out0 (derived from the "outN" comments in
;   iadst_16x8 .main / .main_pass2_end).
2343*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2344*c0909341SAndroid Build Coastguard Worker    ITX_16X8_LOAD_COEFS 1302
2345*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main2
2346*c0909341SAndroid Build Coastguard Worker    call m(iadst_8x16_internal_8bpc).main_pass1_end
; m9 = m9 - m10; both values come from the iadst_8x16 helpers (not visible
; here). m10 and m9 are the two multipliers applied after the interleave.
2347*c0909341SAndroid Build Coastguard Worker    psubw                m9, m10
2348*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m6, m4
2349*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m4
2350*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m7, m5
2351*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m5
2352*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m3, m1
2353*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1
2354*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m0
2355*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0
2356*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m8, m4, m5, m1
2357*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m9 }, m6, m7, m3, m2
2358*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m4
2359*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4
2360*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6, m8
2361*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m8
2362*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m3, m5
2363*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5
2364*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m2, m1
2365*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1
2366*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m6
2367*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m6
2368*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m4
2369*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m4
2370*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m3, m5
2371*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m5
2372*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m8, m2
2373*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m2
; Recombine 128-bit lanes so that m0..m7 are laid out for the second pass.
2374*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m6, xm5, 1
2375*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m5, 0x31
2376*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m1, m4, 0x31
2377*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm4, 1
2378*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m0, m3, 0x31
2379*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm3, 1
2380*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m7, xm8, 1
2381*c0909341SAndroid Build Coastguard Worker    vperm2i128           m7, m8, 0x31
2382*c0909341SAndroid Build Coastguard Worker    jmp                tx2q
2383*c0909341SAndroid Build Coastguard Worker.pass2:
2384*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main
2385*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x8_internal_8bpc).main_pass2_end
; m9 = pw_2048 (from .main_pass2_end), m8 = -m9. Registers holding negated
; outputs are multiplied by m8, the others by m9, while being moved so that
; m10, m0, m1, m2, m4, m5, m6, m7 end up in reversed output order.
2386*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
2387*c0909341SAndroid Build Coastguard Worker    psubw                m8, m9
2388*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m7, m8
2389*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0, m9
2390*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6, m9
2391*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m1, m8
2392*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5, m8
2393*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m2, m9
2394*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m4, m9
2395*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m3, m8
2396*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2397*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           10, 0, 8, 9, strideq*0, strideq*1
2398*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            1, 2, 0, 1, strideq*2, r3
; idct_16x8 .end3 clears the coefficients and writes m4..m7 as the last rows.
2399*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end3
2400*c0909341SAndroid Build Coastguard Worker
; Instantiate the public 16x8 entry points whose first transform type is
; identity (INV_TXFM_16X8_FN is defined earlier in the file; presumably it
; emits the wrapper that dispatches into the *_internal routine below).
2401*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, dct
2402*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, adst
2403*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, flipadst
2404*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X8_FN identity, identity
2405*c0909341SAndroid Build Coastguard Worker
; iidentity_16x8_internal_8bpc
;   First pass: loads sixteen 16-byte coefficient chunks from cq, placing chunk
;   i in the low 128-bit lane and chunk i+8 in the high lane of each ymm
;   register, scales them by pw_2896x8, runs the IDTX16 macro (defined
;   elsewhere) on the interleaved data and leaves the result in m0-m7 before
;   jumping to the second-pass routine held in tx2q.
;   Note: cq is advanced by 16*8 bytes and stays advanced on exit.
;   .pass2: only loads the pw_4096 multiplier into m8 and reuses the shared
;   idct_16x8 store tail.
2406*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2
2407*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+16*0]
2408*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16*1]
2409*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*8 ; rebase so chunks 2-15 are reachable as cq-16*6 .. cq+16*7
2410*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2896x8)]
2411*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+16*0], 1 ; high lane = chunk 8
2412*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*1], 1 ; high lane = chunk 9
2413*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq-16*6]
2414*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq-16*5]
2415*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+16*2], 1
2416*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16*3], 1
2417*c0909341SAndroid Build Coastguard Worker    mova                xm8, [cq-16*4]
2418*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq-16*3]
2419*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [cq+16*4], 1
2420*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+16*5], 1
2421*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq-16*2]
2422*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq-16*1]
2423*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*6], 1
2424*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+16*7], 1
2425*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_1697x16)] ; constant operand passed to IDTX16 below
2426*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_16384)]   ; constant operand passed to IDTX16 below
2427*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 ; scale all inputs by pw_2896x8
    ; Word and dword interleaves (per 128-bit lane); m3 is free again as a
    ; destination once the scaling above has consumed the constant.
2428*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, m2
2429*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2
2430*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6, m4
2431*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m4
2432*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m8, m5
2433*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m5
2434*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m0, m1
2435*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
2436*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3, m2
2437*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m2
2438*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m5
2439*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m5
2440*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7, m6
2441*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m6
2442*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m8, m0
2443*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m0
    ; Apply IDTX16 to each of the eight data registers; m0 is passed as the
    ; second macro argument (presumably a scratch register - confirm against
    ; the macro definition), m10/m11 are the constants loaded above.
2444*c0909341SAndroid Build Coastguard Worker    REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
    ; Final qword interleave leaves the pass-1 result in m0-m7.
2445*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m2
2446*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m2
2447*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
2448*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
2449*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m6
2450*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m6
2451*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m7, m8
2452*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m8
2453*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; continue in the second-pass routine selected by the caller
2454*c0909341SAndroid Build Coastguard Worker.pass2:
    ; No arithmetic of its own here: load pw_4096 into m8 and let the shared
    ; idct_16x8 .end tail (outside this view) finish and store the block.
2455*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_4096)]
2456*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x8_internal_8bpc).end
2457*c0909341SAndroid Build Coastguard Worker
; Redefine the base symbol used by the o() constant-addressing macro for the
; code that follows (o() itself is defined outside this view).
2458*c0909341SAndroid Build Coastguard Worker%define o_base pw_5 + 128
2459*c0909341SAndroid Build Coastguard Worker
; INV_TXFM_16X16_FN type1, type2
;   Emits the 16x16 entry point for the given transform pair by expanding
;   INV_TXFM_FN (defined elsewhere). For the dct_dct pair only, it additionally
;   emits a DC-only path: the first coefficient is scaled by pw_2896x8 into
;   xm0, pw_8192 is loaded into xm2, and control tail-jumps to the shared
;   .dconly code of the 16x4 dct_dct function.
;   NOTE(review): how and when this DC-only code is reached is decided inside
;   INV_TXFM_FN - presumably when eob is zero; confirm there.
2460*c0909341SAndroid Build Coastguard Worker%macro INV_TXFM_16X16_FN 2 ; type1, type2
2461*c0909341SAndroid Build Coastguard Worker    INV_TXFM_FN          %1, %2, 16x16
2462*c0909341SAndroid Build Coastguard Worker%ifidn %1_%2, dct_dct
2463*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
2464*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
2465*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
2466*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; overwrites the DC coefficient (presumably eobd == 0 here, i.e. this clears it)
2467*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16   ; presumably the row count consumed by .dconly - confirm against that label
2468*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
2469*c0909341SAndroid Build Coastguard Worker%endif
2470*c0909341SAndroid Build Coastguard Worker%endmacro
2471*c0909341SAndroid Build Coastguard Worker
; ITX_16X16_LOAD_COEFS
;   Loads the sixteen 32-byte coefficient rows at cq into m0-m15 (row i -> m<i>).
;   Side effects:
;     * cq is advanced by 32*8 bytes and is left pointing at row 8, so later
;       code addresses the rows as cq+32*(-8 .. 7).
;     * Row 15 is also spilled to [rsp] after being loaded into m15.
;   Uses aligned loads/stores (mova), so cq and rsp must be suitably aligned.
2472*c0909341SAndroid Build Coastguard Worker%macro ITX_16X16_LOAD_COEFS 0
2473*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32*0]
2474*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*1]
2475*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*2]
2476*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*3]
2477*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*8 ; rebase to the middle of the block; rows 4-7 become negative offsets
2478*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq-32*4]
2479*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq-32*3]
2480*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq-32*2]
2481*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq-32*1]
2482*c0909341SAndroid Build Coastguard Worker    mova                 m8, [cq+32*0]
2483*c0909341SAndroid Build Coastguard Worker    mova                 m9, [cq+32*1]
2484*c0909341SAndroid Build Coastguard Worker    mova                m10, [cq+32*2]
2485*c0909341SAndroid Build Coastguard Worker    mova                m11, [cq+32*3]
2486*c0909341SAndroid Build Coastguard Worker    mova                m12, [cq+32*4]
2487*c0909341SAndroid Build Coastguard Worker    mova                m13, [cq+32*5]
2488*c0909341SAndroid Build Coastguard Worker    mova                m14, [cq+32*6]
2489*c0909341SAndroid Build Coastguard Worker    mova                m15, [cq+32*7]
2490*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15 ; keep a copy of row 15 on the stack; the .main routines read it back as in15
2491*c0909341SAndroid Build Coastguard Worker%endmacro
2492*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x16 entry points whose first transform type is dct.
2493*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, dct
2494*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, adst
2495*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, flipadst
2496*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN dct, identity
2497*c0909341SAndroid Build Coastguard Worker
; idct_16x16_internal_8bpc
;   Uses three 32-byte stack slots, [rsp+32*0] .. [rsp+32*2].
;   First pass : load coefficients, run .main, scale by pw_8192 (.pass1_end),
;                regroup 128-bit lanes and interleave (.pass1_end2/.pass1_end3),
;                then jump to the second-pass routine in tx2q. On exit m0-m14
;                hold data and the sixteenth register's worth is at [rsp].
;   Second pass: run .main, scale by pw_2048 (.end), add to dst through
;                WRITE_16X2 (defined elsewhere) and zero the coefficient buffer.
;   .pass1_end2, .end2 and .end3 are shared tails that the adst routine below
;   jumps into; other users may exist outside this view.
;   .main is a subroutine (entered via call), hence its rsp+gprsize offsets.
2498*c0909341SAndroid Build Coastguard Workercglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2499*c0909341SAndroid Build Coastguard Worker    ITX_16X16_LOAD_COEFS
2500*c0909341SAndroid Build Coastguard Worker    call .main
2501*c0909341SAndroid Build Coastguard Worker.pass1_end:
    ; Scale the even-numbered outputs; m1 is free for the constant because
    ; .main left out1 in [rsp+32*1] rather than in m1.
2502*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_8192)]
2503*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
    ; Park m8: low half -> upper 16 bytes of slot 0, high half -> upper 16 bytes of slot 2.
2504*c0909341SAndroid Build Coastguard Worker    vextracti128 [rsp+16*5], m8, 1
2505*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], xm8
2506*c0909341SAndroid Build Coastguard Worker.pass1_end2:
    ; Entry contract for external jumps: m1 = multiplier for the odd outputs,
    ; even outputs already scaled, m8 already parked as above.
    ; Park m0 next to m8: slot 0 = {m0.lo, m8.lo}, slot 2 = {m0.hi, m8.hi}.
2507*c0909341SAndroid Build Coastguard Worker    vextracti128 [rsp+16*4], m0, 1
2508*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], xm0
2509*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
2510*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, [rsp+32*1] ; scale out1 (spilled by .main); the constant in m1 is consumed here
    ; Regroup 128-bit lanes: m<i> becomes {m<i>.lo, m<i+8>.lo} and m<i+7>
    ; becomes {m<i>.hi, m<i+8>.hi} for i = 1..7.
2511*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m1, m9, 0x31
2512*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm9, 1
2513*c0909341SAndroid Build Coastguard Worker    vperm2i128           m9, m2, m10, 0x31
2514*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm10, 1
2515*c0909341SAndroid Build Coastguard Worker    vperm2i128          m10, m3, m11, 0x31
2516*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm11, 1
2517*c0909341SAndroid Build Coastguard Worker    vperm2i128          m11, m4, m12, 0x31
2518*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, xm12, 1
2519*c0909341SAndroid Build Coastguard Worker    vperm2i128          m12, m5, m13, 0x31
2520*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, xm13, 1
2521*c0909341SAndroid Build Coastguard Worker    vperm2i128          m13, m6, m14, 0x31
2522*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, xm14, 1
2523*c0909341SAndroid Build Coastguard Worker    vperm2i128          m14, m7, m15, 0x31
2524*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, xm15, 1
2525*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+32*2] ; = {m0.hi, m8.hi} parked above
2526*c0909341SAndroid Build Coastguard Worker.pass1_end3:
    ; Word/dword/qword interleave network (the usual in-lane transpose idiom),
    ; first over m8-m15 ...
2527*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m9, m10
2528*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m10
2529*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m15, m8
2530*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m8
2531*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m11, m12
2532*c0909341SAndroid Build Coastguard Worker    punpcklwd           m11, m12
2533*c0909341SAndroid Build Coastguard Worker    punpckhwd           m12, m13, m14
2534*c0909341SAndroid Build Coastguard Worker    punpcklwd           m13, m14
2535*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m11, m13
2536*c0909341SAndroid Build Coastguard Worker    punpckldq           m11, m13
2537*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m15, m9
2538*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m9
2539*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m10, m0
2540*c0909341SAndroid Build Coastguard Worker    punpckhdq           m10, m0
2541*c0909341SAndroid Build Coastguard Worker    punpckhdq            m0, m8, m12
2542*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m12
2543*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m12, m13, m8
2544*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m8
2545*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m9, m11
2546*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m11
2547*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m11, m10, m14
2548*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m10, m14
2549*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m15, m0
2550*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m0
    ; ... then swap through slot 0: fetch the parked {m0.lo, m8.lo} and park the
    ; finished m15 in its place so m15 can serve as a temporary below.
2551*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp]
2552*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
    ; Same interleave network over m0-m7.
2553*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4, m5
2554*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
2555*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m1
2556*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
2557*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6, m7
2558*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
2559*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2, m3
2560*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
2561*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
2562*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
2563*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6
2564*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6
2565*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m7
2566*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7
2567*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m15, m1
2568*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m1
2569*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
2570*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
2571*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
2572*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
2573*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
2574*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
2575*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m15
2576*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m15
2577*c0909341SAndroid Build Coastguard Worker    jmp                tx2q ; m0-m14 in registers, sixteenth row at [rsp] (same layout ITX_16X16_LOAD_COEFS produces)
2578*c0909341SAndroid Build Coastguard Worker.pass2:
2579*c0909341SAndroid Build Coastguard Worker    call .main
2580*c0909341SAndroid Build Coastguard Worker.end:
2581*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2048)]
2582*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
2583*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m6 ; spill m6: register 6 is named as an extra operand of the first WRITE_16X2 below
2584*c0909341SAndroid Build Coastguard Worker.end2:
    ; Entry contract for external jumps: m1 = multiplier for the odd outputs,
    ; even outputs already scaled, scaled m6 spilled to [rsp].
2585*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
2586*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, [rsp+32*1] ; out1 was left on the stack by .main
2587*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
    ; Each WRITE_16X2 handles the two rows named by its first two operands at
    ; the two dst offsets given last; operands 3-4 are presumably scratch
    ; registers (macro defined elsewhere - confirm).
2588*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  6,  0, strideq*0, strideq*1
2589*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
2590*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2591*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
2592*c0909341SAndroid Build Coastguard Worker    WRITE_16X2        [rsp],  7,  0,  1, strideq*2, r3 ; row 6 comes from the stack spill
2593*c0909341SAndroid Build Coastguard Worker.end3:
    ; Zero the sixteen 32-byte coefficient rows (cq points at row 8, see
    ; ITX_16X16_LOAD_COEFS), interleaved with the remaining row writes.
2594*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
2595*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1
2596*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2597*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            8,  9,  0,  1, strideq*0, strideq*1
2598*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           10, 11,  0,  1, strideq*2, r3
2599*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m2},  0,  1,  2,  3,  4,  5,  6,  7
2600*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2601*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           12, 13,  0,  1, strideq*0, strideq*1
2602*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           14, 15,  0,  1, strideq*2, r3
2603*c0909341SAndroid Build Coastguard Worker    RET
2604*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main - 16-point inverse DCT subroutine.
;   In : in0-in14 in m0-m14, in15 at [rsp+gprsize+32*0]
;        (gprsize accounts for the return address pushed by call).
;   Out: out0, out2-out15 in the matching m registers,
;        out1 at [rsp+gprsize+32*1]; m1 is left free for the caller.
;   The even inputs go through IDCT8_1D and the odd inputs through
;   IDCT16_1D_ODDHALF (both defined elsewhere); the three stack slots are
;   used to juggle the values that do not fit in the sixteen registers.
2605*c0909341SAndroid Build Coastguard Workercglobal_label .main
2606*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
    ; Spill in1/in9 so m1 and m9 can be passed to IDCT8_1D as extra operands.
2607*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m1
2608*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m9
2609*c0909341SAndroid Build Coastguard Worker    IDCT8_1D              0,  2,  4,  6,  8, 10, 12, 14,  1,  9, 15
    ; Swap three even-half results out to the stack while pulling the
    ; remaining odd inputs (in9, in1, in15) back into registers.
2610*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+32*2] ; in9
2611*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m14 ; tmp7
2612*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+gprsize+32*1] ; in1
2613*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m10 ; tmp5
2614*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+gprsize+32*0] ; in15
2615*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m6  ; tmp3
2616*c0909341SAndroid Build Coastguard Worker    IDCT16_1D_ODDHALF     9,  3,  5,  7,  1, 11, 13, 14,  6, 10, 15
    ; Final butterflies (saturating add/sub) combining the two halves.
2617*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+32*1] ; tmp5
2618*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m0, m14  ; out15
2619*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m14      ; out0
2620*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m2, m13  ; out14
2621*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m13      ; out1
2622*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m2       ; out1 is returned on the stack, not in m1
2623*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m4, m11  ; out13
2624*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m4, m11  ; out2
2625*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m8, m7   ; out11
2626*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m8, m7   ; out4
2627*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+32*2] ; tmp7
2628*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m6, m5   ; out10
2629*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m6       ; out5
2630*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m7, m9   ; out8
2631*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m9       ; out7
2632*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m12, m3  ; out9
2633*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m12, m3  ; out6
2634*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+32*0] ; tmp3
2635*c0909341SAndroid Build Coastguard Worker    psubsw              m12, m3, m1   ; out12
2636*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m1       ; out3
2637*c0909341SAndroid Build Coastguard Worker    ret
2638*c0909341SAndroid Build Coastguard Worker
; Instantiate the 16x16 entry points whose first transform type is adst.
2639*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, dct
2640*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, adst
2641*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN adst, flipadst
2642*c0909341SAndroid Build Coastguard Worker
; iadst_16x16_internal_8bpc
;   Same stack frame as the idct routine (three 32-byte slots). Both passes run
;   .main plus a pass-specific ".main_passN_end" and then reuse the idct
;   routine's shared tails. .main leaves the odd-numbered outputs negated, so
;   the multiplier in m1 is negated before those tails scale the odd outputs.
2643*c0909341SAndroid Build Coastguard Workercglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2644*c0909341SAndroid Build Coastguard Worker    ITX_16X16_LOAD_COEFS
2645*c0909341SAndroid Build Coastguard Worker    call .main
2646*c0909341SAndroid Build Coastguard Worker    call .main_pass1_end
    ; .main_pass1_end returned with m1 = pw_8192 and with out0/out2/out12/out14
    ; parked in cq rows 0-3; reload those scaled, and scale the other evens in place.
2647*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1, [cq+32*0]
2648*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m1, [cq+32*1]
2649*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m4, m6, m8, m10
2650*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m1, [cq+32*2]
2651*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m1, [cq+32*3]
    ; Park m8 exactly as idct .pass1_end does, since we join at .pass1_end2.
2652*c0909341SAndroid Build Coastguard Worker    vextracti128 [rsp+16*5], m8, 1
2653*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], xm8
2654*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
2655*c0909341SAndroid Build Coastguard Worker    psubw                m1, m8, m1 ; m1 = -pw_8192: scaling the negated odd outputs also flips their sign back
2656*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).pass1_end2
2657*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2658*c0909341SAndroid Build Coastguard Worker.pass2:
2659*c0909341SAndroid Build Coastguard Worker    call .main
2660*c0909341SAndroid Build Coastguard Worker    call .main_pass2_end
    ; m1 is presumably the rounding multiplier left by .main_pass2_end (its
    ; body is not fully visible here - confirm).
2661*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
2662*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6 ; same m6 spill idct .end performs before .end2
2663*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
2664*c0909341SAndroid Build Coastguard Worker    psubw                m1, m6, m1 ; negate the multiplier for the (negated) odd outputs
2665*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end2
2666*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main - shared body of the 16-point inverse ADST (both passes).
;   In : in0-in14 in m0-m14, in15 at [rsp+gprsize+32*0]
;        (gprsize accounts for the return address pushed by call).
;   Out (per the inline annotations below):
;        finished : out0 (m0), out2 (m2), out12 (m12), out14 (m14),
;                   -out3 (m3), -out13 (m13), -out15 (m15),
;                   -out1 at [rsp+gprsize+32*1]
;        pending  : t2a (m7), t3a (m9), t6 (m4), t7 (m8),
;                   t10 (m1), t11 (m6), t14a (m10), t15a (m11)
;   The pending pairs still need the final rotation, which the pass-specific
;   .main_pass1_end / .main_pass2_end routines apply.
;   ITX_MULSUB_2W is defined elsewhere; its 3rd/4th operands are presumably
;   scratch registers and the 5th the rounding constant (m15 = pd_2048).
2667*c0909341SAndroid Build Coastguard Workercglobal_label .main
2668*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
    ; Spill in0/in4 so m0 and m4 can be handed to ITX_MULSUB_2W below.
2669*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m0
2670*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m4
2671*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        13,  2,  0,  4, 15,  995, 3973 ; t3,  t2
2672*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         9,  6,  0,  4, 15, 2440, 3290 ; t7,  t6
2673*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5, 10,  0,  4, 15, 3513, 2106 ; t11, t10
2674*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1, 14,  0,  4, 15, 4052,  601 ; t15, t14
2675*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m2, m10  ; t10a
2676*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m10      ; t2a
2677*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m13, m5  ; t11a
2678*c0909341SAndroid Build Coastguard Worker    paddsw              m13, m5       ; t3a
2679*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m6, m14  ; t14a
2680*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m14      ; t6a
2681*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m9, m1   ; t15a
2682*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m1       ; t7a
2683*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         0, 10,  1,  4, 15, 3406, 2276 ; t11, t10
2684*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14,  5,  1,  4, 15, 2276, 3406 ; t14, t15
2685*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m10, m14 ; t14a
2686*c0909341SAndroid Build Coastguard Worker    paddsw              m10, m14      ; t10a
2687*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m0, m5   ; t15a
2688*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m5       ; t11a
2689*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m2, m6   ; t6
2690*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m6       ; t2
2691*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m13, m9  ; t7
2692*c0909341SAndroid Build Coastguard Worker    paddsw              m13, m9       ; t3
2693*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6,  5,  4,  9, 15, 3784, 1567 ; t6a, t7a
2694*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14,  1,  4,  9, 15, 3784, 1567 ; t14, t15
    ; Swap intermediate results out to the stack while pulling the remaining
    ; inputs (in15, in0, in4) back into registers for the second half.
2695*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+gprsize+32*0] ; in15
2696*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m10 ; t10a
2697*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+32*1] ; in0
2698*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m6  ; t6a
2699*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+32*2] ; in4
2700*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m2  ; t2
2701*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         9,  4,  2, 10, 15,  201, 4091 ; t1,  t0
2702*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        11,  6,  2, 10, 15, 1751, 3703 ; t5,  t4
2703*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         7,  8,  2, 10, 15, 3035, 2751 ; t9,  t8
2704*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         3, 12,  2, 10, 15, 3857, 1380 ; t13, t12
2705*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m4, m8  ; t8a
2706*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m4      ; t0a
2707*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m9, m7  ; t9a
2708*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m7      ; t1a
2709*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m6, m12 ; t12a
2710*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m12     ; t4a
2711*c0909341SAndroid Build Coastguard Worker    psubsw              m12, m11, m3 ; t13a
2712*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m3      ; t5a
2713*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        10,  4,  2,  3, 15,  799, 4017 ; t9,  t8
2714*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        12,  7,  2,  3, 15, 4017,  799 ; t12, t13
2715*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m9, m11 ; t5
2716*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m11     ; t1
2717*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m4, m12 ; t12a
2718*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m12     ; t8a
2719*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m8, m6  ; t0
2720*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m6      ; t4
2721*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m10, m7 ; t9a
2722*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m7      ; t13a
2723*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         8,  3,  2,  7, 15, 1567, 3784 ; t5a, t4a
2724*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        11, 10,  2,  7, 15, 1567, 3784 ; t13, t12
    ; Last butterfly stage: merge both halves into the finished outputs and
    ; the pending t-values listed in the header.
2725*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+32*0] ; t10a
2726*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+32*1] ; t6a
2727*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m9, m13  ; -out15
2728*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m13      ;  t3a
2729*c0909341SAndroid Build Coastguard Worker    paddsw              m13, m11, m1  ; -out13
2730*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m1       ;  t15a
2731*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m4, m7   ;  t10
2732*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m4       ; -out1
2733*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m3, m2   ;  t6
2734*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2       ; -out3
2735*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m10, m14 ;  out2
2736*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m14      ;  t14a
2737*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m6, m0   ;  out14
2738*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m0       ;  t11
2739*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+32*2] ; t2
2740*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m7       ; -out1 is returned on the stack, freeing m7
2741*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m12, m0  ;  t2a
2742*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m12      ;  out0
2743*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m8, m5   ;  out12
2744*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m5       ;  t7
2745*c0909341SAndroid Build Coastguard Worker    ret
2746*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main_pass1_end - final rotation stage of the 16-point inverse ADST, pass 1.
;   Takes the four pending pairs left by .main, (t15a,t14a), (t7,t6),
;   (t3a,t2a) and (t11,t10), and forms their sum and difference scaled by
;   2896/4096 using 32-bit intermediates: pmaddwd against +/-2896 word pairs,
;   add pd_2048, arithmetic shift right by 12, then pack back to 16 bits.
;   The finished out0/out2/out12/out14 are parked in cq rows 0-3 first, because
;   m0, m2, m12 and m14 are needed as temporaries/constants here.
;   Out: out4 (m4), out6 (m6), out8 (m8), out10 (m10),
;        -out5 (m5), -out7 (m7), -out9 (m9), -out11 (m11),
;        m1 = pw_8192 (the pass-1 scaling multiplier for the caller).
2747*c0909341SAndroid Build Coastguard Worker.main_pass1_end:
2748*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m0
2749*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m2
2750*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m12
2751*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m14
2752*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [pw_m2896_2896]
2753*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [pw_2896_2896]
2754*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [pd_2048]
    ; (t15a, t14a) -> out10, -out5
2755*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m11, m10
2756*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m10
2757*c0909341SAndroid Build Coastguard Worker    pmaddwd             m10, m14, m5
2758*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14, m11
2759*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12
2760*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m12
2761*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m2}, m10, m0, m5, m11
2762*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m10, m0, m5, m11
2763*c0909341SAndroid Build Coastguard Worker    packssdw            m10, m0  ;  out10
2764*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m11 ; -out5
    ; (t7, t6) -> out4, -out11
2765*c0909341SAndroid Build Coastguard Worker    punpcklwd           m11, m8, m4
2766*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m4
2767*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m12, m11
2768*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m12, m8
2769*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m14
2770*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m14
2771*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m2}, m4, m0, m11, m8
2772*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m4, m0, m11, m8
2773*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m0  ;  out4
2774*c0909341SAndroid Build Coastguard Worker    packssdw            m11, m8  ; -out11
    ; (t3a, t2a) -> -out7, out8
2775*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m7
2776*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m7
2777*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m12, m8
2778*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m12, m9
2779*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m14
2780*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m14
2781*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m2}, m7, m0, m8, m9
2782*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m7, m0, m8, m9
2783*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0  ; -out7
2784*c0909341SAndroid Build Coastguard Worker    packssdw             m8, m9  ;  out8
    ; (t11, t10) -> -out9, out6. Note the high/low unpack order and the packssdw
    ; operand order are swapped relative to the three groups above, so each
    ; pack still combines low-half then high-half results.
2785*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m6, m1
2786*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m1
2787*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m14, m0
2788*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m14, m6
2789*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m12
2790*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m12
2791*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m2}, m1, m9, m0, m6
2792*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 12}, m1, m9, m0, m6
2793*c0909341SAndroid Build Coastguard Worker    packssdw             m9, m1  ; -out9
2794*c0909341SAndroid Build Coastguard Worker    packssdw             m6, m0  ;  out6
2795*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_8192)] ; hand the pass-1 scaling multiplier back to the caller
2796*c0909341SAndroid Build Coastguard Worker    ret
; ---------------------------------------------------------------------------
; .main_pass2_end -- pass-2 tail of the 16x16 inverse ADST; callers in this
; chunk reach it as m(iadst_16x16_internal_8bpc).main_pass2_end.
;   In : m1, m4, m6-m11 = intermediates left by the preceding .main call
;        (NOTE(review): .main is outside this chunk -- confirm the contents).
;   Out: m4-m11 = out4..out11 with the signs shown in the per-line comments,
;        each multiplied by pw_2896x8 via pmulhrsw;
;        m1 = pw_2048 broadcast (the rounding factor the callers apply next).
; ---------------------------------------------------------------------------
2797*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2798*c0909341SAndroid Build Coastguard Workercglobal_label .main_pass2_end
2799*c0909341SAndroid Build Coastguard Worker    ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
2800*c0909341SAndroid Build Coastguard Worker    ; 16-bit here will produce the same result as using 32-bit intermediates.
2801*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m10, m11 ; -out5
2802*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m11      ;  out10
2803*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m4, m8   ; -out11
2804*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m8       ;  out4
2805*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m7, m9   ;  out8
2806*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m9       ; -out7
2807*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m1, m6   ; -out9
2808*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m1       ;  out6
2809*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2896x8)] ; m1 was consumed above, reuse it for the scale factor
2810*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 ; scale all eight butterfly results
2811*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(pw_2048)] ; returned to the caller in m1
2812*c0909341SAndroid Build Coastguard Worker    ret
2813*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; 16x16 inverse flipped-ADST, 8 bpc.
;
; The three INV_TXFM_16X16_FN lines instantiate the entry points that pair
; flipadst with dct / adst / flipadst (the macro is defined outside this
; chunk; presumably its arguments are the pass-1 and pass-2 transform types).
;
; iflipadst_16x16_internal_8bpc reuses the iadst_16x16 code for the actual
; transform: each pass calls its .main routine followed by the matching
; .main_pass1_end / .main_pass2_end helper. Those helpers return a rounding
; factor in m1 (pw_8192 after pass 1, pw_2048 after pass 2). The code here
; multiplies one group of outputs by m1 and the other group by 0 - m1, while
; at the same time moving the results into the register order expected by
; the shared idct_16x16 tail it jumps to. The negated factor is presumably
; applied to the outputs that arrive with inverted sign (cf. the -outN
; comments in the helpers).
;
; .pass2 is not reachable by fall-through (an unconditional jmp precedes it);
; presumably it is entered through the tx2 pointer by the shared pass-1 tail
; -- confirm against idct_16x16_internal_8bpc.
; ---------------------------------------------------------------------------
2814*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, dct
2815*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, adst
2816*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN flipadst, flipadst
2817*c0909341SAndroid Build Coastguard Worker
2818*c0909341SAndroid Build Coastguard Workercglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2819*c0909341SAndroid Build Coastguard Worker    ITX_16X16_LOAD_COEFS
2820*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main
2821*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass1_end ; returns with m1 = pw_8192
2822*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m1
2823*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m1, m8
2824*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m6 ; spill: m6 is reused as a destination on the next line
2825*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m1, m4
2826*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1, m10
2827*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m1, [cq+32*3] ; four operands come from the coefficient buffer
2828*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m1, [cq+32*2] ; (presumably parked there by .main -- confirm)
2829*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m1, [cq+32*1]
2830*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m1, [cq+32*0]
2831*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2832*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1 ; m0 = 0 - m1: negated rounding factor
2833*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m0}, m3, m5, m7, m11, m15 ; round and flip sign in one multiply
2834*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m0, m9 ; m1 (positive factor) is no longer needed from here on
2835*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m0, m13
2836*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, [rsp+32*1]
2837*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], xm15 ; low lanes of m15 and m7 fill stack slot 0 ...
2838*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], xm7
2839*c0909341SAndroid Build Coastguard Worker    vperm2i128          m15, m15, m7, 0x31 ; ... their high lanes are merged into m15
2840*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, m2, xm14, 1 ; vinserti128 a, xb, 1 -> {a.lo, b.lo}
2841*c0909341SAndroid Build Coastguard Worker    vperm2i128          m14, m2, m14, 0x31 ; vperm2i128 a, b, 0x31 -> {a.hi, b.hi}
2842*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m9, xm5, 1
2843*c0909341SAndroid Build Coastguard Worker    vperm2i128           m9, m9, m5, 0x31
2844*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, m4, xm12, 1
2845*c0909341SAndroid Build Coastguard Worker    vperm2i128          m12, m4, m12, 0x31
2846*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, m11, xm3, 1
2847*c0909341SAndroid Build Coastguard Worker    vperm2i128          m11, m11, m3, 0x31
2848*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m10, xm6, 1
2849*c0909341SAndroid Build Coastguard Worker    vperm2i128          m10, m10, m6, 0x31
2850*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, m1, xm0, 1
2851*c0909341SAndroid Build Coastguard Worker    vperm2i128          m13, m1, m0, 0x31
2852*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m8, [rsp+32*2], 1 ; pairs m8 with the value spilled from m6 above
2853*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m8, [rsp+32*2], 0x31
2854*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).pass1_end3 ; shared pass-1 tail (outside this chunk)
2855*c0909341SAndroid Build Coastguard Worker.pass2:
2856*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main
2857*c0909341SAndroid Build Coastguard Worker    call m(iadst_16x16_internal_8bpc).main_pass2_end ; returns with m1 = pw_2048
2858*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m1
2859*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m1
2860*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m0 ; spill both: m0 and m8 are overwritten below
2861*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m8
2862*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
2863*c0909341SAndroid Build Coastguard Worker    psubw                m0, m1 ; m0 = 0 - m1: negated rounding factor
2864*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m0, m7 ; each pair below swaps two registers (7<->9, 6<->10, ...)
2865*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0, m9 ; while applying the +/- rounding factor
2866*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m1, m6
2867*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m1, m10
2868*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m0, m5
2869*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m0, m11
2870*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m1, m4
2871*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m1, m12
2872*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m0, m3
2873*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m0, m13
2874*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m1, m2
2875*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m14
2876*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m0, [rsp+32*1]
2877*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15
2878*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
2879*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  2,  0, strideq*0, strideq*1 ; WRITE_16X2 is defined outside this chunk
2880*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+32*0] ; reload the value spilled from m0 at the top of pass 2
2881*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            3,  4,  0,  1, strideq*2, r3
2882*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
2883*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            5,  6,  0,  1, strideq*0, strideq*1
2884*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            7, [rsp+32*2],  0,  1, strideq*2, r3 ; second source is the value spilled from m8
2885*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end3 ; shared tail (outside this chunk), presumably writes the remaining rows
2886*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; IDTX16B src/dst, tmp, coef
;   tmp     = pmulhrsw(coef, src/dst) >> 1      (arithmetic shift)
;   src/dst = pavgw(src/dst, tmp)
; pavgw is an unsigned rounding average; it is usable on signed words here
; only because both operands always share the same sign (see the comment on
; the last line). Clobbers the tmp register.
; NOTE(review): with coef = pw_1697x16 (assumed to hold 1697*16) the net
; effect is roughly x * 0.707, i.e. the 16-point identity scale with the
; pass-1 down-shift folded in -- confirm against the constant table.
; ---------------------------------------------------------------------------
2887*c0909341SAndroid Build Coastguard Worker%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
2888*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m%3, m%1
2889*c0909341SAndroid Build Coastguard Worker    psraw               m%2, 1
2890*c0909341SAndroid Build Coastguard Worker    pavgw               m%1, m%2 ; signs are guaranteed to be equal
2891*c0909341SAndroid Build Coastguard Worker%endmacro
2892*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; 16x16 inverse identity transform, 8 bpc.
;
; The two INV_TXFM_16X16_FN lines instantiate the entry points that pair
; identity with dct / identity (the macro is defined outside this chunk).
;
; Pass 1 gathers the coefficient buffer in 16-byte pieces: for k = 0..15 the
; low 128-bit lane of a register is loaded from [cq+16*k] and the high lane
; from [cq+16*(k+16)]. cq is advanced by 16*16 bytes part-way through, so the
; later offsets are relative to the new base (it is not restored here).
; Register assignment by k:
;   k : 0  1  2  3  4  5  6  7   8  9 10 11 12 13 14 15
;   m : 0 15  1  8  2  9  3 10   4 11  5 12  6 13  7 14
; Every register is scaled with IDTX16B. m6 (scratch) and m7 (the constant)
; are busy during the first batch, so k = 12 and k = 14 are loaded afterwards;
; k = 14 goes through m0 (whose scaled value is parked at [rsp]) and the
; IDTX16B sequence is written out by hand so the result lands in m7.
;
; Pass 2 applies IDTX16 (defined outside this chunk) to m0-m14, spilling
; whichever register serves as scratch, and computes the 16th value from
; [rsp+32*0] into m15 before jumping to the shared idct_16x16 tail.
; ---------------------------------------------------------------------------
2893*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, dct
2894*c0909341SAndroid Build Coastguard WorkerINV_TXFM_16X16_FN identity, identity
2895*c0909341SAndroid Build Coastguard Worker
2896*c0909341SAndroid Build Coastguard Workercglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
2897*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1697x16)] ; scale constant, kept in m7 until the very end of pass 1
2898*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+16* 0]
2899*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*16], 1
2900*c0909341SAndroid Build Coastguard Worker    mova               xm15, [cq+16* 1]
2901*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [cq+16*17], 1
2902*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+16* 2]
2903*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+16*18], 1
2904*c0909341SAndroid Build Coastguard Worker    mova                xm8, [cq+16* 3]
2905*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [cq+16*19], 1
2906*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+16* 4]
2907*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*20], 1
2908*c0909341SAndroid Build Coastguard Worker    mova                xm9, [cq+16* 5]
2909*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [cq+16*21], 1
2910*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+16* 6]
2911*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+16*22], 1
2912*c0909341SAndroid Build Coastguard Worker    mova               xm10, [cq+16* 7]
2913*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*16 ; rebase: offsets below are relative to the old cq+256
2914*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [cq+16* 7], 1 ; = old [cq+16*23]
2915*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq-16* 8]
2916*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16* 8], 1
2917*c0909341SAndroid Build Coastguard Worker    mova               xm11, [cq-16* 7]
2918*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [cq+16* 9], 1
2919*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq-16* 6]
2920*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+16*10], 1
2921*c0909341SAndroid Build Coastguard Worker    mova               xm12, [cq-16* 5]
2922*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [cq+16*11], 1
2923*c0909341SAndroid Build Coastguard Worker    mova               xm13, [cq-16* 3]
2924*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [cq+16*13], 1
2925*c0909341SAndroid Build Coastguard Worker    mova               xm14, [cq-16* 1]
2926*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [cq+16*15], 1
2927*c0909341SAndroid Build Coastguard Worker    REPX  {IDTX16B x, 6, 7},  0, 15,  1,  8,  2,  9,  3, \
2928*c0909341SAndroid Build Coastguard Worker                             10,  4, 11,  5, 12, 13, 14
2929*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq-16* 4] ; m6 is free as scratch no longer: load k = 12 into it
2930*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+16*12], 1
2931*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m0 ; park the scaled k = 0 value so m0 can serve as scratch
2932*c0909341SAndroid Build Coastguard Worker    IDTX16B               6, 0, 7
2933*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq-16* 2] ; k = 14, loaded into m0 because m7 still holds the constant
2934*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*14], 1
2935*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m0 ; same three steps as IDTX16B, but the result replaces the constant in m7
2936*c0909341SAndroid Build Coastguard Worker    psraw                m7, 1
2937*c0909341SAndroid Build Coastguard Worker    pavgw                m7, m0
2938*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).pass1_end3 ; shared pass-1 tail (outside this chunk)
2939*c0909341SAndroid Build Coastguard WorkerALIGN function_align
2940*c0909341SAndroid Build Coastguard Worker.pass2:
2941*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_1697x16)]
2942*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m0 ; m0 is the scratch register for the batch below
2943*c0909341SAndroid Build Coastguard Worker    REPX  {IDTX16 x, 0, 15},  1,  2,  3,  4,  5,  6,  7, \
2944*c0909341SAndroid Build Coastguard Worker                              8,  9, 10, 11, 12, 13, 14
2945*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+32*1] ; restore m0, then use m1 as scratch instead
2946*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m1
2947*c0909341SAndroid Build Coastguard Worker    IDTX16                0, 1, 15
2948*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*0] ; 16th input; presumably placed there by the shared pass-1 tail -- confirm
2949*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m1 ; m15 = pmulhrsw(coef, x) ...
2950*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m1 ; ... m1 = 2*x (saturating) ...
2951*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m1 ; ... m15 = 2*x + pmulhrsw(coef, x), replacing the constant
2952*c0909341SAndroid Build Coastguard Worker    jmp m(idct_16x16_internal_8bpc).end ; shared tail (outside this chunk)
2953*c0909341SAndroid Build Coastguard Worker
; Base address for constant addressing in the functions below: the 8x32 entry
; point loads it into r6 (lea r6, [o_base]) and later derives other table
; addresses from it (e.g. [r6-(o_base)+pw_201_4091x8]).
; NOTE(review): the o() macro is defined outside this chunk; presumably it
; expands to an r6-relative address using this base -- confirm.
2954*c0909341SAndroid Build Coastguard Worker%define o_base deint_shuf + 128
2955*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; LOAD_8ROWS src, stride [, is_rect2 = 0]
; Loads eight rows, [src + stride*i] for i = 0..7, into m0-m7.
;   is_rect2 == 0 : plain loads; no other register is touched.
;   is_rect2 != 0 : every row is multiplied by pw_2896x8 with pmulhrsw while
;                   being loaded; the constant is broadcast into m15, which
;                   is therefore clobbered.
; ---------------------------------------------------------------------------
2956*c0909341SAndroid Build Coastguard Worker%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
2957*c0909341SAndroid Build Coastguard Worker%if %3
2958*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2896x8)]
2959*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15, [%1+%2*0]
2960*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [%1+%2*1]
2961*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m15, [%1+%2*2]
2962*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m15, [%1+%2*3]
2963*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m15, [%1+%2*4]
2964*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m15, [%1+%2*5]
2965*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m15, [%1+%2*6]
2966*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m15, [%1+%2*7]
2967*c0909341SAndroid Build Coastguard Worker%else
2968*c0909341SAndroid Build Coastguard Worker    mova                 m0, [%1+%2*0]
2969*c0909341SAndroid Build Coastguard Worker    mova                 m1, [%1+%2*1]
2970*c0909341SAndroid Build Coastguard Worker    mova                 m2, [%1+%2*2]
2971*c0909341SAndroid Build Coastguard Worker    mova                 m3, [%1+%2*3]
2972*c0909341SAndroid Build Coastguard Worker    mova                 m4, [%1+%2*4]
2973*c0909341SAndroid Build Coastguard Worker    mova                 m5, [%1+%2*5]
2974*c0909341SAndroid Build Coastguard Worker    mova                 m6, [%1+%2*6]
2975*c0909341SAndroid Build Coastguard Worker    mova                 m7, [%1+%2*7]
2976*c0909341SAndroid Build Coastguard Worker%endif
2977*c0909341SAndroid Build Coastguard Worker%endmacro
2978*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; LOAD_8ROWS_H src, stride [, is_rect2 = 0]
; Counterpart of LOAD_8ROWS for the upper register bank: loads eight rows,
; [src + stride*i] for i = 0..7, into m8-m15.
;   is_rect2 == 0 : plain loads.
;   is_rect2 == 1 : broadcast pw_2896x8 into m15 and multiply every row by it
;                   with pmulhrsw.
;   other nonzero : multiply by whatever m15 already holds; the caller must
;                   have loaded the factor beforehand.
; In the scaled variants m15 doubles as the factor and as the destination of
; the last row, which is why that row is processed last.
; ---------------------------------------------------------------------------
2979*c0909341SAndroid Build Coastguard Worker%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2
2980*c0909341SAndroid Build Coastguard Worker%if %3
2981*c0909341SAndroid Build Coastguard Worker%if %3 == 1
2982*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2896x8)]
2983*c0909341SAndroid Build Coastguard Worker%endif
2984*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m15, [%1+%2*0]
2985*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m15, [%1+%2*1]
2986*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m15, [%1+%2*2]
2987*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m15, [%1+%2*3]
2988*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m15, [%1+%2*4]
2989*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m15, [%1+%2*5]
2990*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m15, [%1+%2*6]
2991*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15,      [%1+%2*7]
2992*c0909341SAndroid Build Coastguard Worker%else
2993*c0909341SAndroid Build Coastguard Worker    mova                 m8, [%1+%2*0]
2994*c0909341SAndroid Build Coastguard Worker    mova                 m9, [%1+%2*1]
2995*c0909341SAndroid Build Coastguard Worker    mova                m10, [%1+%2*2]
2996*c0909341SAndroid Build Coastguard Worker    mova                m11, [%1+%2*3]
2997*c0909341SAndroid Build Coastguard Worker    mova                m12, [%1+%2*4]
2998*c0909341SAndroid Build Coastguard Worker    mova                m13, [%1+%2*5]
2999*c0909341SAndroid Build Coastguard Worker    mova                m14, [%1+%2*6]
3000*c0909341SAndroid Build Coastguard Worker    mova                m15, [%1+%2*7]
3001*c0909341SAndroid Build Coastguard Worker%endif
3002*c0909341SAndroid Build Coastguard Worker%endmacro
3003*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; ITX_UNPACK_MULHRSW dst1, dst2/src, tmp, coef1, coef2, coef3, coef4
; Multiplies every word of the source register by a pair of coefficients:
;   dst1     = pmulhrsw(words of src's low  halves, each duplicated,
;                       pw_<coef1>_<coef2>x8)
;   dst2/src = pmulhrsw(words of src's high halves, each duplicated,
;                       pw_<coef3>_<coef4>x8)
; ("halves" are per 128-bit lane, as with any punpck{l,h}wd.)
; The constants are addressed relative to r5, which must point at
; pw_201_4091x8 on entry. Clobbers the tmp register.
; NOTE(review): each pw_A_Bx8 constant is assumed to be a word pair, so the
; duplicated words are multiplied by A and B respectively -- confirm against
; the constant table.
; ---------------------------------------------------------------------------
3004*c0909341SAndroid Build Coastguard Worker%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
3005*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%4_%5x8]
3006*c0909341SAndroid Build Coastguard Worker    punpcklwd           m%1, m%2, m%2
3007*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m%3
3008*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m%3, [r5-pw_201_4091x8+pw_%6_%7x8]
3009*c0909341SAndroid Build Coastguard Worker    punpckhwd           m%2, m%2
3010*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%2, m%3
3011*c0909341SAndroid Build Coastguard Worker%endmacro
3012*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; inv_txfm_add_dct_dct_8x32_8bpc(dst, stride, c, eob)
; 8x32 inverse DCT in both directions, 8 bpc, AVX2.
;
; Control flow:
;   eob == 0   -> .dconly: scale the single coefficient at [cq], clear it and
;                 tail-jump into the 8x8 .dconly code with r3d = 32.
;   eob <= 106 -> .fast only: just the even-numbered 32-byte chunks of the
;                 coefficient buffer are transformed; the other half of the
;                 second-stage input is zeroed and .main_fast is used.
;   eob >  106 -> the odd-numbered chunks are transformed first, then .fast
;                 handles the even ones and .full merges both for .main.
;   .pass2     -> final pw_2048 rounding and output through WRITE_8X4, four
;                 rows at a time (8 groups).
; Both first-stage halves use m(idct_16x8_internal_8bpc).main followed by a
; lane regroup and a punpck wd/dq/qdq interleave network (which appears to be
; the usual 8x8 16-bit transpose pattern), and zero the coefficient chunks
; they consumed.
;
; The function is declared with 0 vector registers and no stack; PROLOGUE is
; re-issued after the eob test, presumably so that the DC-only path skips the
; register/stack setup -- confirm against x86inc.
; ---------------------------------------------------------------------------
3013*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
3014*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base] ; r6 = base for constant addressing
3015*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3016*c0909341SAndroid Build Coastguard Worker    jz .dconly
3017*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
; NOTE(review): drops a macro named cmp, presumably one left defined by
; PROLOGUE, so the plain instruction is assembled below -- confirm in x86inc.
3018*c0909341SAndroid Build Coastguard Worker    %undef cmp
3019*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 106
3020*c0909341SAndroid Build Coastguard Worker    jle .fast
3021*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+32*1, 32*2 ; odd-numbered 32-byte chunks: 1, 3, ..., 15
3022*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
3023*c0909341SAndroid Build Coastguard Worker    vperm2i128          m11, m0, m4, 0x31 ; {a.hi, b.hi}
3024*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm4, 1 ; {a.lo, b.lo}
3025*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m1, m5, 0x31
3026*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm5, 1
3027*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m2, m6, 0x31
3028*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm6, 1
3029*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m3, m7, 0x31
3030*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm7, 1
3031*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3032*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 ; zero the chunks just consumed
3033*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0, m1
3034*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
3035*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
3036*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
3037*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m11, m4
3038*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m4
3039*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5, m6
3040*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6
3041*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m0, m2
3042*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3043*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m5
3044*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m5
3045*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m11, m4
3046*c0909341SAndroid Build Coastguard Worker    punpckldq           m11, m4
3047*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m7, m1
3048*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m1
3049*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m12, m6, m0
3050*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m6     ; out4
3051*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m7, m4
3052*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m7     ; out5
3053*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m14, m3, m2
3054*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3     ; out6
3055*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m5, m11
3056*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m11, m5     ; out7
3057*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m0 ; out4-out6 wait on the stack, out7 stays in m11;
3058*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m4 ; m12-m15 stay live across .fast (scaled in .full)
3059*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m2
3060*c0909341SAndroid Build Coastguard Worker.fast:
3061*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+32*0, 32*2 ; even-numbered 32-byte chunks: 0, 2, ..., 14
3062*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
3063*c0909341SAndroid Build Coastguard Worker    vperm2i128           m8, m0, m4, 0x31
3064*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm4, 1
3065*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m1, m5, 0x31
3066*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm5, 1
3067*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m2, m6, 0x31
3068*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm6, 1
3069*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m3, m7, 0x31
3070*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm7, 1
3071*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_8192)] ; first-stage rounding factor
3072*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3073*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 ; zero the chunks just consumed
3074*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0, m1
3075*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
3076*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2, m3
3077*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
3078*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m8, m4
3079*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m4
3080*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5, m6
3081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6
3082*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m0, m2
3083*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3084*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m8, m5
3085*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m5
3086*c0909341SAndroid Build Coastguard Worker    punpckhdq            m5, m3, m4
3087*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4
3088*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m7, m1
3089*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m1
3090*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m7, m4
3091*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m4     ; out9
3092*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m2, m8 ; out10
3093*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m8
3094*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m8, m3, m5
3095*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m5
3096*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m0, m6 ; out8
3097*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m6
3098*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 ; m8 is scaled later, straight into m11
3099*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 106
3100*c0909341SAndroid Build Coastguard Worker    jg .full
3101*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m5 ; hand out8-out10 over through the three stack slots
3102*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
3103*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m4
3104*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m9, m8
3105*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3106*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7 ; m4-m7 = 0: the odd-chunk half was skipped
3107*c0909341SAndroid Build Coastguard Worker    call .main_fast
3108*c0909341SAndroid Build Coastguard Worker    jmp .pass2
3109*c0909341SAndroid Build Coastguard Worker.dconly:
3110*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
3111*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
3112*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
3113*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd ; eobd is known to be 0 here, so this clears the coefficient
3114*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32 ; r3 is the eob register (4th named arg), i.e. 0 -> 32; presumably the row count for the shared code
3115*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
3116*c0909341SAndroid Build Coastguard Worker.full:
3117*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m12, m13, m14, m15 ; round the odd-chunk results kept in registers
3118*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9, [rsp+32*2] ; each load/store pair below swaps a stack slot:
3119*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m4 ; the parked odd-chunk value comes out (rounded),
3120*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, [rsp+32*0] ; the matching even-chunk value goes in
3121*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m5
3122*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, [rsp+32*1]
3123*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
3124*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, m11
3125*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m9, m8
3126*c0909341SAndroid Build Coastguard Worker    call .main
3127*c0909341SAndroid Build Coastguard Worker.pass2:
3128*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2048)] ; final rounding factor
3129*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m0,  m1,  m2,  m3,  m4,  m5,  m6,  m7, \
3130*c0909341SAndroid Build Coastguard Worker                             m8,  m9,  m10, m11,      m13, m14, m15
3131*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, [rsp] ; the value belonging in m12 is in stack slot 0; scale it in place of the constant
3132*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 ; qword reorder, presumably the layout WRITE_8X4 expects
3133*c0909341SAndroid Build Coastguard Worker    REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15
3134*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m4 ; m4 and m6 are passed as the 3rd/4th WRITE_8X4 arguments
3135*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m6 ; (presumably scratch), so their data is read from the stack
3136*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3137*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0,  1,  4,  6
3138*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3139*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             2,  3,  4,  6
3140*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3141*c0909341SAndroid Build Coastguard Worker    WRITE_8X4    [rsp+32*0],  5,  4,  6
3142*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3143*c0909341SAndroid Build Coastguard Worker    WRITE_8X4    [rsp+32*1],  7,  4,  6
3144*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3145*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             8,  9,  4,  6
3146*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3147*c0909341SAndroid Build Coastguard Worker    WRITE_8X4            10, 11,  4,  6
3148*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3149*c0909341SAndroid Build Coastguard Worker    WRITE_8X4            12, 13,  4,  6
3150*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
3151*c0909341SAndroid Build Coastguard Worker    WRITE_8X4            14, 15,  4,  6
3152*c0909341SAndroid Build Coastguard Worker    RET
; ---------------------------------------------------------------------------
; .main_fast -- second-stage entry used when the bottom half of the input is
; zero (see the comment on the label line).
;   1. Runs m(idct_8x16_internal_8bpc).main on the values already in
;      registers.
;   2. Swaps the three 32-byte stack slots: their previous contents are
;      loaded into m8, m9 and m0, and m0, m1, m6 from step 1 are stored in
;      their place. The slots are addressed with an extra gprsize because the
;      call that got us here pushed a return address.
;   3. Points r5 at pw_201_4091x8 (derived from the o_base value in r6) and
;      expands m8, m9, m0 and m11 with ITX_UNPACK_MULHRSW into the t16a-t31a
;      terms named in the per-line comments.
;   4. Continues at .main2, which lies outside this chunk.
; ---------------------------------------------------------------------------
3153*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3154*c0909341SAndroid Build Coastguard Workercglobal_label .main_fast ; bottom half is zero
3155*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
3156*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+gprsize+0*32]
3157*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*32], m0
3158*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+gprsize+1*32]
3159*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+1*32], m1
3160*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+2*32]
3161*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+2*32], m6
3162*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r6-(o_base)+pw_201_4091x8] ; base register required by ITX_UNPACK_MULHRSW
3163*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW    1,  8,  6,  201, 4091,  m601, 4052 ; t16a, t31a, t23a, t24a
3164*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   15,  9,  6,  995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
3165*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   14,  0,  6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
3166*c0909341SAndroid Build Coastguard Worker    ITX_UNPACK_MULHRSW   13, 11,  6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
3167*c0909341SAndroid Build Coastguard Worker    jmp .main2
; .main -- full-input entry point for the odd half of the 32-point transform.
; It runs idct_8x16_internal_8bpc's .main, exchanges m0/m1/m6 with the three
; 32-byte stack slots at rsp+gprsize+N*32 (the slot contents come back in m8,
; m9 and m0), interleaves the odd inputs into the pairs named in the trailing
; comments, and applies the first-stage ITX_MUL2X_PACK rotations.
; .main2 is the tail shared with .main_fast: saturating add/sub butterflies
; alternating with ITX_MUL2X_PACK rotations, followed by the final combination
; with the even half (m2-m5, m7 and the three parked stack values).
; Register contents on return, as annotated by the line comments below:
;   m0  = out0  out1     m1  = out3  out2     m2  = out4  out5
;   m3  = out7  out6     m4  = out8  out9     m5  = out11 out10
;   m6  = out12 out13    m7  = out15 out14    m8  = out16 out17
;   m9  = out19 out18    m10 = out20 out21    m11 = out23 out22
;   m13 = out27 out26    m14 = out28 out29    m15 = out31 out30
;   [rsp+gprsize+0*32] = out24 out25          m12 = scratch
; NOTE(review): m10 is passed as the fourth argument of every ITX_MUL2X_PACK
; before it is overwritten with out20/out21; it is presumably a rounding
; constant left in place by the 8x16 routine -- confirm against that code.
3168*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3169*c0909341SAndroid Build Coastguard Workercglobal_label .main
3170*c0909341SAndroid Build Coastguard Worker    call m(idct_8x16_internal_8bpc).main
    ; Swap registers with the stack slots: fetch what the caller stored there
    ; and park m0/m1/m6 in its place until the end of the routine.
3171*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+gprsize+0*32]
3172*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*32], m0
3173*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+gprsize+1*32]
3174*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+1*32], m1
3175*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+2*32]
3176*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+2*32], m6
    ; Word-interleave the odd inputs into the (inA, inB) pairs listed on the
    ; right; each pair feeds one first-stage rotation below.
3177*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m15, m8  ; in31 in1
3178*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m15      ; in3  in29
3179*c0909341SAndroid Build Coastguard Worker    punpcklwd           m15, m14, m9  ; in27 in5
3180*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m14      ; in7  in25
3181*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m13, m0  ; in23 in9
3182*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m13      ; in11 in21
3183*c0909341SAndroid Build Coastguard Worker    punpcklwd           m13, m12, m11 ; in19 in13
3184*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m12      ; in15 in17
    ; First-stage rotations (m6 and m12 are used as temporaries).
3185*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        1,  6, 12, 10,  201, 4091, 3 ; t16a, t31a
3186*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8,  6, 12, 10, 4052,  601, 3 ; t23a, t24a
3187*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15,  6, 12, 10,  995, 3973, 3 ; t20a, t27a
3188*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        9,  6, 12, 10, 3857, 1380, 3 ; t19a, t28a
3189*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14,  6, 12, 10, 1751, 3703, 3 ; t18a, t29a
3190*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0,  6, 12, 10, 3513, 2106, 3 ; t21a, t26a
3191*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       13,  6, 12, 10, 2440, 3290, 3 ; t22a, t25a
3192*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       11,  6, 12, 10, 3035, 2751, 3 ; t17a, t30a
    ; Shared tail: .main_fast jumps here with the same sixteen terms laid out
    ; in m1, m8, m15, m9, m14, m0, m13 and m11.
3193*c0909341SAndroid Build Coastguard Worker.main2:
    ; Butterfly stage (saturating add/sub) on neighbouring term pairs.
3194*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m11  ; t17 t30
3195*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m11      ; t16 t31
3196*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m9, m14  ; t18 t29
3197*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m14      ; t19 t28
3198*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m15, m0  ; t21 t26
3199*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m0       ; t20 t27
3200*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m8, m13  ; t22 t25
3201*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m13      ; t23 t24
    ; Rotate the difference terms (m12 and m13 are temporaries here).
3202*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        6, 12, 13, 10,   799, 4017, 3 ; t17a t30a
3203*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       11, 12, 13, 10, m4017,  799, 3 ; t18a t29a
3204*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14, 12, 13, 10,  3406, 2276, 3 ; t21a t26a
3205*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
    ; Next butterfly stage.
3206*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m1, m9   ; t19a t28a
3207*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m9       ; t16a t31a
3208*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m8, m15  ; t20a t27a
3209*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m15      ; t23a t24a
3210*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m6, m11  ; t18  t29
3211*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m11      ; t17  t30
3212*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m0, m14  ; t21  t26
3213*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m14      ; t22  t25
    ; Rotate the four difference terms (m12 and m14 are temporaries here).
3214*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       15, 12, 14, 10,  1567, 3784, 3 ; t18a t29a
3215*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       13, 12, 14, 10,  1567, 3784, 3 ; t19  t28
3216*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        9, 12, 14, 10, m3784, 1567, 3 ; t20  t27
3217*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
    ; m12 = deint_shuf byte-shuffle mask, applied further down to the four
    ; sum terms (m1, m6, m15, m13).
3218*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m12, [o(deint_shuf)]
3219*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m1, m8   ; t23  t24
3220*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m8       ; t16  t31
3221*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m6, m0   ; t22a t25a
3222*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m0       ; t17a t30a
3223*c0909341SAndroid Build Coastguard Worker    psubsw               m0, m15, m11 ; t21  t26
3224*c0909341SAndroid Build Coastguard Worker    paddsw              m15, m11      ; t18  t29
3225*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m13, m9  ; t20a t27a
3226*c0909341SAndroid Build Coastguard Worker    paddsw              m13, m9       ; t19a t28a
3227*c0909341SAndroid Build Coastguard Worker    REPX    {pshufb x, m12}, m1, m6, m15, m13
    ; Final rotations of the four difference terms using the 2896 constants.
    ; m9 and m12 are reloaded with broadcast constants between the macro
    ; invocations because the macros take them as operands and temporaries.
3228*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       14,  9, 12, 10, 2896, 2896 ; t24a t23a
3229*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m2896_2896)]
3230*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        8, 12,  _, 10, 12,  9, 4  ; t22  t25
3231*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
3232*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK        0, 12,  _, 10, 12,  9, 4  ; t21a t26a
3233*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_2896_2896)]
3234*c0909341SAndroid Build Coastguard Worker    ITX_MUL2X_PACK       11,  9,  _, 10,  9, 12, 4  ; t27  t20
    ; Regroup the 64-bit halves so every register holds the two odd terms that
    ; pair with one even-half register in the final add/sub below.
3235*c0909341SAndroid Build Coastguard Worker    shufps               m9, m14, m8, q1032 ; t23a t22
3236*c0909341SAndroid Build Coastguard Worker    vpblendd            m14, m8, 0xcc       ; t24a t25
3237*c0909341SAndroid Build Coastguard Worker    shufps               m8, m11, m0, q1032 ; t20  t21a
3238*c0909341SAndroid Build Coastguard Worker    vpblendd            m11, m0, 0xcc       ; t27  t26a
3239*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m1, m6   ; t16  t17a
3240*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m6       ; t31  t30a
    ; Final stage: even half +/- odd half. The three even-half registers that
    ; were parked on the stack are reloaded as they are needed.
3241*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m5, m8   ; out20 out21
3242*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m8       ; out11 out10
3243*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m3, m14  ; out24 out25
3244*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m14      ; out7  out6
3245*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m7, m0   ; out16 out17
3246*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m0       ; out15 out14
3247*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+gprsize+0*32]
3248*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m12, m13, m15 ; t19a t18
3249*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m15      ; t28a t29
3250*c0909341SAndroid Build Coastguard Worker    psubsw              m15, m0, m1   ; out31 out30
3251*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1       ; out0  out1
3252*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+1*32]
    ; out24/out25 (m6) is returned through stack slot 0 because m6 itself is
    ; about to be reloaded with the value parked in slot 2.
3253*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0*32], m6
3254*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+2*32]
3255*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m1, m13  ; out28 out29
3256*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m13      ; out3  out2
3257*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m2, m11  ; out27 out26
3258*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m11      ; out4  out5
3259*c0909341SAndroid Build Coastguard Worker    psubsw              m11, m4, m9   ; out23 out22
3260*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m9       ; out8  out9
3261*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m6, m12  ; out19 out18
3262*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m12      ; out12 out13
3263*c0909341SAndroid Build Coastguard Worker    ret
3264*c0909341SAndroid Build Coastguard Worker
; LOAD_PACKED_16X2 dst, tmp, rowA, rowB
; Loads two 16-byte coefficient rows from cq and packs them into m<dst>.
;   %1 = destination register number
;   %2 = temporary register number (clobbered)
;   %3, %4 = row indices, in units of 16 bytes relative to cq
; Each row is broadcast to both 128-bit lanes; shufpd with immediate 0x0c then
; selects, per lane, the low qword of each source in the low lane and the high
; qword of each source in the high lane. Result layout of m<dst>:
;   low  lane = { rowA[63:0],   rowB[63:0]   }
;   high lane = { rowA[127:64], rowB[127:64] }
3265*c0909341SAndroid Build Coastguard Worker%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
3266*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m%1, [cq+16*%3]
3267*c0909341SAndroid Build Coastguard Worker    vbroadcasti128      m%2, [cq+16*%4]
3268*c0909341SAndroid Build Coastguard Worker    shufpd              m%1, m%2, 0x0c
3269*c0909341SAndroid Build Coastguard Worker%endmacro
3270*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x8_8bpc(dst, stride, c, eob)
; Inverse DCT/DCT for a 32-wide, 8-row block; the result is added to the
; pixels at dst and the consumed coefficients at c are zeroed.
; Arguments map to r0..r3 (dstq, strideq, cq, eobd).
; Paths:
;   eob == 0   -> DC-only: one scaled value is added to all 8 rows x 32 bytes.
;   eob <= 106 -> only the first 16 coefficient rows are loaded; the remaining
;                 inputs are zeroed and the 8x32 .main_fast routine is used.
;   eob >  106 -> all 32 coefficient rows are loaded (.full) and the 8x32
;                 .main routine is used.
; Both non-DC paths continue at .pass2, which scales and transposes the first
; pass output and runs the idct_16x8 core twice: once for the left 16 pixel
; columns and once for the right 16.
; Stack: 32*3 bytes, requested by the PROLOGUE at .normal and shared with the
; 8x32 routines as three 32-byte slots.
3271*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
    ; r6 = base pointer for o()-relative constant addressing.
3272*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
3273*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3274*c0909341SAndroid Build Coastguard Worker    jnz .normal
    ; --- DC-only path (eobd is zero from here on) ---
3275*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
3276*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
3277*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
    ; eobd == 0 here, so this clears the DC coefficient in memory.
3278*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
    ; r3d is eobd (zero), so this sets the row counter to 8.
3279*c0909341SAndroid Build Coastguard Worker    or                  r3d, 8
    ; .dconly expects: xm0 = DC term, xm1 = pw_2896x8, xm2 = first-pass
    ; scale, r3d = number of rows to write.
    ; NOTE(review): presumably also entered from other transform sizes, which
    ; would explain the rip-relative pw_2048 load below -- confirm.
3280*c0909341SAndroid Build Coastguard Worker.dconly:
3281*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
3282*c0909341SAndroid Build Coastguard Worker    movd                xm2, [pw_2048] ; intentionally rip-relative
3283*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
3284*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
    ; Replicate the final DC value into every word of m0; m3 = 0 is used to
    ; zero-extend pixels to words.
3285*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
3286*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
    ; Per row: widen 32 pixels to words, add the DC value, pack back to bytes
    ; with unsigned saturation, store, step to the next row.
3287*c0909341SAndroid Build Coastguard Worker.dconly_loop:
3288*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq]
3289*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m1, m3
3290*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3
3291*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
3292*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
3293*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2
3294*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m1
3295*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3296*c0909341SAndroid Build Coastguard Worker    dec                 r3d
3297*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
3298*c0909341SAndroid Build Coastguard Worker    RET
    ; --- General path ---
    ; Re-issue the prologue to request 16 vector registers and 32*3 bytes of
    ; stack, which the DC-only path above does not need.
3299*c0909341SAndroid Build Coastguard Worker.normal:
3300*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 4, 16, 32*3, dst, stride, c, eob
3301*c0909341SAndroid Build Coastguard Worker    %undef cmp
    ; Load coefficient rows 0-7 as packed pairs (m7 is the temporary), then
    ; clear them in memory.
3302*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      0,  7,  0,  2 ; in0  in2
3303*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      4,  7,  1,  3 ; in1  in3
3304*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      1,  7,  4,  6 ; in4  in6
3305*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      5,  7,  5,  7 ; in5  in7
3306*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3307*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
    ; Advance cq by 16 rows so rows 8-15 are addressed with negative indices
    ; and rows 16 and up (in .full) with non-negative ones.
3308*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*16
3309*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      2,  7, -8, -6 ; in8  in10
3310*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      6,  7, -7, -5 ; in9  in11
3311*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      3,  7, -4, -2 ; in12 in14
3312*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     11,  7, -3, -1 ; in13 in15
3313*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1
    ; Park the packed odd rows (in1/in3, in5/in7, in9/in11) in the three stack
    ; slots; the 8x32 routines read them back from there. This also frees
    ; m4-m6 for in16-in26.
3314*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m4
3315*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m5
3316*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m6
3317*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 106
3318*c0909341SAndroid Build Coastguard Worker    jg .full
    ; Fast path: rows 16-31 are not loaded; zero the registers that .full
    ; would fill with in16-in30 and use the reduced routine.
3319*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3320*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
3321*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
3322*c0909341SAndroid Build Coastguard Worker    jmp .pass2
    ; Full path: load rows 16-31 (cq already points at row 16) and clear them.
3323*c0909341SAndroid Build Coastguard Worker.full:
3324*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      4,  7,  0,  2 ; in16 in18
3325*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     12,  7,  3,  1 ; in19 in17
3326*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      5,  7,  4,  6 ; in20 in22
3327*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     13,  7,  7,  5 ; in23 in21
3328*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
3329*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*8
3330*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      6,  7,  0,  2 ; in24 in26
3331*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     14,  7,  3,  1 ; in27 in25
    ; m7 becomes a destination here, so m8 serves as the temporary and is
    ; re-zeroed afterwards for the final clear.
3332*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2      7,  8,  4,  6 ; in28 in30
3333*c0909341SAndroid Build Coastguard Worker    LOAD_PACKED_16X2     15,  8,  7,  5 ; in31 in29
3334*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3335*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8},  0,  1,  2,  3
3336*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_8x32_8bpc).main
    ; --- Second pass ---
    ; On entry: m0-m11 and m13-m15 hold first-pass outputs and rsp+32*0 holds
    ; the remaining one (out24/out25); m12 is free for the pw_8192 scale.
3337*c0909341SAndroid Build Coastguard Worker.pass2:
3338*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_8192)]
    ; Scale the upper-half outputs now; m0-m7 are scaled after the transpose.
3339*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
    ; Spill m9/m10 so they can be used as transpose temporaries.
3340*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m9
3341*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m10
    ; Transpose m0-m7 (word, then dword interleaves).
3342*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m0, m2
3343*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
3344*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m3
3345*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3
3346*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m4, m6
3347*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m6
3348*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m7
3349*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7
3350*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m9
3351*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m9
3352*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m2, m1
3353*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
3354*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m10, m4
3355*c0909341SAndroid Build Coastguard Worker    punpckhwd           m10, m4
3356*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6
3357*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6
3358*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
3359*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3360*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m9
3361*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m9
3362*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m4
3363*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m4
3364*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m10, m5
3365*c0909341SAndroid Build Coastguard Worker    punpckhdq           m10, m5
3366*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10
    ; Last use of the scale constant: apply it to the value left in stack
    ; slot 0 (result replaces m12), then reuse the slot to spill m8.
3367*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, [rsp+32*0]
3368*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m8
    ; Recombine 128-bit lanes into the m0-m7 layout passed to the 16x8 core.
3369*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m0, m6, 0x31
3370*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm6, 1
3371*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m1, m7, 0x31
3372*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm7, 1
3373*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m2, m9, 0x31
3374*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm9, 1
3375*c0909341SAndroid Build Coastguard Worker    vperm2i128           m7, m3, m10, 0x31
3376*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm10, 1
3377*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main
3378*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
3379*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    ; r2 (cq) is no longer needed; reuse it for stride*3.
3380*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
    ; Left 16 columns, rows 0-3 through dstq (r0).
3381*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3382*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
    ; r3 (eob, no longer needed) = address of row 4. dstq is re-aliased at
    ; assembly time so the following WRITE_16X2 expansions address through r3.
    ; NOTE(review): assumes WRITE_16X2 (defined elsewhere) addresses via dstq.
3383*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
3384*c0909341SAndroid Build Coastguard Worker    %define dstq r3
3385*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3386*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
    ; Second half: reload the three spilled registers (m8, m9, m10 as scaled
    ; above) and transpose them together with m11-m15.
3387*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+32*0]
3388*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
3389*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+32*2]
3390*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0, m2
3391*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2
3392*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1, m11
3393*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m11
3394*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m12, m14
3395*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m14
3396*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m13, m15
3397*c0909341SAndroid Build Coastguard Worker    punpcklwd           m13, m15
3398*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m0, m7
3399*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7
3400*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m2, m1
3401*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1
3402*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m12, m4
3403*c0909341SAndroid Build Coastguard Worker    punpckhwd           m12, m4
3404*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m13
3405*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m13
3406*c0909341SAndroid Build Coastguard Worker    punpckhdq            m1, m0, m2
3407*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3408*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3, m9
3409*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m9
3410*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m7, m4
3411*c0909341SAndroid Build Coastguard Worker    punpckhdq            m7, m4
3412*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m12, m5
3413*c0909341SAndroid Build Coastguard Worker    punpckhdq           m12, m5
3414*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m0, m6, 0x31
3415*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, xm6, 1
3416*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m1, m7, 0x31
3417*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, xm7, 1
3418*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m2, m9, 0x31
3419*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, xm9, 1
3420*c0909341SAndroid Build Coastguard Worker    vperm2i128           m7, m3, m12, 0x31
3421*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, xm12, 1
3422*c0909341SAndroid Build Coastguard Worker    call m(idct_16x8_internal_8bpc).main2
3423*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2048)]
3424*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Step both row pointers 16 pixels to the right and write the right half:
    ; rows 0-3 through r0, rows 4-7 through r3.
3425*c0909341SAndroid Build Coastguard Worker    add                  r0, 16
3426*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
3427*c0909341SAndroid Build Coastguard Worker    %define dstq r0
3428*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3429*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r2
3430*c0909341SAndroid Build Coastguard Worker    %define dstq r3
3431*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3432*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r2
3433*c0909341SAndroid Build Coastguard Worker    RET
3434*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_8x32_8bpc(dst, stride, c, eob)
; Identity/identity transform for an 8-wide, 32-row block: coefficients are
; rounded with (x + 5) >> 3 (saturating add, arithmetic shift), transposed and
; added to dst; the coefficients that were read are zeroed.
; Each loop iteration reads eight 32-byte chunks spaced 64 bytes apart and
; writes 16 destination rows. cq advances by a net 32 bytes per iteration, so
; a second iteration picks up the chunks skipped by the first.
; The loop runs once when eob < 107 and twice otherwise.
; Registers: r4 = stride*3, m9 = pw_5 broadcast, m8 = zero / transpose scratch.
3435*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob
3436*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pw_5]
3437*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
    ; After this subtraction, bit 31 of eobd is set exactly when eob < 107;
    ; the loop-control add at the bottom relies on that.
3438*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
3439*c0909341SAndroid Build Coastguard Worker.loop:
    ; Gather the eight 32-byte chunks, zeroing each one after it is read.
3440*c0909341SAndroid Build Coastguard Worker    mova                xm0,[cq+16* 0]
3441*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+16* 4]
3442*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16* 1], 1
3443*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+16* 5], 1
3444*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3445*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m8
3446*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m8
    ; Re-centre cq so the remaining chunks are addressed as cq-128 .. cq+192.
3447*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*16
3448*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq-16* 8]
3449*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq-16* 4]
3450*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq-16* 7], 1
3451*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq-16* 3], 1
3452*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+16* 0]
3453*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+16* 4]
3454*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16* 1], 1
3455*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+16* 5], 1
3456*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+16* 8]
3457*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+16*12]
3458*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+16* 9], 1
3459*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+16*13], 1
3460*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
    ; Rounding: add 5, transpose, then shift right by 3.
3461*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
3462*c0909341SAndroid Build Coastguard Worker    call .transpose8x8
3463*c0909341SAndroid Build Coastguard Worker    REPX  {psraw     x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
    ; Each WRITE_8X4 is given row offsets of stride*8, stride*4 and stride*12
    ; (r4*4); dstq steps one row between invocations.
3464*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             0,  4,  8, 10, strideq*8, strideq*4, r4*4
3465*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3466*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             1,  5,  0,  4, strideq*8, strideq*4, r4*4
3467*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3468*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             2,  6,  0,  4, strideq*8, strideq*4, r4*4
3469*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3470*c0909341SAndroid Build Coastguard Worker    WRITE_8X4             3,  7,  0,  4, strideq*8, strideq*4, r4*4
3471*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
    ; Net cq advance per iteration is 32 bytes (256 added above, 224 removed).
3472*c0909341SAndroid Build Coastguard Worker    sub                  cq, 16*16-32
    ; dstq has moved 4 rows; add 12 more so the next iteration starts 16 rows
    ; below this one.
3473*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+r4*4]
    ; Loop control: adding 0x80000000 carries out (loop ends) when bit 31 of
    ; eobd is set. Otherwise it sets bit 31, so the next pass is the last one.
3474*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
3475*c0909341SAndroid Build Coastguard Worker    jnc .loop
3476*c0909341SAndroid Build Coastguard Worker    RET
; .transpose8x8 -- transposes the 8x8 block of 16-bit words held in m0-m7,
; independently within each 128-bit lane (all punpck* forms used here operate
; per lane). Input and output are both m0-m7; m8 is clobbered as scratch.
; Also called from inv_txfm_add_identity_identity_32x8_8bpc.
3477*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3478*c0909341SAndroid Build Coastguard Worker.transpose8x8:
    ; Stage 1: interleave words of neighbouring rows.
3479*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m4, m5
3480*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
3481*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m1
3482*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
3483*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6, m7
3484*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
3485*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2, m3
3486*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
    ; Stage 2: interleave dwords.
3487*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
3488*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3489*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6
3490*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6
3491*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m7
3492*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7
3493*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m8, m1
3494*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m1
    ; Stage 3: interleave qwords, leaving the transposed rows in m0-m7.
3495*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
3496*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
3497*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
3498*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
3499*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
3500*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
3501*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m8
3502*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m8
3503*c0909341SAndroid Build Coastguard Worker    ret
3504*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x8_8bpc(dst, stride, c, eob)
; Identity/identity transform for a 32-wide, 8-row block. Each loop iteration
; loads 16 coefficient rows of 16 bytes (rows -8..-1 relative to cq into the
; low lanes, rows 0..7 into the high lanes), zeroes them, transposes, scales
; with pmulhrsw by 4096 (a rounded divide by 8) and writes a 16-pixel-wide
; strip of 8 rows. The pointers then step 16 pixels right and 256 bytes
; forward in the coefficient buffer.
; The loop runs once when eob < 107 and twice otherwise.
; Registers: r4 = stride*3, r5 = dst + stride*4 (rows 4-7), m9 = pw_4096,
; m8 = zero / transpose scratch.
3505*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob
    ; Bias cq by 8 rows so all 16 rows of an iteration are reachable with
    ; offsets in the range -8..+7.
3506*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*8
3507*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pw_4096]
3508*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
3509*c0909341SAndroid Build Coastguard Worker    lea                  r5, [dstq+strideq*4]
    ; Bit 31 of eobd is now set exactly when eob < 107 (see loop control).
3510*c0909341SAndroid Build Coastguard Worker    sub                eobd, 107
3511*c0909341SAndroid Build Coastguard Worker.loop:
3512*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq-16*8]
3513*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq-16*7]
3514*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+16*0], 1
3515*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+16*1], 1
3516*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq-16*6]
3517*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq-16*5]
3518*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+16*2], 1
3519*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+16*3], 1
3520*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq-16*4]
3521*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq-16*3]
3522*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+16*4], 1
3523*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+16*5], 1
3524*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq-16*2]
3525*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq-16*1]
3526*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+16*6], 1
3527*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+16*7], 1
    ; Clear the 256 bytes of coefficients just loaded.
3528*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3529*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
3530*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
3531*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Rows 0-3 are written through dstq (r0).
3532*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
3533*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
    ; Rows 4-7 are written through r5 by re-aliasing dstq at assembly time.
    ; NOTE(review): dstq is not restored to r0 in this function. That is
    ; harmless for the loop, because the %define is resolved textually and the
    ; two WRITE_16X2 lines above are expanded before it. The alias does persist
    ; until dstq is next redefined, however.
3534*c0909341SAndroid Build Coastguard Worker    %define dstq r5
3535*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
3536*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
    ; Next strip: 16 coefficient rows further on, 16 pixels to the right.
3537*c0909341SAndroid Build Coastguard Worker    add                  cq, 16*16
3538*c0909341SAndroid Build Coastguard Worker    add                  r0, 16
3539*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
    ; Loop control: adding 0x80000000 carries out (loop ends) when bit 31 of
    ; eobd is set. Otherwise it sets bit 31, so the next pass is the last one.
3540*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
3541*c0909341SAndroid Build Coastguard Worker    jnc .loop
3542*c0909341SAndroid Build Coastguard Worker    RET
3543*c0909341SAndroid Build Coastguard Worker
; Anchor symbol for the code that follows: the 16x32 entry point below loads
; its address into r6 and then reaches constants through o(...).
; NOTE(review): o() itself is defined outside this chunk; presumably it
; rewrites a constant label into an r6-relative displacement - confirm.
3544*c0909341SAndroid Build Coastguard Worker%define o_base pw_5 + 128
3545*c0909341SAndroid Build Coastguard Worker
; LOAD_16ROWS src, stride [, is_rect2 = 0 [, zero_coefs = 1]]
;
; Loads sixteen 32-byte rows from [src + stride*0 .. src + stride*15] into
; m0..m15.
;   is_rect2   - when non-zero every row is multiplied (pmulhrsw) by the
;                broadcast constant pw_2896x8 while it is loaded. m15 holds the
;                constant during the loads, so row 15 is produced last by
;                multiplying m15 in place.
;                NOTE(review): the name suggests words of 2896*8, i.e. a
;                1/sqrt(2) scale under pmulhrsw - the constant is defined
;                outside this chunk.
;   zero_coefs - when non-zero all sixteen source rows are overwritten with
;                zero after they have been read.
; On exit: m0..m14 hold rows 0..14 and row 15 has been stored to [rsp].
;          m15 is zero when zero_coefs is set, otherwise it still holds row 15.
3546*c0909341SAndroid Build Coastguard Worker%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs
3547*c0909341SAndroid Build Coastguard Worker%if %3
3548*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2896x8)]
3549*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15, [%1+%2* 0]
3550*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [%1+%2* 1]
3551*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m15, [%1+%2* 2]
3552*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m15, [%1+%2* 3]
3553*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m15, [%1+%2* 4]
3554*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m15, [%1+%2* 5]
3555*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m15, [%1+%2* 6]
3556*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m15, [%1+%2* 7]
3557*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m15, [%1+%2* 8]
3558*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m15, [%1+%2* 9]
3559*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m15, [%1+%2*10]
3560*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m15, [%1+%2*11]
3561*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m15, [%1+%2*12]
3562*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m15, [%1+%2*13]
3563*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m15, [%1+%2*14]
3564*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15,      [%1+%2*15]
3565*c0909341SAndroid Build Coastguard Worker%else
3566*c0909341SAndroid Build Coastguard Worker    mova                 m0, [%1+%2* 0]
3567*c0909341SAndroid Build Coastguard Worker    mova                 m1, [%1+%2* 1]
3568*c0909341SAndroid Build Coastguard Worker    mova                 m2, [%1+%2* 2]
3569*c0909341SAndroid Build Coastguard Worker    mova                 m3, [%1+%2* 3]
3570*c0909341SAndroid Build Coastguard Worker    mova                 m4, [%1+%2* 4]
3571*c0909341SAndroid Build Coastguard Worker    mova                 m5, [%1+%2* 5]
3572*c0909341SAndroid Build Coastguard Worker    mova                 m6, [%1+%2* 6]
3573*c0909341SAndroid Build Coastguard Worker    mova                 m7, [%1+%2* 7]
3574*c0909341SAndroid Build Coastguard Worker    mova                 m8, [%1+%2* 8]
3575*c0909341SAndroid Build Coastguard Worker    mova                 m9, [%1+%2* 9]
3576*c0909341SAndroid Build Coastguard Worker    mova                m10, [%1+%2*10]
3577*c0909341SAndroid Build Coastguard Worker    mova                m11, [%1+%2*11]
3578*c0909341SAndroid Build Coastguard Worker    mova                m12, [%1+%2*12]
3579*c0909341SAndroid Build Coastguard Worker    mova                m13, [%1+%2*13]
3580*c0909341SAndroid Build Coastguard Worker    mova                m14, [%1+%2*14]
3581*c0909341SAndroid Build Coastguard Worker    mova                m15, [%1+%2*15]
3582*c0909341SAndroid Build Coastguard Worker%endif
3583*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
3584*c0909341SAndroid Build Coastguard Worker%if %4
3585*c0909341SAndroid Build Coastguard Worker    pxor                m15, m15
3586*c0909341SAndroid Build Coastguard Worker    REPX {mova [%1+%2*x], m15}, 0,  1,  2,  3,  4,  5,  6,  7, \
3587*c0909341SAndroid Build Coastguard Worker                                8,  9, 10, 11, 12, 13, 14, 15
3588*c0909341SAndroid Build Coastguard Worker%endif
3589*c0909341SAndroid Build Coastguard Worker%endmacro
3590*c0909341SAndroid Build Coastguard Worker
; IDCT32_PASS2_END coef, mem, tmp1, tmp2, rnd, off1, off2
;
; Final butterfly of the second pass for one mirrored pair of output rows,
; added to the destination pixels.
;   coef (param 1) - register number holding one butterfly input
;   mem  (param 2) - address of the other butterfly input (32-byte load)
;   tmp1 (param 3) - register number, receives the sum and the packed result
;   tmp2 (param 4) - register number, scratch
;   rnd  (param 5) - register number holding the pmulhrsw rounding multiplier
;   off1 (param 6) - offset from dstq of the row that receives coef + mem
;   off2 (param 7) - offset from r2   of the row that receives coef - mem
;
; Both results are computed with saturating word arithmetic, scaled by rnd,
; added to 16 zero-extended pixels of their row and packed back to bytes with
; unsigned saturation. packuswb works per 128-bit lane and so interleaves the
; two rows; the q3120 permute regroups them, leaving the sum row in the low
; 128 bits and the difference row in the high 128 bits before the two 16-byte
; stores.
; Clobbers m(coef), m(tmp1) and m(tmp2); uses dstq and r2 from the caller.
3591*c0909341SAndroid Build Coastguard Worker%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2]
3592*c0909341SAndroid Build Coastguard Worker    mova                m%4, [%2]
3593*c0909341SAndroid Build Coastguard Worker    paddsw              m%3, m%1, m%4
3594*c0909341SAndroid Build Coastguard Worker    psubsw              m%1, m%4
3595*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, [dstq+%6]
3596*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%3, m%5
3597*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m%5
3598*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%4
3599*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, [r2+%7]
3600*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m%4
3601*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%1
3602*c0909341SAndroid Build Coastguard Worker    vpermq              m%3, m%3, q3120
3603*c0909341SAndroid Build Coastguard Worker    mova          [dstq+%6], xm%3
3604*c0909341SAndroid Build Coastguard Worker    vextracti128    [r2+%7], m%3, 1
3605*c0909341SAndroid Build Coastguard Worker%endmacro
3606*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_16x32_8bpc(dst, stride, c, eob)
;
; Entry point for the 16x32 DCT/DCT inverse transform that adds its result to
; 8-bit pixels at dst. The named arguments are dst, stride, c (coefficient
; buffer) and eob.
;   eob == 0   -> .dconly: only the first coefficient is used; the work is
;                 finished by the 16x4 function's .dconly tail.
;   eob <= 150 -> only the first group of 16 coefficient rows is transformed
;                 in pass 1 and .main_oddhalf_fast is used.
;   eob >  150 -> .full: a second group of 16 rows (at c+32) is transformed too
;                 and the general .main_oddhalf is used.
; Pass 1 consumes and zeroes the coefficients (LOAD_16ROWS default). Pass 2
; ends in .pass2_end, which writes the pixels.
3607*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob
    ; r6 = base address used by o(...) constant references.
3608*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
3609*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3610*c0909341SAndroid Build Coastguard Worker    jz .dconly
    ; Full set-up is deferred until after the DC-only check: 8 GPRs, 16 vector
    ; registers and 32*35 bytes of stack, plus the names tmp1/tmp2/base/tmp3.
3611*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \
3612*c0909341SAndroid Build Coastguard Worker                                           base, tmp3
    ; NOTE(review): presumably removes a `cmp` macro installed by the PROLOGUE
    ; naming above so the plain instruction can be used below - confirm
    ; against x86inc.
3613*c0909341SAndroid Build Coastguard Worker    %undef cmp
    ; Pass 1, first group: 16 rows at a 64-byte stride, rect2-scaled, source
    ; zeroed. Row 15 is passed to .main through [rsp].
3614*c0909341SAndroid Build Coastguard Worker    LOAD_16ROWS          cq, 64, 1
3615*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; Scratch layout: tmp1 = rsp+32*7, tmp2 = tmp1+32*8, tmp3 = tmp1+32*16.
3616*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
3617*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
3618*c0909341SAndroid Build Coastguard Worker    lea               tmp3q, [tmp1q+32*16]
    ; m1 is reloaded from stack slot 1 (presumably where .main leaves it), then
    ; m6/m7 are parked in slots 0/1 so that m7 can carry the rounding
    ; multiplier into the transpose helper.
3619*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
3620*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
3621*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
3622*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384)]
3623*c0909341SAndroid Build Coastguard Worker    call .transpose_2x8x8_round
    ; The helper returns its last output in stack slot 0 rather than in m15.
3624*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+32*0]
    ; Park the even-numbered registers in the tmp3 buffer, split into 128-bit
    ; halves: the low halves of m(2k) and m(2k+8) form the 32-byte row
    ; tmp3-32*4+32*k, the high halves form the row tmp3+32*k (k = 0..3).
3625*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*4+ 0], xm0
3626*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*0+ 0], m0, 1
3627*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*3+ 0], xm2
3628*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*1+ 0], m2, 1
3629*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*2+ 0], xm4
3630*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*2+ 0], m4, 1
3631*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*1+ 0], xm6
3632*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*3+ 0], m6, 1
3633*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*4+16], xm8
3634*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*0+16], m8, 1
3635*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*3+16], xm10
3636*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*1+16], m10, 1
3637*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*2+16], xm12
3638*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*2+16], m12, 1
3639*c0909341SAndroid Build Coastguard Worker    mova         [tmp3q-32*1+16], xm14
3640*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp3q+32*3+16], m14, 1
    ; Signed compare: eob > 150 takes the path that also transforms the
    ; second group of coefficient rows.
3641*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 150
3642*c0909341SAndroid Build Coastguard Worker    jg .full
    ; Fast path: regroup the odd-numbered registers by 128-bit halves
    ; (low halves paired in m0..m3, high halves in m4..m7) as inputs to the
    ; odd half, whose remaining inputs are zero.
3643*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, m1, xm9, 1
3644*c0909341SAndroid Build Coastguard Worker    vperm2i128           m4, m1, m9, 0x31
3645*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, m5, xm13, 1
3646*c0909341SAndroid Build Coastguard Worker    vperm2i128           m6, m5, m13, 0x31
3647*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, m3, xm11, 1
3648*c0909341SAndroid Build Coastguard Worker    vperm2i128           m5, m3, m11, 0x31
3649*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, m7, xm15, 1
3650*c0909341SAndroid Build Coastguard Worker    vperm2i128           m7, m7, m15, 0x31
3651*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf_fast
    ; No second group was loaded, so m8..m15 enter the even-half idct16 as
    ; zero.
3652*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
3653*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
3654*c0909341SAndroid Build Coastguard Worker    jmp .idct16
3655*c0909341SAndroid Build Coastguard Worker.dconly:
    ; Reached with eobd == 0 and before PROLOGUE has run. The first
    ; coefficient is multiplied by pw_2896x8 twice, and the store of eobd
    ; (zero here) clears that coefficient in the buffer.
    ; NOTE(review): xm2 = pw_16384 and r3d = 32 (r3 is the eob argument, zero
    ; on this path) are presumably the rounding multiplier and row count
    ; expected by the shared 16x4 .dconly code - confirm there.
3656*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
3657*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
3658*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
3659*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
3660*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
3661*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
3662*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
3663*c0909341SAndroid Build Coastguard Worker.full:
    ; Spill the odd-numbered registers of the first group to tmp1-32*4 ..
    ; tmp1+32*3; all sixteen registers are needed for the second group.
3664*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m1
3665*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m3
3666*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m5
3667*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m7
3668*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m9
3669*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m11
3670*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m13
3671*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
    ; Pass 1, second group: the rows interleaved with the first group
    ; (base c+32, same 64-byte stride).
3672*c0909341SAndroid Build Coastguard Worker    LOAD_16ROWS       cq+32, 64, 1
3673*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; The coefficient pointer is no longer needed; r2 (the third argument's
    ; register under the cglobal naming) is reused as a pointer to the 8 rows
    ; following the tmp3 buffer.
3674*c0909341SAndroid Build Coastguard Worker    lea                  r2, [tmp3q+32*8]
3675*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
3676*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
3677*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
3678*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384)]
3679*c0909341SAndroid Build Coastguard Worker    call .transpose_2x8x8_round
3680*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+32*0]
    ; Same half-split parking of the even-numbered registers as above, into
    ; the r2 buffer.
3681*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*4+ 0], xm0
3682*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*0+ 0], m0, 1
3683*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*3+ 0], xm2
3684*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*1+ 0], m2, 1
3685*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*2+ 0], xm4
3686*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*2+ 0], m4, 1
3687*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*1+ 0], xm6
3688*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*3+ 0], m6, 1
3689*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*4+16], xm8
3690*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*0+16], m8, 1
3691*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*3+16], xm10
3692*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*1+16], m10, 1
3693*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*2+16], xm12
3694*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*2+16], m12, 1
3695*c0909341SAndroid Build Coastguard Worker    mova         [r2-32*1+16], xm14
3696*c0909341SAndroid Build Coastguard Worker    vextracti128 [r2+32*3+16], m14, 1
    ; Build the 16 odd-half inputs. m8..m15 come from the second group's odd
    ; registers; m0..m7 are rebuilt with the same half-pairing from the first
    ; group's spilled rows. The two are interleaved so that each source
    ; register is read before it is overwritten.
3697*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, m1, xm9, 1
3698*c0909341SAndroid Build Coastguard Worker    vperm2i128          m12, m1, m9, 0x31
3699*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tmp1q-32*4]
3700*c0909341SAndroid Build Coastguard Worker    mova                xm1, [tmp1q-32*3]
3701*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tmp1q+32*0], 1
3702*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tmp1q+32*1], 1
3703*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, m5, xm13, 1
3704*c0909341SAndroid Build Coastguard Worker    vperm2i128          m14, m5, m13, 0x31
3705*c0909341SAndroid Build Coastguard Worker    mova                xm4, [tmp1q-32*4+16]
3706*c0909341SAndroid Build Coastguard Worker    mova                xm5, [tmp1q-32*3+16]
3707*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [tmp1q+32*0+16], 1
3708*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [tmp1q+32*1+16], 1
3709*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, m3, xm11, 1
3710*c0909341SAndroid Build Coastguard Worker    vperm2i128          m13, m3, m11, 0x31
3711*c0909341SAndroid Build Coastguard Worker    mova                xm2, [tmp1q-32*2]
3712*c0909341SAndroid Build Coastguard Worker    mova                xm3, [tmp1q-32*1]
3713*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tmp1q+32*2], 1
3714*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tmp1q+32*3], 1
3715*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, m7, xm15, 1
3716*c0909341SAndroid Build Coastguard Worker    vperm2i128          m15, m7, m15, 0x31
3717*c0909341SAndroid Build Coastguard Worker    mova                xm6, [tmp1q-32*2+16]
3718*c0909341SAndroid Build Coastguard Worker    mova                xm7, [tmp1q-32*1+16]
3719*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [tmp1q+32*2+16], 1
3720*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [tmp1q+32*3+16], 1
3721*c0909341SAndroid Build Coastguard Worker    call .main_oddhalf
    ; NOTE(review): LOAD_8ROWS_H / LOAD_8ROWS are defined outside this chunk;
    ; presumably they fill m8..m15 and m0..m7 respectively - confirm.
3722*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H    r2-32*4, 32
3723*c0909341SAndroid Build Coastguard Worker.idct16:
    ; Pass 2, even half: idct16 over the parked even rows, with m15 handed
    ; over through [rsp] as in LOAD_16ROWS.
3724*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp3q-32*4, 32
3725*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
3726*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; r2 = dst + 19*stride and r3 = 3*stride. .pass2_end addresses mirrored
    ; row pairs as [dstq+off] / [r2+off], e.g. r2+r3*4 is row 31 while dstq is
    ; row 0.
3727*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq, 19
3728*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
3729*c0909341SAndroid Build Coastguard Worker    add                  r2, dstq
3730*c0909341SAndroid Build Coastguard Worker    call .pass2_end
3731*c0909341SAndroid Build Coastguard Worker    RET
; .main_oddhalf_fast - first stage of the odd half when only m0..m7 carry
; inputs (the original comment on the label: "lower half is zero").
;
; In:  m0..m7 = inputs, tmp1q/tmp2q = output buffers (used by .main2).
; Stack slots (relative to the caller's rsp; the extra gprsize skips the
; return address pushed by call): slot 1 = m7, slots 0 and 2 = zero. .main2
; reloads these three slots as in31 / in15 / in17.
; Each first-stage output pair is produced by two plain pmulhrsw multiplies
; with pre-scaled "x8" constants, where .main_oddhalf uses ITX_MULSUB_2W on
; two registers. m15 is set to pd_2048 before joining .main2, matching what
; .main_oddhalf sets up.
3732*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3733*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf_fast ; lower half is zero
3734*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m7
3735*c0909341SAndroid Build Coastguard Worker    pxor                 m7, m7
3736*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m7
3737*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m7
3738*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_3703x8)]
3739*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_1751x8)]
3740*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m1380x8)]
3741*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_3857x8)]
3742*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_3973x8)]
3743*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_995x8)]
3744*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m4  ; t29a
3745*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7  ; t18a
3746*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m3  ; t19a
3747*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m8  ; t28a
3748*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m2  ; t27a
3749*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m15 ; t20a
3750*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_m2106x8)]
3751*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_3513x8)]
3752*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_3290x8)]
3753*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_2440x8)]
3754*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_m601x8)]
3755*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_4052x8)]
3756*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m5  ; t21a
3757*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7  ; t26a
3758*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m6  ; t25a
3759*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m8  ; t22a
3760*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m1  ; t23a
3761*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15 ; t24a
3762*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
3763*c0909341SAndroid Build Coastguard Worker    jmp .main2
; .main_oddhalf - odd half of the 32-point inverse DCT (outputs t16..t31).
;
; In:  m0..m15 = the sixteen inputs, tmp1q/tmp2q = output buffers.
;      m15, m7 and m8 are spilled to stack slots 0, 1 and 2 on entry so they
;      can serve as rounding constant (m15 = pd_2048) and ITX_MULSUB_2W
;      temporaries. They are reloaded later as in31, in15 and in17. The
;      extra gprsize in the slot addresses skips the return address pushed by
;      call.
; Out: sixteen 32-byte rows holding the final t16..t31 in ascending order:
;      tmp1q-32*4 .. tmp1q+32*3 = t16..t23, tmp2q-32*4 .. tmp2q+32*3 = t24..t31
;      (per the trailing comments on the producing instructions).
;      Stack slots 0..2 are reused for intermediates and left clobbered.
; .main2 is the join point for .main_oddhalf_fast, which produces the same
; first-stage values by a cheaper route.
; Add/sub butterflies saturate (paddsw/psubsw).
3764*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3765*c0909341SAndroid Build Coastguard Workercglobal_label .main_oddhalf
3766*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m15
3767*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m7
3768*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m8
3769*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
3770*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4, 11,  7,  8, 15, 1751, 3703 ; t18a, t29a
3771*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        12,  3,  7,  8, 15, 3857, 1380 ; t19a, t28a
3772*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2, 13,  7,  8, 15,  995, 3973 ; t20a, t27a
3773*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        10,  5,  7,  8, 15, 3513, 2106 ; t21a, t26a
3774*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6,  9,  7,  8, 15, 2440, 3290 ; t22a, t25a
3775*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14,  1,  7,  8, 15, 4052,  601 ; t23a, t24a
3776*c0909341SAndroid Build Coastguard Worker.main2:
3777*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m12, m4  ; t18
3778*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m4       ; t19
3779*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m2, m10  ; t21
3780*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m10      ; t20
3781*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m14, m6  ; t22
3782*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m6       ; t23
3783*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m1, m9   ; t25
3784*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m9       ; t24
3785*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m13, m5  ; t26
3786*c0909341SAndroid Build Coastguard Worker    paddsw              m13, m5       ; t27
3787*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m3, m11  ; t29
3788*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m11      ; t28
3789*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5,  7,  8, 11, 15, m4017,  799 ; t18a, t29a
3790*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         9,  4,  8, 11, 15,  3406, 2276 ; t21a, t26a
3791*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6, 10,  8, 11, 15, m2276, 3406 ; t22a, t25a
3792*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m14, m2  ; t20a
3793*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m2       ; t23a
3794*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m1, m13  ; t27a
3795*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m13      ; t24a
3796*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m6, m9   ; t21
3797*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m9       ; t22
3798*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m10, m4  ; t26
3799*c0909341SAndroid Build Coastguard Worker    paddsw              m10, m4       ; t25
3800*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2,  8,  4, 11, 15, m3784, 1567 ; t20,  t27
3801*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         9, 13,  4, 11, 15, m3784, 1567 ; t21a, t26a
    ; Swap through the three stack slots: pull in the inputs spilled on entry
    ; and park t22 / t23a / t24a in their place, since every register is live.
3802*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+32*0] ; in31
3803*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m6  ; t22
3804*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+32*1] ; in15
3805*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m14 ; t23a
3806*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+gprsize+32*2] ; in17
3807*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m1  ; t24a
3808*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         0,  4,  1, 11, 15,  201, 4091 ; t16a, t31a
3809*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14,  6,  1, 11, 15, 3035, 2751 ; t17a, t30a
3810*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m0, m14  ; t17
3811*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m14      ; t16
3812*c0909341SAndroid Build Coastguard Worker    psubsw              m14, m4, m6   ; t30
3813*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m6       ; t31
3814*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        14,  1,  6, 11, 15,  799, 4017 ; t17a, t30a
3815*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m0, m12  ; t19a
3816*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m12      ; t16a
3817*c0909341SAndroid Build Coastguard Worker    psubsw              m12, m4, m3   ; t28a
3818*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m3       ; t31a
3819*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m14, m5  ; t18
3820*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m5       ; t17
3821*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m1, m7   ; t29
3822*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m7       ; t30
3823*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5,  3,  7, 11, 15, 1567, 3784 ; t18a, t29a
3824*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        12,  6,  7, 11, 15, 1567, 3784 ; t19,  t28
3825*c0909341SAndroid Build Coastguard Worker    psubsw               m7, m1, m10  ; t25a
3826*c0909341SAndroid Build Coastguard Worker    paddsw               m1, m10      ; t30a
3827*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m5, m9   ; t21
3828*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m9       ; t18
3829*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m12, m2  ; t20a
3830*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m2       ; t19a
3831*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m3, m13  ; t26
3832*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m13      ; t29
3833*c0909341SAndroid Build Coastguard Worker    psubsw              m13, m6, m8   ; t27a
3834*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m8       ; t28a
    ; t18, t19a, t28a, t29 and t30a are final: store them, which also frees
    ; registers for reloading the parked t22 / t23a / t24a.
3835*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m5
3836*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m12
3837*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m6
3838*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m3
3839*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m1
3840*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+32*0] ; t22
3841*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+gprsize+32*1] ; t23
3842*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+32*2] ; t24a
3843*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m14, m5  ; t22a
3844*c0909341SAndroid Build Coastguard Worker    paddsw              m14, m5       ; t17a
3845*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m0, m6   ; t23
3846*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m6       ; t16
3847*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m4, m3   ; t24
3848*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m3       ; t31
    ; Last stage: the four remaining pairs are rotated with the register-held
    ; coefficient pairs in m3 / m8 (pw_2896_2896 / pw_m2896_2896).
3849*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(pw_m2896_2896)]
3850*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(pw_2896_2896)]
3851*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
3852*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m14
3853*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m4
3854*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W        13,  9,  0,  4, 15,  3,  8 ; t20,  t27
3855*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2, 10,  0,  4, 15,  3,  8 ; t21a, t26a
3856*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         7,  1,  0,  4, 15,  3,  8 ; t22,  t25
3857*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6,  5,  0,  4, 15,  3,  8 ; t23a, t24a
3858*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m13
3859*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m2
3860*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m7
3861*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m6
3862*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m5
3863*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m1
3864*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m10
3865*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m9
3866*c0909341SAndroid Build Coastguard Worker    ret
; .transpose_2x8x8_round - scale sixteen rows by a rounding multiplier and
; transpose them as two independent groups of eight registers (m8..m15, then
; m0..m7). The word/dword/qword unpacks operate within each 128-bit lane.
;
; In:  m0..m5, m8..m15 = rows in registers
;      stack slot 0 / slot 1 = rows 6 / 7 (the caller parks m6 / m7 there;
;      the extra gprsize skips the return address pushed by call)
;      m7 = pmulhrsw multiplier applied to every row
; Out: m0..m14 = transposed, scaled rows
;      stack slot 0 = transposed row 15 (m15 is used as scratch by the second
;      group, so the caller must reload it from the stack)
; Scaling commutes with the transpose, so it is applied wherever it fits:
; m0..m5 before the shuffles, m8..m15 after them, rows 6/7 while being
; reloaded.
3867*c0909341SAndroid Build Coastguard WorkerALIGN function_align
3868*c0909341SAndroid Build Coastguard Worker.transpose_2x8x8_round:
3869*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m12, m13
3870*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13
3871*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m8, m9
3872*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9
3873*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m14, m15
3874*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15
3875*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m10, m11
3876*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11
3877*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5
3878*c0909341SAndroid Build Coastguard Worker    punpckhdq           m11, m8, m10
3879*c0909341SAndroid Build Coastguard Worker    punpckldq            m8, m10
3880*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m12, m14
3881*c0909341SAndroid Build Coastguard Worker    punpckhdq           m12, m14
3882*c0909341SAndroid Build Coastguard Worker    punpckhdq           m14, m13, m15
3883*c0909341SAndroid Build Coastguard Worker    punpckldq           m13, m15
3884*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m6, m9
3885*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m9
3886*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m9, m8, m10
3887*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m8, m10
3888*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m10, m11, m12
3889*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m11, m12
3890*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m12, m13, m15
3891*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m13, m15
3892*c0909341SAndroid Build Coastguard Worker    punpckhqdq          m15, m14, m6
3893*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m14, m6
    ; The first group is done. Fetch and scale rows 6 and 7 (m7 gives up the
    ; multiplier last), then park the finished m15 in slot 0 so m15 can be
    ; scratch for the second group.
3894*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, [rsp+gprsize+32*0]
3895*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15
3896*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, [rsp+gprsize+32*1]
3897*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m15
3898*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m4, m5
3899*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5
3900*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0, m1
3901*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1
3902*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6, m7
3903*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7
3904*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m2, m3
3905*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
3906*c0909341SAndroid Build Coastguard Worker    punpckhdq            m3, m0, m2
3907*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2
3908*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4, m6
3909*c0909341SAndroid Build Coastguard Worker    punpckhdq            m4, m6
3910*c0909341SAndroid Build Coastguard Worker    punpckhdq            m6, m5, m7
3911*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m7
3912*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m15, m1
3913*c0909341SAndroid Build Coastguard Worker    punpckhdq           m15, m1
3914*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m0, m2
3915*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m0, m2
3916*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m2, m3, m4
3917*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m4
3918*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m4, m5, m7
3919*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m5, m7
3920*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m7, m6, m15
3921*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m15
3922*c0909341SAndroid Build Coastguard Worker    ret
3923*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .pass2_end: local subroutine (entered via `call`, leaves via `ret`) that
; invokes IDCT32_PASS2_END sixteen times, in four groups of four. Between
; groups dstq is advanced by one stride and r2 is moved back by one stride.
; Inputs visible here: m0-m15 (m1's live value is reloaded from stack slot 1),
; tmp1q/tmp2q (buffer pointers), dstq, strideq, r2, r3.
; NOTE(review): IDCT32_PASS2_END is defined outside this chunk. Presumably it
; combines register %1 with the buffered value at the given tmp slot, scales
; by the constant in m15 and adds the results to the destination rows at the
; two given offsets, using %3/%4 as scratch - confirm against the macro.
; NOTE(review): the +gprsize in the stack offsets presumably skips the return
; address pushed by the `call` that reached this label - confirm.
3924*c0909341SAndroid Build Coastguard Worker.pass2_end:
    ; Park m7 and m15 on the stack: m7 becomes a scratch register in the first
    ; group and m15 is about to be replaced by the pw_2048 constant.
3925*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m7
3926*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*2], m15
3927*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2048)]
    ; Group 1: inputs m0, m4, m8, m12. The scratch pair changes from (1,7) to
    ; (0,7) to (0,4) as m0 and then m4 have been consumed.
3928*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4
3929*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8
3930*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4
3931*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END     12, tmp1q-32*1, 0, 4, 15, r3*4,      strideq*0
3932*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3933*c0909341SAndroid Build Coastguard Worker    sub                  r2, strideq
    ; Group 2: inputs m1, m5, m9, m13. m1 is fetched from stack slot 1, which
    ; was filled by code outside this routine.
3934*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+32*1]
3935*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4
3936*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8
3937*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4
3938*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END     13, tmp1q-32*2, 0, 4, 15, r3*4,      strideq*0
3939*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3940*c0909341SAndroid Build Coastguard Worker    sub                  r2, strideq
    ; Group 3: inputs m2, m6, m10, m14.
3941*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4
3942*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8
3943*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END     10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4
3944*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END     14, tmp1q-32*3, 0, 4, 15, r3*4,      strideq*0
3945*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
3946*c0909341SAndroid Build Coastguard Worker    sub                  r2, strideq
    ; Group 4: inputs m3, m7, m11 and the original m15. m7 is restored from
    ; slot 0; the value saved from m15 (slot 2) is loaded into m1, which is why
    ; the last invocation names register 1 instead of 15.
3947*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+gprsize+32*0]
3948*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+32*2]
3949*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4
3950*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8
3951*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END     11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4
3952*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS2_END      1, tmp1q-32*4, 0, 4, 15, r3*4,      strideq*0
3953*c0909341SAndroid Build Coastguard Worker    ret
3954*c0909341SAndroid Build Coastguard Worker
3955*c0909341SAndroid Build Coastguard Worker; Perform the final sumsub step and YMM lane shuffling
; Parameters:
;   %1, %2 - indices of the two row registers (m%1, m%2) to finish
;   %3, %4 - indices of two scratch registers (clobbered)
; Uses tmp1q/tmp2q as buffer pointers; the slots [tmp2q+32*(3-%1)] and
; [tmp1q+32*(11-%2)] are read first and then overwritten with the differences.
; On exit:
;   m%1 = { low lane of (m%1 + A), low lane of (m%2 + B) }
;   m%2 = { high lane of (m%1 + A), high lane of (m%2 + B) }
; where A and B are the values originally held in the tmp2q and tmp1q slots.
; All additions/subtractions are saturating word operations (paddsw/psubsw).
3956*c0909341SAndroid Build Coastguard Worker%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2]
    ; First pair: m%4 = m%1 - A, m%1 = m%1 + A.
3957*c0909341SAndroid Build Coastguard Worker    mova                m%3, [tmp2q+32*( 3-%1)]
3958*c0909341SAndroid Build Coastguard Worker    psubsw              m%4, m%1, m%3
3959*c0909341SAndroid Build Coastguard Worker    paddsw              m%1, m%3
    ; Fetch B before its slot is partially overwritten by the stores below.
3960*c0909341SAndroid Build Coastguard Worker    mova                m%3, [tmp1q+32*(11-%2)]
    ; Difference of the first pair: low lane goes to the upper 16 bytes of the
    ; tmp1q slot, high lane to the upper 16 bytes of the tmp2q slot.
3961*c0909341SAndroid Build Coastguard Worker    mova         [tmp1q+32*(11-%2)+16], xm%4
3962*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1
    ; Second pair: m%4 = m%2 + B, m%2 = m%2 - B.
3963*c0909341SAndroid Build Coastguard Worker    paddsw              m%4, m%2, m%3
3964*c0909341SAndroid Build Coastguard Worker    psubsw              m%2, m%3
    ; Difference of the second pair fills the lower 16 bytes of both slots.
3965*c0909341SAndroid Build Coastguard Worker    mova         [tmp1q+32*(11-%2)], xm%2
3966*c0909341SAndroid Build Coastguard Worker    vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
    ; Regroup the two sums by 128-bit lane: 0x31 selects the high lanes of both
    ; sources into m%2, then the low lane of m%4 is inserted above m%1's low lane.
3967*c0909341SAndroid Build Coastguard Worker    vperm2i128          m%2, m%1, m%4, 0x31
3968*c0909341SAndroid Build Coastguard Worker    vinserti128         m%1, xm%4, 1
3969*c0909341SAndroid Build Coastguard Worker%endmacro
3970*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_32x16_8bpc(dst, stride, c, eob)
; DCT/DCT inverse transform for a 32x16 block; the result is added to dst.
;   eob == 0 : only the first coefficient is used; it is scaled and handed to
;              the shared .dconly tail of the 32x8 function.
;   eob != 0 : pass 1 runs once over all coefficients (odd lines through
;              main_oddhalf, even lines through the 16-point main), then
;              .pass2 runs twice - once for the left 16 columns and once,
;              via .right_half, for the right 16 columns.
; r3 holds eob on entry and is reused afterwards: first as the zero-loop
; counter, then as "saved dst / right half still pending" marker.
; Local subroutine .pass1_end is also called from outside this function.
3971*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob
3972*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
3973*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
3974*c0909341SAndroid Build Coastguard Worker    jnz .normal
    ; DC-only path: xm0 = coefficient scaled by pw_2896x8 twice; xm2 carries
    ; pw_16384 for the .dconly code.
3975*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
3976*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
3977*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
    ; eobd is zero on this path, so this store clears the coefficient.
3978*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
3979*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
    ; r3d (== eobd == 0) becomes 16; presumably the row count expected by
    ; .dconly - confirm against that label.
3980*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
3981*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
3982*c0909341SAndroid Build Coastguard Worker.normal:
    ; Full path: 16 vector registers, 32*19 bytes of stack, two extra named
    ; GPRs (tmp1, tmp2).
3983*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2
    ; Load the 16 odd-numbered 32-byte coefficient lines, each pre-scaled by
    ; pw_2896x8; m15 holds the constant until the last multiply consumes it.
3984*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2896x8)]
3985*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m15, [cq+32* 1]
3986*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [cq+32* 3]
3987*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m15, [cq+32* 5]
3988*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m15, [cq+32* 7]
3989*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m15, [cq+32* 9]
3990*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m15, [cq+32*11]
3991*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m15, [cq+32*13]
3992*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m15, [cq+32*15]
3993*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m15, [cq+32*17]
3994*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m15, [cq+32*19]
3995*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m15, [cq+32*21]
3996*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m15, [cq+32*23]
3997*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m15, [cq+32*25]
3998*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m13, m15, [cq+32*27]
3999*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m15, [cq+32*29]
4000*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15,      [cq+32*31]
    ; tmp1q/tmp2q point into the stack scratch area, 32*8 bytes apart.
4001*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
4002*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
4003*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
    ; Even lines (step 32*2 from cq+32*0). NOTE(review): the trailing "1, 0"
    ; arguments presumably select scaling/zeroing behaviour of LOAD_16ROWS -
    ; confirm against the macro definition.
4004*c0909341SAndroid Build Coastguard Worker    LOAD_16ROWS     cq+32*0, 32*2, 1, 0
    ; Clear the coefficient buffer: 8 iterations x 128 bytes = 1024 bytes.
4005*c0909341SAndroid Build Coastguard Worker    pxor                m15, m15
4006*c0909341SAndroid Build Coastguard Worker    mov                 r3d, 8
4007*c0909341SAndroid Build Coastguard Worker.zero_loop:
4008*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*0], m15
4009*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*1], m15
4010*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*2], m15
4011*c0909341SAndroid Build Coastguard Worker    mova          [cq+32*3], m15
4012*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*4
4013*c0909341SAndroid Build Coastguard Worker    dec                 r3d
4014*c0909341SAndroid Build Coastguard Worker    jg .zero_loop
4015*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
4016*c0909341SAndroid Build Coastguard Worker    call .pass1_end
    ; r2 = 3*stride for the row writes; r3 = original dst. A non-zero r3 also
    ; signals that the right half has not been processed yet.
4017*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
4018*c0909341SAndroid Build Coastguard Worker    mov                  r3, dstq
4019*c0909341SAndroid Build Coastguard Worker.pass2:
    ; Entered twice (left half, then right half). m7 = pw_16384 is the
    ; multiplier handed to transpose_2x8x8_round.
4020*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384)]
4021*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4022*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; Spill m15 so the register can hold the pw_2048 scale factor.
4023*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*2], m15
4024*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2048)]
    ; Rows 2/3 are written first with m1/m2 as scratch; the value for row 1 is
    ; only fetched from [rsp+32*1] afterwards, so m1 is free until then.
4025*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m2, m3, m0
4026*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
4027*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [rsp+32*1]
4028*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
4029*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4030*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m4, m5, m6, m7
4031*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
4032*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
4033*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4034*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11
4035*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
4036*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
4037*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; m11 was already scaled and written above and is not read again before
    ; RET or the reload in .right_half, so only m12-m14 are scaled here.
4038*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m12, m13, m14
    ; Scale the spilled m15 value in place of the constant.
4039*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, [rsp+32*2]
4040*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
4041*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
    ; r3 != 0 only after the first (left-half) run of .pass2.
4042*c0909341SAndroid Build Coastguard Worker    test                 r3, r3
4043*c0909341SAndroid Build Coastguard Worker    jnz .right_half
4044*c0909341SAndroid Build Coastguard Worker    RET
4045*c0909341SAndroid Build Coastguard Worker.right_half:
    ; Reload the 16 lines buffered at tmp1q/tmp2q by .pass1_end, move dst 16
    ; bytes to the right of its original position and clear r3 so the second
    ; run of .pass2 ends in RET.
4046*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp1q-32*4, 32
4047*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H tmp2q-32*4, 32
4048*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r3+16]
4049*c0909341SAndroid Build Coastguard Worker    xor                 r3d, r3d
    ; m6/m7 go to stack slots 0/1, mirroring what .pass1_end leaves there.
4050*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
4051*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
4052*c0909341SAndroid Build Coastguard Worker    jmp .pass2
4053*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .pass1_end: applies IDCT32_PASS1_END to the register pairs (0,8), (2,10),
; (3,11), (4,12), (5,13), (6,14), (7,15) and finally (1,9).
; m1 and m9 serve as scratch for the first seven pairs, so m9 is parked in
; stack slot 0 and m1's value is taken from slot 1 afterwards. For the last
; pair m6/m7 become scratch; their values are stored to slots 0/1 first and
; are therefore returned on the stack rather than in registers.
; NOTE(review): +gprsize presumably skips the return address pushed by the
; `call` into this routine - confirm.
4054*c0909341SAndroid Build Coastguard Worker.pass1_end:
4055*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m9
4056*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      0,  8,  1,  9
4057*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      2, 10,  1,  9
4058*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      3, 11,  1,  9
4059*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      4, 12,  1,  9
4060*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      5, 13,  1,  9
4061*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      6, 14,  1,  9
4062*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      7, 15,  1,  9
4063*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+gprsize+32*1]
4064*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+gprsize+32*0]
4065*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*0], m6
4066*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+32*1], m7
4067*c0909341SAndroid Build Coastguard Worker    IDCT32_PASS1_END      1,  9,  6,  7
4068*c0909341SAndroid Build Coastguard Worker    ret
4069*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_16x32_8bpc(dst, stride, c, eob)
; Identity/identity inverse transform for a 16x32 block, processed in slices.
; Each .loop pass reads 16 bytes from each of 16 coefficient lines that are 64
; bytes apart, scales and transposes them, and issues four WRITE_16X2
; invocations covering eight destination row offsets.
; Number of passes: 1 + (eob > 43) + (eob > 150) + (eob > 278).
; Afterwards the coefficient buffer is zeroed: the first 32 bytes of each
; 64-byte line when at most two passes ran, all 1024 bytes otherwise.
4070*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob
4071*c0909341SAndroid Build Coastguard Worker%undef cmp
4072*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4073*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2896x8)]
4074*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_1697x16)]
4075*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_8192)]
    ; Build the extra-iteration count in r4b. The final add sets the carry
    ; flag exactly when eob >= 279, and adc folds that carry together with al
    ; into r4b. NOTE(review): al is presumably the low byte of r6 in this
    ; register mapping, which is why it is consumed before r6 is reloaded
    ; below - confirm.
4076*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 43   ; if (eob > 43)
4077*c0909341SAndroid Build Coastguard Worker    setg                r4b       ;   iteration_count++
4078*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 150  ; if (eob > 150)
4079*c0909341SAndroid Build Coastguard Worker    setg                 al       ;   iteration_count++
4080*c0909341SAndroid Build Coastguard Worker    add                eobd, -279 ; if (eob > 278)
4081*c0909341SAndroid Build Coastguard Worker    adc                 r4b, al   ;   iteration_count++
4082*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
    ; r6 now remembers the start of the coefficient buffer for the zeroing
    ; code; the o_base pointer is no longer needed past this point.
4083*c0909341SAndroid Build Coastguard Worker    mov                  r6, cq
4084*c0909341SAndroid Build Coastguard Worker    paddw               m11, m12, m12 ; pw_16384
4085*c0909341SAndroid Build Coastguard Worker.loop:
    ; Lines 0-7 fill the low 128-bit lanes of m0-m7, lines 8-15 the high lanes.
4086*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+64* 0]
4087*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+64* 1]
4088*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+64* 8], 1
4089*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+64* 9], 1
4090*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+64* 2]
4091*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+64* 3]
4092*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+64*10], 1
4093*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+64*11], 1
4094*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+64* 4]
4095*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+64* 5]
4096*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+64*12], 1
4097*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+64*13], 1
4098*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+64* 6]
4099*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+64* 7]
4100*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+64*14], 1
4101*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+64*15], 1
    ; Scale by pw_2896x8, then apply IDTX16 (defined elsewhere; presumably the
    ; 16-point identity scaling with m8 as scratch, m10 = pw_1697x16 and
    ; m11 = pw_16384 - confirm against the macro).
4102*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
4103*c0909341SAndroid Build Coastguard Worker    REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
4104*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    ; Final scaling by pw_8192 before the results are written out.
4105*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
4106*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4107*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
4108*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4109*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4110*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
4111*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; Next slice: 16 bytes further into every line. Loop while r4b >= 0.
4112*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4113*c0909341SAndroid Build Coastguard Worker    dec                 r4b
4114*c0909341SAndroid Build Coastguard Worker    jge .loop
    ; cq advanced 16 bytes per pass, so cq-32 > r6 means more than two passes
    ; ran and the second half of each line was touched as well.
4115*c0909341SAndroid Build Coastguard Worker    sub                  cq, 32
4116*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
    ; r0 (dst) is no longer needed and is reused as the zeroing loop counter.
4117*c0909341SAndroid Build Coastguard Worker    mov                 r0d, 8
4118*c0909341SAndroid Build Coastguard Worker    cmp                  cq, r6
4119*c0909341SAndroid Build Coastguard Worker    ja .zero_loop
4120*c0909341SAndroid Build Coastguard Worker.zero_loop_half:
    ; Four lines per iteration, 32 bytes each: 4 iterations cover the first
    ; half of all 16 lines.
4121*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*0], m0
4122*c0909341SAndroid Build Coastguard Worker    mova          [r6+64*1], m0
4123*c0909341SAndroid Build Coastguard Worker    add                  r6, 64*4
4124*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*2], m0
4125*c0909341SAndroid Build Coastguard Worker    mova          [r6-64*1], m0
4126*c0909341SAndroid Build Coastguard Worker    sub                 r0d, 2
4127*c0909341SAndroid Build Coastguard Worker    jg .zero_loop_half
4128*c0909341SAndroid Build Coastguard Worker    RET
4129*c0909341SAndroid Build Coastguard Worker.zero_loop:
    ; 128 contiguous bytes per iteration, 8 iterations = 1024 bytes.
4130*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*0], m0
4131*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*1], m0
4132*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*2], m0
4133*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*3], m0
4134*c0909341SAndroid Build Coastguard Worker    add                  r6, 32*4
4135*c0909341SAndroid Build Coastguard Worker    dec                 r0d
4136*c0909341SAndroid Build Coastguard Worker    jg .zero_loop
4137*c0909341SAndroid Build Coastguard Worker    RET
4138*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_identity_identity_32x16_8bpc(dst, stride, c, eob)
; Identity/identity inverse transform for a 32x16 block, processed in slices.
; Each .loop pass reads 16 bytes from each of 16 coefficient lines that are 32
; bytes apart, scales and transposes them, and issues four WRITE_16X2
; invocations covering eight destination row offsets.
; r4b = (eob > 35) + 2*(eob > 150) extra passes. When the counter turns odd
; after a pass, processing switches to the right half: cq jumps ahead by
; 32*15 bytes and dst restarts at the saved dst + 16.
; Afterwards the coefficient buffer is zeroed in 128-byte steps, sized from
; the distance cq has travelled.
4139*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob
4140*c0909341SAndroid Build Coastguard Worker%undef cmp
4141*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4142*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_2896x8)]
4143*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_1697x16)]
4144*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_2048)]
    ; setg only writes the low byte of each register; the lea result is used
    ; solely through r4b afterwards, so the remaining bits do not matter.
4145*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 35  ; if (eob > 35)
4146*c0909341SAndroid Build Coastguard Worker    setg                r4b      ;   iteration_count++
4147*c0909341SAndroid Build Coastguard Worker    cmp                eobd, 150 ; if (eob > 150)
4148*c0909341SAndroid Build Coastguard Worker    setg                r3b      ;   iteration_count += 2
4149*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [r4+r3*2]
4150*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
    ; r5 keeps the original dst for the right half; r6 keeps the start of the
    ; coefficient buffer for the zeroing loop.
4151*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4152*c0909341SAndroid Build Coastguard Worker    mov                  r6, cq
4153*c0909341SAndroid Build Coastguard Worker.loop:
    ; Lines 0-7 fill the low 128-bit lanes of m0-m7, lines 8-15 the high lanes.
4154*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+32* 0]
4155*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+32* 1]
4156*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+32* 8], 1
4157*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+32* 9], 1
4158*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+32* 2]
4159*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+32* 3]
4160*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+32*10], 1
4161*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+32*11], 1
4162*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+32* 4]
4163*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+32* 5]
4164*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+32*12], 1
4165*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+32*13], 1
4166*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+32* 6]
4167*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+32* 7]
4168*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+32*14], 1
4169*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+32*15], 1
    ; Scale by pw_2896x8, then double with saturation.
4170*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
4171*c0909341SAndroid Build Coastguard Worker    REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
4172*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    ; IDTX16 is defined elsewhere; presumably the 16-point identity scaling
    ; with m8 as scratch and m10 = pw_1697x16 - confirm against the macro.
4173*c0909341SAndroid Build Coastguard Worker    REPX  {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7
    ; Final scaling by pw_2048 before the results are written out.
4174*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
4175*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4176*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r3
4177*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4178*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4179*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r3
4180*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; Next slice is 16 bytes further into every line. Stop once the counter
    ; goes negative; continue in the same half while it is even.
4181*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4182*c0909341SAndroid Build Coastguard Worker    dec                 r4b
4183*c0909341SAndroid Build Coastguard Worker    jl .ret
4184*c0909341SAndroid Build Coastguard Worker    test                r4b, 1
4185*c0909341SAndroid Build Coastguard Worker    jz .loop
    ; Odd counter: switch to the right half of the block.
4186*c0909341SAndroid Build Coastguard Worker    add                  cq, 32*15
4187*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4188*c0909341SAndroid Build Coastguard Worker    jmp .loop
4189*c0909341SAndroid Build Coastguard Worker.ret:
    ; cd becomes the number of bytes cq has advanced. NOTE(review): eax is
    ; presumably the low 32 bits of r6 (the saved buffer start) - confirm.
    ; With the +384 bias the loop below runs 4 times (512 bytes) when only the
    ; left half was processed and 8 times (1024 bytes) otherwise.
4190*c0909341SAndroid Build Coastguard Worker    sub                  cd, eax
4191*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4192*c0909341SAndroid Build Coastguard Worker    add                  cd, 384
4193*c0909341SAndroid Build Coastguard Worker.zero_loop:
4194*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*0], m0
4195*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*1], m0
4196*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*2], m0
4197*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*3], m0
4198*c0909341SAndroid Build Coastguard Worker    add                  r6, 32*4
4199*c0909341SAndroid Build Coastguard Worker    sub                  cd, 128
4200*c0909341SAndroid Build Coastguard Worker    jge .zero_loop
4201*c0909341SAndroid Build Coastguard Worker    RET
4202*c0909341SAndroid Build Coastguard Worker
4203*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob
4204*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4205*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4206*c0909341SAndroid Build Coastguard Worker    jnz .normal
4207*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
4208*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
4209*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
4210*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
4211*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
4212*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
4213*c0909341SAndroid Build Coastguard Worker.normal:
4214*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \
4215*c0909341SAndroid Build Coastguard Worker                                           base, tmp3, tmp4
4216*c0909341SAndroid Build Coastguard Worker    %undef cmp
4217*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
4218*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
4219*c0909341SAndroid Build Coastguard Worker    sub                eobd, 136
4220*c0909341SAndroid Build Coastguard Worker    mov               tmp4d, eobd
4221*c0909341SAndroid Build Coastguard Worker.pass1_loop:
4222*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*1, 64*2
4223*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4224*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
4225*c0909341SAndroid Build Coastguard Worker    test              tmp4d, tmp4d
4226*c0909341SAndroid Build Coastguard Worker    jl .fast
4227*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H   cq+64*17, 64*2
4228*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
4229*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H   cq+64*16, 64*2
4230*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4231*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
4232*c0909341SAndroid Build Coastguard Worker                               24, 25, 26, 27, 28, 29, 30, 31
4233*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
4234*c0909341SAndroid Build Coastguard Worker    jmp .idct16
4235*c0909341SAndroid Build Coastguard Worker.fast:
4236*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4237*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4238*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
4239*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
4240*c0909341SAndroid Build Coastguard Worker.idct16:
4241*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*0, 64*2
4242*c0909341SAndroid Build Coastguard Worker    pxor                m15, m15
4243*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
4244*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
4245*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
4246*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_8192)]
4247*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4248*c0909341SAndroid Build Coastguard Worker    lea               tmp3q, [tmp1q+32*32]
4249*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp]
4250*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*4], m0
4251*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*3], m2
4252*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*2], m4
4253*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*1], m6
4254*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*0], m8
4255*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*1], m10
4256*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*2], m12
4257*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*3], m14
4258*c0909341SAndroid Build Coastguard Worker    add               tmp3q, 32*8
4259*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*4], m1
4260*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*3], m3
4261*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*2], m5
4262*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q-32*1], m7
4263*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*0], m9
4264*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*1], m11
4265*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*2], m13
4266*c0909341SAndroid Build Coastguard Worker    mova       [tmp3q+32*3], m15
4267*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_8192)]
4268*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m9, [tmp1q-32*4]
4269*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9, [tmp1q-32*3]
4270*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m9, [tmp1q-32*2]
4271*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9, [tmp1q-32*1]
4272*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, [tmp1q+32*0]
4273*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, [tmp1q+32*1]
4274*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9, [tmp1q+32*2]
4275*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, [tmp1q+32*3]
4276*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4277*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
4278*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m9, [tmp2q-32*4]
4279*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m1
4280*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9, [tmp2q-32*3]
4281*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m2
4282*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m9, [tmp2q-32*2]
4283*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m3
4284*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9, [tmp2q-32*1]
4285*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m4
4286*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, [tmp2q+32*0]
4287*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m5
4288*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, [tmp2q+32*1]
4289*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m6
4290*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9, [tmp2q+32*2]
4291*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m7
4292*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, [tmp2q+32*3]
4293*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4294*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m0
4295*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m1
4296*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m2
4297*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m3
4298*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m4
4299*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m5
4300*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m6
4301*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m7
4302*c0909341SAndroid Build Coastguard Worker    add                  cq, 32
4303*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
4304*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*16
4305*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
4306*c0909341SAndroid Build Coastguard Worker    jnc .pass1_loop
4307*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*24
4308*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq, 19
4309*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
4310*c0909341SAndroid Build Coastguard Worker    add                  r2, dstq
4311*c0909341SAndroid Build Coastguard Worker    test              tmp4d, tmp4d
4312*c0909341SAndroid Build Coastguard Worker    jge .pass2_loop
4313*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
4314*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*16
4315*c0909341SAndroid Build Coastguard Worker    add               tmp3q, 32*16
4316*c0909341SAndroid Build Coastguard Worker.pass2_loop:
4317*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp2q-32*4, 32
4318*c0909341SAndroid Build Coastguard Worker    test              tmp4d, tmp4d
4319*c0909341SAndroid Build Coastguard Worker    jl .fast2
4320*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H tmp3q-32*4, 32
4321*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
4322*c0909341SAndroid Build Coastguard Worker    sub               tmp3q, 32*8
4323*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H tmp3q-32*4, 32
4324*c0909341SAndroid Build Coastguard Worker    sub               tmp3q, 32*16
4325*c0909341SAndroid Build Coastguard Worker    jmp .pass2_loop_end
4326*c0909341SAndroid Build Coastguard Worker.fast2:
4327*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
4328*c0909341SAndroid Build Coastguard Worker    sub               tmp3q, 32*24
4329*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4330*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
4331*c0909341SAndroid Build Coastguard Worker.pass2_loop_end:
4332*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp3q-32*4, 32
4333*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
4334*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
4335*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
4336*c0909341SAndroid Build Coastguard Worker    lea               tmp3q, [tmp1q-32*32]
4337*c0909341SAndroid Build Coastguard Worker    cmp               tmp2q, tmp3q
4338*c0909341SAndroid Build Coastguard Worker    jb .ret
4339*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*32
4340*c0909341SAndroid Build Coastguard Worker    sub                dstq, r3
4341*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r2+r3+16]
4342*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
4343*c0909341SAndroid Build Coastguard Worker    jmp .pass2_loop
4344*c0909341SAndroid Build Coastguard Worker.ret:
4345*c0909341SAndroid Build Coastguard Worker    RET
4346*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_identity_identity_32x32_8bpc(dst, stride, c, eob)
;
; Walks the coefficient buffer `c` (64-byte rows of 16-bit words) in
; tiles of 16 rows x 8 words. Each tile is transposed, scaled with
; pmulhrsw against pw_8192 and handed to WRITE_16X2 (defined elsewhere)
; together with dst row offsets. Afterwards the consumed coefficients
; are overwritten with zeros.
;
; Register roles inside the function:
;   m9   = broadcast pw_8192 (presumably words of 8192, which makes
;          pmulhrsw a rounded >>2 -- confirm against the constant table)
;   eobd = negative tile counter: -2 when eob < 136, otherwise -8
;          (assumes eob is a small non-negative count)
;   r4   = stride*3
;   r5   = original dst (used to step 16 bytes to the right later)
;   r6   = c+32: base pointer for the zeroing loops, and marker used to
;          detect the 2-tile (top-left only) case
;-----------------------------------------------------------------------
4347*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob
    ; NOTE(review): presumably removes an x86inc wrapper around `cmp` so
    ; the plain instruction can be used below -- confirm in x86inc.asm.
4348*c0909341SAndroid Build Coastguard Worker    %undef cmp
4349*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [pw_8192]
    ; eob-136 is negative when eob < 136, so the top two bits are set and
    ; the shift yields 3; otherwise it yields 0. eobq*2-8 then gives -2
    ; or -8, i.e. minus the number of .loop iterations to run.
4350*c0909341SAndroid Build Coastguard Worker    sub                eobd, 136 ; if (eob < 136)
4351*c0909341SAndroid Build Coastguard Worker    shr                eobd, 30  ;     topleft 16x16 only
4352*c0909341SAndroid Build Coastguard Worker    lea                eobd, [eobq*2-8]
4353*c0909341SAndroid Build Coastguard Worker    lea                  r4, [strideq*3]
4354*c0909341SAndroid Build Coastguard Worker    mov                  r5, dstq
4355*c0909341SAndroid Build Coastguard Worker    lea                  r6, [cq+32]
4356*c0909341SAndroid Build Coastguard Worker.loop:
    ; Gather one tile: 16 bytes from each of coefficient rows 0-7 go to
    ; the low lanes of m0-m7, and 16 bytes from rows 8-15 to the high
    ; lanes.
4357*c0909341SAndroid Build Coastguard Worker    mova                xm0, [cq+64* 0]
4358*c0909341SAndroid Build Coastguard Worker    mova                xm1, [cq+64* 1]
4359*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [cq+64* 8], 1
4360*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [cq+64* 9], 1
4361*c0909341SAndroid Build Coastguard Worker    mova                xm2, [cq+64* 2]
4362*c0909341SAndroid Build Coastguard Worker    mova                xm3, [cq+64* 3]
4363*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [cq+64*10], 1
4364*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [cq+64*11], 1
4365*c0909341SAndroid Build Coastguard Worker    mova                xm4, [cq+64* 4]
4366*c0909341SAndroid Build Coastguard Worker    mova                xm5, [cq+64* 5]
4367*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [cq+64*12], 1
4368*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [cq+64*13], 1
4369*c0909341SAndroid Build Coastguard Worker    mova                xm6, [cq+64* 6]
4370*c0909341SAndroid Build Coastguard Worker    mova                xm7, [cq+64* 7]
4371*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [cq+64*14], 1
4372*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [cq+64*15], 1
    ; Shared helper from the 8x32 identity function (defined elsewhere);
    ; presumably transposes the 8x8 word tile held in each 128-bit lane.
4373*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
4374*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
    ; Emit the eight result registers as four register pairs. dst moves
    ; down by 4 rows after every two WRITE_16X2 invocations, so one
    ; iteration covers 8 dst rows.
4375*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
4376*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  0,  1, strideq*2, r4
4377*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
4378*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  0,  1, strideq*0, strideq*1
4379*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  0,  1, strideq*2, r4
4380*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4]
    ; Next tile: 8 coefficient columns (16 bytes) to the right.
4381*c0909341SAndroid Build Coastguard Worker    add                  cq, 16
4382*c0909341SAndroid Build Coastguard Worker    inc                eobd
4383*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; Every 4th iteration (only reached in the 8-tile case) one full
    ; 64-byte coefficient row has been traversed: skip 15 more rows so cq
    ; lands 16 rows further down, and restart dst at the top, 16 bytes to
    ; the right of the original position.
4384*c0909341SAndroid Build Coastguard Worker    test               eobd, 3
4385*c0909341SAndroid Build Coastguard Worker    jnz .loop
4386*c0909341SAndroid Build Coastguard Worker    add                  cq, 64*15
4387*c0909341SAndroid Build Coastguard Worker    lea                dstq, [r5+16]
4388*c0909341SAndroid Build Coastguard Worker    jmp .loop
4389*c0909341SAndroid Build Coastguard Worker.ret:
    ; Clear the coefficients that were read. r0d (dst's register, no
    ; longer needed) becomes the loop counter. cq == r6 (= c+32) only
    ; when exactly two tiles were processed, i.e. the top-left case.
4390*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0
4391*c0909341SAndroid Build Coastguard Worker    mov                 r0d, 16
4392*c0909341SAndroid Build Coastguard Worker    cmp                  cq, r6
4393*c0909341SAndroid Build Coastguard Worker    jne .zero_loop
4394*c0909341SAndroid Build Coastguard Worker.zero_loop_topleft:
    ; Zero only the first 32 bytes of each 64-byte row, four rows per
    ; iteration (offsets c+0, c+64, c+128, c+192 relative to the current
    ; base); 16/4 = 4 iterations cover 16 rows.
4395*c0909341SAndroid Build Coastguard Worker    mova          [r6-32*1], m0
4396*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*1], m0
4397*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*3], m0
4398*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*5], m0
4399*c0909341SAndroid Build Coastguard Worker    add                  r6, 64*4
4400*c0909341SAndroid Build Coastguard Worker    sub                 r0d, 4
4401*c0909341SAndroid Build Coastguard Worker    jg .zero_loop_topleft
4402*c0909341SAndroid Build Coastguard Worker    RET
4403*c0909341SAndroid Build Coastguard Worker.zero_loop:
    ; Zero 128 contiguous bytes per iteration starting at c; 16
    ; iterations clear 2048 bytes in total.
4404*c0909341SAndroid Build Coastguard Worker    mova          [r6-32*1], m0
4405*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*0], m0
4406*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*1], m0
4407*c0909341SAndroid Build Coastguard Worker    mova          [r6+32*2], m0
4408*c0909341SAndroid Build Coastguard Worker    add                  r6, 32*4
4409*c0909341SAndroid Build Coastguard Worker    dec                 r0d
4410*c0909341SAndroid Build Coastguard Worker    jg .zero_loop
4411*c0909341SAndroid Build Coastguard Worker    RET
4412*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; IDCT64_PART2_END out, src1, src2, tmp1, tmp2, tmp3 [, off1..off4]
;
; Final butterfly stage for one output index n (= %1). It combines two
; previously stored intermediate rows (read via tmp1q/tmp2q) with the
; two source registers m%2/m%3 to form out 0+n, out31-n, out32+n and
; out63-n.
;
;   6 arguments  -> "pass 1": the four results are stored back into the
;                   tmp1q/tmp2q scratch area.
;   10 arguments -> "pass 2": the four results are scaled with pmulhrsw
;                   by m14, added to 8-bit pixels loaded from dstq / r2
;                   at offsets %7..%10, saturated to bytes and stored
;                   back to the same addresses.
;
; The parity of n selects which of tmp1q/tmp2q (and, in pass 2, which
; of dstq/r2) each row belongs to. Clobbers m%2..m%6.
;-----------------------------------------------------------------------
4413*c0909341SAndroid Build Coastguard Worker%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
4414*c0909341SAndroid Build Coastguard Worker%if %1 & 1
4415*c0909341SAndroid Build Coastguard Worker    mova                m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n
4416*c0909341SAndroid Build Coastguard Worker    mova                m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n
4417*c0909341SAndroid Build Coastguard Worker%else
4418*c0909341SAndroid Build Coastguard Worker    mova                m%5, [tmp1q-32*(45-%1)]
4419*c0909341SAndroid Build Coastguard Worker    mova                m%4, [tmp2q-32*(20+%1)]
4420*c0909341SAndroid Build Coastguard Worker%endif
    ; Two saturating butterflies: first between the two loaded rows, then
    ; between those results and the source registers.
4421*c0909341SAndroid Build Coastguard Worker    psubsw              m%6, m%5, m%4 ; idct32 out31-n
4422*c0909341SAndroid Build Coastguard Worker    paddsw              m%5, m%4      ; idct32 out 0+n
4423*c0909341SAndroid Build Coastguard Worker    psubsw              m%4, m%6, m%3 ; out32+n
4424*c0909341SAndroid Build Coastguard Worker    paddsw              m%6, m%3      ; out31-n
4425*c0909341SAndroid Build Coastguard Worker    psubsw              m%3, m%5, m%2 ; out63-n
4426*c0909341SAndroid Build Coastguard Worker    paddsw              m%5, m%2      ; out 0+n
4427*c0909341SAndroid Build Coastguard Worker%if %0 == 6 ; pass 1
    ; out31-n and out 0+n overwrite the slots the two inputs were loaded
    ; from; out32+n and out63-n go to further slots in the same area.
4428*c0909341SAndroid Build Coastguard Worker%if %1 & 1
4429*c0909341SAndroid Build Coastguard Worker    mova [tmp2q-32*(19-%1)], m%4
4430*c0909341SAndroid Build Coastguard Worker    mova [tmp1q-32*(14+%1)], m%6
4431*c0909341SAndroid Build Coastguard Worker    mova [tmp1q+32*(18-%1)], m%3
4432*c0909341SAndroid Build Coastguard Worker    mova [tmp2q-32*(51-%1)], m%5
4433*c0909341SAndroid Build Coastguard Worker%else
4434*c0909341SAndroid Build Coastguard Worker    mova [tmp1q-32*(13-%1)], m%4
4435*c0909341SAndroid Build Coastguard Worker    mova [tmp2q-32*(20+%1)], m%6
4436*c0909341SAndroid Build Coastguard Worker    mova [tmp2q+32*(12-%1)], m%3
4437*c0909341SAndroid Build Coastguard Worker    mova [tmp1q-32*(45-%1)], m%5
4438*c0909341SAndroid Build Coastguard Worker%endif
4439*c0909341SAndroid Build Coastguard Worker%else ; pass 2
    ; m14 holds the rounding multiplier; it is set up by the caller, not
    ; by this macro.
4440*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m14}, m%4, m%6, m%3, m%5
    ; Odd and even n swap the roles of the two destination base pointers.
4441*c0909341SAndroid Build Coastguard Worker%if %1 & 1
4442*c0909341SAndroid Build Coastguard Worker    %define %%d0 r2
4443*c0909341SAndroid Build Coastguard Worker    %define %%d1 dstq
4444*c0909341SAndroid Build Coastguard Worker%else
4445*c0909341SAndroid Build Coastguard Worker    %define %%d0 dstq
4446*c0909341SAndroid Build Coastguard Worker    %define %%d1 r2
4447*c0909341SAndroid Build Coastguard Worker%endif
    ; Load 16 pixels per row zero-extended to words and add the residual.
    ; m%2 is free again here, so it and the tmp registers are reused as
    ; pixel accumulators.
4448*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%2, [%%d0+%9 ]
4449*c0909341SAndroid Build Coastguard Worker    paddw               m%2, m%4
4450*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%4, [%%d1+%8 ]
4451*c0909341SAndroid Build Coastguard Worker    paddw               m%4, m%6
4452*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%6, [%%d1+%10]
4453*c0909341SAndroid Build Coastguard Worker    paddw               m%3, m%6
4454*c0909341SAndroid Build Coastguard Worker    pmovzxbw            m%6, [%%d0+%7 ]
4455*c0909341SAndroid Build Coastguard Worker    paddw               m%5, m%6
    ; packuswb saturates to unsigned bytes but interleaves its two
    ; sources per 128-bit lane. The q3120 permute regroups the qwords so
    ; each 128-bit half holds one complete 16-pixel row, which is then
    ; stored to the address it was loaded from.
4456*c0909341SAndroid Build Coastguard Worker    packuswb            m%2, m%4
4457*c0909341SAndroid Build Coastguard Worker    packuswb            m%3, m%5
4458*c0909341SAndroid Build Coastguard Worker    vpermq              m%2, m%2, q3120
4459*c0909341SAndroid Build Coastguard Worker    vpermq              m%3, m%3, q3120
4460*c0909341SAndroid Build Coastguard Worker    mova         [%%d0+%9 ], xm%2
4461*c0909341SAndroid Build Coastguard Worker    vextracti128 [%%d1+%8 ], m%2, 1
4462*c0909341SAndroid Build Coastguard Worker    mova         [%%d1+%10], xm%3
4463*c0909341SAndroid Build Coastguard Worker    vextracti128 [%%d0+%7 ], m%3, 1
4464*c0909341SAndroid Build Coastguard Worker%endif
4465*c0909341SAndroid Build Coastguard Worker%endmacro
4466*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_16x64_8bpc(dst, stride, c, eob)
;
; Entry point of the 16x64 DCT/DCT inverse transform + add (8 bpc).
;
;   eob == 0 : dc-only shortcut, tail-jumps into the 16x4 .dconly code.
;   eob != 0 : pass 1 runs the shared 16x16 idct helper once (eob < 151)
;              or twice over 16 coefficient rows each, transposes and
;              parks the results in a 32*67-byte stack frame. Pass 2
;              then feeds selected rows of that scratch data to the
;              idct16 / idct32-odd-half / idct64 helpers (.main_part1,
;              .main_part2_pass2).
;
; Register roles on the .normal path:
;   r6          = constant base used by the o() addressing macro
;   r7d         = eob-151; its sign selects the "fast" variants that
;                 keep the second half of the inputs at zero
;   tmp1q/tmp2q = moving scratch pointers into the stack frame
;   r2, r3      = after pass 1: read pointers into the pass-1 output
;                 (the original c / eob values are dead by then)
;-----------------------------------------------------------------------
4467*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob
4468*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4469*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4470*c0909341SAndroid Build Coastguard Worker    jnz .normal
    ; dc-only: scale c[0] by pw_2896x8, load pw_8192 into xm2, clear
    ; c[0] (eobd is known to be 0 here) and set r3d to 64 before jumping
    ; to the shared tail. NOTE(review): 64 is presumably the row count
    ; .dconly expects in r3d -- confirm at the jump target.
4471*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
4472*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
4473*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
4474*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
4475*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
4476*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
4477*c0909341SAndroid Build Coastguard Worker.normal:
    ; Re-run the prologue for the full path: 10 GPRs, 16 vector
    ; registers, 32*67 bytes of stack, plus the extra names tmp1/tmp2.
4478*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
4479*c0909341SAndroid Build Coastguard Worker    %undef cmp
4480*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*23]
4481*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*24]
    ; Keep eob-151 in r7d: negative means only the first pass-1 iteration
    ; runs, and the .fast* shortcuts are taken further down.
4482*c0909341SAndroid Build Coastguard Worker    sub                eobd, 151
4483*c0909341SAndroid Build Coastguard Worker    mov                 r7d, eobd
4484*c0909341SAndroid Build Coastguard Worker.pass1_loop:
4485*c0909341SAndroid Build Coastguard Worker    LOAD_16ROWS          cq, 64
4486*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; The moves through [rsp+32*0] and [rsp+32*1] follow the register and
    ; stack conventions of the two helpers (defined elsewhere); m7 is
    ; freed to carry the pw_8192 rounding constant into the transpose.
4487*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
4488*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
4489*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
4490*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_8192)]
4491*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4492*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+32*0]
    ; Even-numbered registers are stored around tmp1q and odd-numbered
    ; ones around tmp2q.
4493*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
4494*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m2
4495*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m4
4496*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m6
4497*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m8
4498*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m10
4499*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m12
4500*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m14
4501*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m1
4502*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m3
4503*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m5
4504*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m7
4505*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m9
4506*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m11
4507*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m13
4508*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m15
4509*c0909341SAndroid Build Coastguard Worker    add                  cq, 32
4510*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4511*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*8
    ; Adding 0x80000000 carries out only when bit 31 was already set. A
    ; negative eobd (eob < 151) therefore leaves after one iteration; a
    ; non-negative one has bit 31 set by the first add and leaves after
    ; the second.
4512*c0909341SAndroid Build Coastguard Worker    add                eobd, 0x80000000
4513*c0909341SAndroid Build Coastguard Worker    jnc .pass1_loop
    ; ---- pass 2 ----
    ; r2 = start of the tmp1q area written by the first pass-1 iteration.
    ; Each register below is assembled from 16-byte halves of two stored
    ; rows (low lane / high lane); the +0 and +16 offsets select the
    ; first or second half of a 32-byte row.
4514*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+32*23]
4515*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r2-32*4+ 0]
4516*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r2-32*2+ 0]
4517*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r2+32*0+ 0], 1
4518*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r2+32*2+ 0], 1
4519*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r2-32*4+16]
4520*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r2-32*2+16]
4521*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r2+32*0+16], 1
4522*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r2+32*2+16], 1
    ; All remaining inputs default to zero. m4-m7 are replaced with real
    ; data only when the second pass-1 iteration ran (r7d >= 0).
4523*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4524*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
4525*c0909341SAndroid Build Coastguard Worker    test                r7d, r7d
4526*c0909341SAndroid Build Coastguard Worker    jl .fast
    ; r3 = tmp1q area written by the second pass-1 iteration.
4527*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2+32*8]
4528*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r3-32*4+ 0]
4529*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r3-32*2+ 0]
4530*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r3+32*0+ 0], 1
4531*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r3+32*2+ 0], 1
4532*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r3-32*4+16]
4533*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r3-32*2+16]
4534*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r3+32*0+16], 1
4535*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r3+32*2+16], 1
4536*c0909341SAndroid Build Coastguard Worker.fast:
    ; m8 is still zero here, so [rsp] is cleared before the call.
    ; NOTE(review): the idct16 helper presumably takes one input row
    ; from [rsp] and hands one output back in [rsp+32*1] -- confirm in
    ; idct_16x16_internal_8bpc.main.
4537*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
4538*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
4539*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
4540*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
    ; Park all 16 outputs m0-m15 in 16 consecutive 32-byte slots.
4541*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
4542*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
4543*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
4544*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
4545*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
4546*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
4547*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
4548*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
4549*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4550*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
4551*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
4552*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
4553*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
4554*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
4555*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
4556*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
4557*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
    ; Inputs for the idct32 odd half: the rows of the r2 area that the
    ; idct16 step above did not read (-3, -1, +1, +3).
4558*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r2-32*3+ 0]
4559*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r2-32*1+ 0]
4560*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r2+32*1+ 0], 1
4561*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r2+32*3+ 0], 1
4562*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r2-32*3+16]
4563*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r2-32*1+16]
4564*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r2+32*1+16], 1
4565*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r2+32*3+16], 1
4566*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
4567*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
4568*c0909341SAndroid Build Coastguard Worker    test                r7d, r7d
4569*c0909341SAndroid Build Coastguard Worker    jl .fast2
4570*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r3-32*3+ 0]
4571*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r3-32*1+ 0]
4572*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r3+32*1+ 0], 1
4573*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r3+32*3+ 0], 1
4574*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r3-32*3+16]
4575*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r3-32*1+16]
4576*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r3+32*1+16], 1
4577*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r3+32*3+16], 1
4578*c0909341SAndroid Build Coastguard Worker.fast2:
4579*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4580*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
4581*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; Move r2 from the tmp1q area to the tmp2q area of pass 1 (they were
    ; set up 32*24 bytes apart). That area holds the odd-numbered
    ; transposed registers, which feed the idct64-specific stage.
4582*c0909341SAndroid Build Coastguard Worker    add                  r2, 32*24
4583*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
4584*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
4585*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
    ; First group of inputs for .main_part1: m0/m3/m4/m7 always come
    ; from the first pass-1 iteration; m1/m2/m5/m6 stay zero unless the
    ; second iteration ran.
4586*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r2-32*4+ 0]
4587*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r2-32*1+16]
4588*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r2+32*0+ 0], 1
4589*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r2+32*3+16], 1
4590*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r2-32*4+16]
4591*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r2-32*1+ 0]
4592*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r2+32*0+16], 1
4593*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r2+32*3+ 0], 1
4594*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
4595*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
4596*c0909341SAndroid Build Coastguard Worker    test                r7d, r7d
4597*c0909341SAndroid Build Coastguard Worker    jl .fast3
4598*c0909341SAndroid Build Coastguard Worker    add                  r3, 32*24
4599*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r3-32*1+16]
4600*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r3-32*4+ 0]
4601*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r3+32*3+16], 1
4602*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r3+32*0+ 0], 1
4603*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r3-32*1+ 0]
4604*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r3-32*4+16]
4605*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r3+32*3+ 0], 1
4606*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r3+32*0+16], 1
4607*c0909341SAndroid Build Coastguard Worker.fast3:
    ; Rebase r6 for the idct64 helpers, then run .main_part1 on the first
    ; input group. r6 is advanced by 8 and the scratch pointers are moved
    ; before the second call.
4608*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
4609*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4610*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
4611*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4612*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
    ; Second group of inputs, built from the rows not used by the first
    ; group (-3, -2, +1, +2), with the same zero-default scheme.
4613*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r2-32*2+ 0]
4614*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r2-32*3+16]
4615*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [r2+32*2+ 0], 1
4616*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [r2+32*1+16], 1
4617*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r2-32*2+16]
4618*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r2-32*3+ 0]
4619*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [r2+32*2+16], 1
4620*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [r2+32*1+ 0], 1
4621*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
4622*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
4623*c0909341SAndroid Build Coastguard Worker    test                r7d, r7d
4624*c0909341SAndroid Build Coastguard Worker    jl .fast4
4625*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r3-32*3+16]
4626*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r3-32*2+ 0]
4627*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [r3+32*1+16], 1
4628*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [r3+32*2+ 0], 1
4629*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r3-32*3+ 0]
4630*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r3-32*2+16]
4631*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [r3+32*1+ 0], 1
4632*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [r3+32*2+16], 1
4633*c0909341SAndroid Build Coastguard Worker.fast4:
    ; Second .main_part1 call, then the helper that finishes the
    ; transform (defined further down in the file).
4634*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4635*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
4636*c0909341SAndroid Build Coastguard Worker    RET
4637*c0909341SAndroid Build Coastguard WorkerALIGN function_align
4638*c0909341SAndroid Build Coastguard Worker%define o_base idct64_mul - 8
4639*c0909341SAndroid Build Coastguard Workercglobal_label .main_part1
4640*c0909341SAndroid Build Coastguard Worker    ; idct64 steps 1-5:
4641*c0909341SAndroid Build Coastguard Worker    ; in1/31/17/15/ 9/23/25/ 7 ->
4642*c0909341SAndroid Build Coastguard Worker    ;     t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a
4643*c0909341SAndroid Build Coastguard Worker    ; in5/27/21/11/13/19/29/ 3 ->
4644*c0909341SAndroid Build Coastguard Worker    ;     t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a
4645*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(idct64_mul+4* 0)]
4646*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(idct64_mul+4* 1)]
4647*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(idct64_mul+4* 4)]
4648*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(idct64_mul+4* 5)]
4649*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m0  ; t63a
4650*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m13 ; t32a
4651*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m1  ; t62a
4652*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12 ; t33a
4653*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(idct64_mul+4* 8)]
4654*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(idct64_mul+4* 9)]
4655*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*12)]
4656*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(idct64_mul+4*13)]
4657*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m2  ; t61a
4658*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m13 ; t34a
4659*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m3  ; t60a
4660*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12 ; t35a
4661*c0909341SAndroid Build Coastguard Worker    psubsw              m12, m0, m1   ; t33
4662*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1       ; t32
4663*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m3, m2   ; t34
4664*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2       ; t35
4665*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m8, m9   ; t61
4666*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m9       ; t60
4667*c0909341SAndroid Build Coastguard Worker    psubsw               m9, m11, m10 ; t62
4668*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m10      ; t63
4669*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2,  1, 10, 13, 15, m4076, 401 ; t34a, t61a
4670*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_401_4076)]
4671*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         9, 12, 10, 13, 15, 14, 13 ; t33a, t62a
4672*c0909341SAndroid Build Coastguard Worker    psubsw              m10, m0, m3  ; t35a
4673*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3      ; t32a
4674*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m11, m8 ; t60a
4675*c0909341SAndroid Build Coastguard Worker    paddsw              m11, m8      ; t63a
4676*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m9, m2  ; t34
4677*c0909341SAndroid Build Coastguard Worker    paddsw               m9, m2      ; t33
4678*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m12, m1 ; t61
4679*c0909341SAndroid Build Coastguard Worker    paddsw              m12, m1      ; t62
4680*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
4681*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
4682*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m12
4683*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m11
4684*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_m4017_799)]
4685*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_799_4017)]
4686*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         2,  8,  0,  1, 15, 14, 13 ; t34a, t61a
4687*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         3, 10,  0,  1, 15, 14, 13 ; t35,  t60
4688*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
4689*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
4690*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m10
4691*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m8
4692*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m3, [o(idct64_mul+4*16)]
4693*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(idct64_mul+4*17)]
4694*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m2, [o(idct64_mul+4*20)]
4695*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(idct64_mul+4*21)]
4696*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m1, [o(idct64_mul+4*24)]
4697*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(idct64_mul+4*25)]
4698*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m0, [o(idct64_mul+4*28)]
4699*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m8, [o(idct64_mul+4*29)]
4700*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m4  ; t59a
4701*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m11 ; t36a
4702*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5  ; t58a
4703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m10 ; t37a
4704*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6  ; t57a
4705*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9  ; t38a
4706*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7  ; t56a
4707*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m8  ; t39a
4708*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m4, m5 ; t37
4709*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m5     ; t36
4710*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m7, m6 ; t38
4711*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m6     ; t39
4712*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m0, m1 ; t57
4713*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m1     ; t56
4714*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m3, m2 ; t58
4715*c0909341SAndroid Build Coastguard Worker    paddsw               m3, m2     ; t59
4716*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         6,  5,  2,  9, 15, m2598, 3166 ; t38a, t57a
4717*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_3166_2598)]
4718*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1,  8,  2,  9, 15, 10,  9 ; t37a, t58a
4719*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m7, m4 ; t36a
4720*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m4     ; t39a
4721*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m0, m3 ; t59a
4722*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m3     ; t56a
4723*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m6, m1 ; t37
4724*c0909341SAndroid Build Coastguard Worker    paddsw               m6, m1     ; t38
4725*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m5, m8 ; t58
4726*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m8     ; t57
4727*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
4728*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
4729*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m0
4730*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m5
4731*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m6, [o(pw_m799_m4017)]
4732*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_m4017_799)]
4733*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4,  2,  0,  5, 15,  7,  6 ; t36,  t59
4734*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1,  3,  0,  5, 15,  7,  6 ; t37a, t58a
4735*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
4736*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m1
4737*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m3
4738*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m2
4739*c0909341SAndroid Build Coastguard Worker    ret
; Re-point o_base for the code that follows. NOTE(review): the o() addressing
; macro is defined outside this view; it presumably resolves constants relative
; to r6 and o_base -- confirm against its definition.
4740*c0909341SAndroid Build Coastguard Worker%define o_base pw_5 + 128
; .main_part2_pass1: second half of the idct64 for the first (pass 1) transform.
;   In:    tmp1q, tmp2q = pointers into the intermediate buffer; the loop runs
;          until they are equal.
;          r6 = constant base still carrying the "+ o_idct64_offset + 8" bias
;          that the 64x16 function in this file adds around its .main_part1
;          calls.
;   Uses:  m11-m14 are loaded with rotation constants here; the remaining
;          register usage is determined by .main_part2_internal and the
;          IDCT64_PART2_END macro (macro body not visible in this chunk).
4741*c0909341SAndroid Build Coastguard Worker.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub
; Remove the bias from r6 so that o() addresses the regular constants again.
4742*c0909341SAndroid Build Coastguard Worker    sub                  r6, o_idct64_offset + 8
; Coefficient pairs consumed by the ITX_MULSUB_2W calls in .main_part2_internal.
4743*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784)]
4744*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567)]
4745*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_2896_2896)]
4746*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_m2896_2896)]
4747*c0909341SAndroid Build Coastguard Worker.main_part2_pass1_loop:
; Each call moves tmp1q up by 32 bytes and tmp2q down by 32 bytes.
4748*c0909341SAndroid Build Coastguard Worker    call .main_part2_internal
; Four invocations of the final-stage macro without the optional trailing
; arguments that the pass 2 variant supplies.
4749*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      0,  7,  0,  6,  9, 10
4750*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      7,  8,  5,  0,  6,  7
4751*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      8,  2,  1,  0,  6,  7
4752*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END     15,  3,  4,  0,  6,  7
; Done once the two pointers have met.
4753*c0909341SAndroid Build Coastguard Worker    cmp               tmp1q, tmp2q
4754*c0909341SAndroid Build Coastguard Worker    jne .main_part2_pass1_loop
4755*c0909341SAndroid Build Coastguard Worker    ret
; .main_part2_internal: butterfly/rotation stage shared by .main_part2_pass1
; and .main_part2_pass2.
;   In:    tmp1q, tmp2q = pointers into the intermediate buffer; eight 32-byte
;          vectors are read at fixed offsets from them.
;          m11 = pw_1567_3784, m12 = pw_m3784_1567, m13 = pw_2896_2896,
;          m14 = pw_m2896_2896 (both callers in this file load these first).
;          m15 is passed as the fifth ITX_MULSUB_2W argument.
;          NOTE(review): presumably the rounding constant -- confirm against
;          the macro definition.
;   Out:   per the t-labels below: m0 = t32a, m7 = t63a, m5 = t39, m8 = t56,
;          m4 = t47, m3 = t48, m1 = t40a, m2 = t55a.
;          tmp1q is advanced by 32 bytes and tmp2q is moved back by 32 bytes.
;   Clobb: m9 (reloaded with pw_m1567_m3784); m3, m6 and m9 are also handed to
;          ITX_MULSUB_2W in its temporary-register positions.
; NOTE(review): cglobal_label is defined elsewhere; presumably it makes this
; label reachable from other translation units -- confirm.
4756*c0909341SAndroid Build Coastguard Workercglobal_label .main_part2_internal
; First four inputs are read relative to the incoming pointer values...
4757*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q-32*12] ; t32a
4758*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmp2q-32*13] ; t39a
4759*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q-32* 4] ; t40a
4760*c0909341SAndroid Build Coastguard Worker    mova                 m5, [tmp2q+32* 3] ; t55a
; ...then the pointers step towards each other, and the remaining four inputs
; are addressed relative to the updated values.
4761*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32
4762*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32
4763*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp1q+32* 3] ; t48a
4764*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp2q-32* 4] ; t47a
4765*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+32*11] ; t56a
4766*c0909341SAndroid Build Coastguard Worker    mova                 m7, [tmp2q+32*12] ; t63a
; Saturating add/sub butterflies. Each difference is written to a free
; register before the sum overwrites one of its operands.
4767*c0909341SAndroid Build Coastguard Worker    psubsw               m8, m0, m6 ; t39
4768*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m6     ; t32
4769*c0909341SAndroid Build Coastguard Worker    psubsw               m6, m4, m1 ; t40
4770*c0909341SAndroid Build Coastguard Worker    paddsw               m4, m1     ; t47
4771*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m2, m5 ; t55
4772*c0909341SAndroid Build Coastguard Worker    paddsw               m2, m5     ; t48
4773*c0909341SAndroid Build Coastguard Worker    psubsw               m5, m7, m3 ; t56
4774*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m3     ; t63
; Rotations with the 1567/3784 coefficient pairs held in m11/m12; m9 is loaded
; with the negated pair for the second rotation.
4775*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         5,  8,  3,  9, 15, 11, 12 ; t39a, t56a
4776*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_m1567_m3784)]
4777*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1,  6,  3,  9, 15, 12,  9 ; t40a, t55a
; Second butterfly layer.
4778*c0909341SAndroid Build Coastguard Worker    psubsw               m3, m0, m4 ; t47a
4779*c0909341SAndroid Build Coastguard Worker    paddsw               m0, m4     ; t32a
4780*c0909341SAndroid Build Coastguard Worker    psubsw               m4, m7, m2 ; t48a
4781*c0909341SAndroid Build Coastguard Worker    paddsw               m7, m2     ; t63a
4782*c0909341SAndroid Build Coastguard Worker    psubsw               m2, m5, m1 ; t40
4783*c0909341SAndroid Build Coastguard Worker    paddsw               m5, m1     ; t39
4784*c0909341SAndroid Build Coastguard Worker    psubsw               m1, m8, m6 ; t55
4785*c0909341SAndroid Build Coastguard Worker    paddsw               m8, m6     ; t56
; Final rotations with the 2896 coefficient pairs held in m13/m14.
4786*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         4,  3,  6,  9, 15, 13, 14 ; t47,  t48
4787*c0909341SAndroid Build Coastguard Worker    ITX_MULSUB_2W         1,  2,  6,  9, 15, 13, 14 ; t40a, t55a
4788*c0909341SAndroid Build Coastguard Worker    ret
; .main_part2_pass2: second-pass counterpart of .main_part2_pass1. It runs the
; same .main_part2_internal stage but passes destination row offsets to
; IDCT64_PART2_END and advances the destination pointers every iteration.
;   In:    tmp1q, tmp2q = pointers into the intermediate buffer; the loop runs
;          until they are equal.
;          dstq, strideq = destination pointer and stride.
;          r6 = constant base biased by "+ o_idct64_offset + 8" (removed here).
;   Clobb: r2, r3, r7, r8, r9 and m11-m14, plus whatever
;          .main_part2_internal and IDCT64_PART2_END touch (macro body not
;          visible in this chunk).
4789*c0909341SAndroid Build Coastguard Worker.main_part2_pass2:
4790*c0909341SAndroid Build Coastguard Worker    sub                  r6, o_idct64_offset + 8
4791*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m11, [o(pw_1567_3784)]
4792*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m12, [o(pw_m3784_1567)]
4793*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m13, [o(pw_2896_2896)]
; Precompute stride multiples used as row offsets by the macro invocations.
4794*c0909341SAndroid Build Coastguard Worker    lea                  r9, [strideq*5]    ; stride*5
4795*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r9+strideq*1] ; stride*6
4796*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r9+strideq*2] ; stride*7
4797*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r3+strideq*2] ; stride*8
; r2 starts seven rows below dstq and moves up one row per iteration while
; dstq moves down one row.
4798*c0909341SAndroid Build Coastguard Worker    lea                  r2, [dstq+r7]
4799*c0909341SAndroid Build Coastguard Worker.main_part2_pass2_loop:
; m14 has to be reloaded on every iteration: it holds the rotation constant
; for .main_part2_internal and is then overwritten with pw_2048 for the
; IDCT64_PART2_END invocations.
4800*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_m2896_2896)]
4801*c0909341SAndroid Build Coastguard Worker    call .main_part2_internal
4802*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m14, [o(pw_2048)]
; The four trailing arguments are row offsets built from the stride multiples
; computed above.
4803*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      0,  7,  0,  6,  9, 10, strideq*0, r3*4, r8*4, r7*8
4804*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      7,  8,  5,  0,  6,  7, strideq*0, r3*4, r8*4, r7*8
4805*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END      8,  2,  1,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
4806*c0909341SAndroid Build Coastguard Worker    IDCT64_PART2_END     15,  3,  4,  0,  6,  7, strideq*8, r8*2, r9*8, r3*8
4807*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4808*c0909341SAndroid Build Coastguard Worker    sub                  r2, strideq
; Done once the two buffer pointers have met.
4809*c0909341SAndroid Build Coastguard Worker    cmp               tmp1q, tmp2q
4810*c0909341SAndroid Build Coastguard Worker    jne .main_part2_pass2_loop
4811*c0909341SAndroid Build Coastguard Worker    ret
4812*c0909341SAndroid Build Coastguard Worker
; inv_txfm_add_dct_dct_64x16_8bpc(dst, stride, c, eob)
; Inverse DCT/DCT transform whose result is added to the pixels at dst.
;   dst    = destination pixels (read, modified and written back)
;   stride = distance between destination rows in bytes
;   c      = coefficient buffer; every coefficient that is read is zeroed again
;   eob    = zero selects the DC-only shortcut, non-zero the full transform
; The function starts with a minimal register allocation (4 GPRs, no vector
; registers) for the DC-only path and re-issues PROLOGUE at .normal to obtain
; 7 GPRs, 16 vector registers and 32*67 bytes of stack.
4813*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob
; r6 = base used by the o() constant-addressing macro.
4814*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4815*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4816*c0909341SAndroid Build Coastguard Worker    jnz .normal
; ---- DC-only path (eob == 0) ----
; xm0 = pmulhrsw(pw_2896x8, c[0]); xm2 carries the next scale factor into
; .dconly.
4817*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
4818*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
4819*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
; eobd is known to be zero here, so this clears the DC coefficient.
4820*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
; Row counter for .dconly_loop. NOTE(review): this relies on r3d being zero at
; this point, i.e. on r3 being the register behind eob (fourth named
; argument) -- confirm against the x86inc argument mapping.
4821*c0909341SAndroid Build Coastguard Worker    or                  r3d, 16
; .dconly expects xm0 = partially scaled DC, xm1 = pw_2896x8, xm2 = scale
; factor, r3d = number of rows.
; NOTE(review): presumably also entered from other transform sizes in this
; file -- confirm.
4822*c0909341SAndroid Build Coastguard Worker.dconly:
4823*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
4824*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_2048)]
4825*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
4826*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm2
; m0 = final DC offset replicated to every word; m1 = zero for the byte->word
; unpacks below.
4827*c0909341SAndroid Build Coastguard Worker    vpbroadcastw         m0, xm0
4828*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
; Per row: load 64 destination bytes, widen to words, add the DC offset, pack
; back to bytes with unsigned saturation and store.
4829*c0909341SAndroid Build Coastguard Worker.dconly_loop:
4830*c0909341SAndroid Build Coastguard Worker    mova                 m2, [dstq+32*0]
4831*c0909341SAndroid Build Coastguard Worker    mova                 m3, [dstq+32*1]
4832*c0909341SAndroid Build Coastguard Worker    punpckhbw            m4, m2, m1
4833*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m1
4834*c0909341SAndroid Build Coastguard Worker    punpckhbw            m5, m3, m1
4835*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m1
4836*c0909341SAndroid Build Coastguard Worker    paddw                m4, m0
4837*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
4838*c0909341SAndroid Build Coastguard Worker    paddw                m5, m0
4839*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
4840*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m4
4841*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m5
4842*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*0], m2
4843*c0909341SAndroid Build Coastguard Worker    mova        [dstq+32*1], m3
4844*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
4845*c0909341SAndroid Build Coastguard Worker    dec                 r3d
4846*c0909341SAndroid Build Coastguard Worker    jg .dconly_loop
4847*c0909341SAndroid Build Coastguard Worker    RET
; ---- Full transform (eob != 0) ----
4848*c0909341SAndroid Build Coastguard Worker.normal:
; Enlarge the register/stack allocation and name r4/r5 tmp1/tmp2.
4849*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2
; Pass 1, idct16 part: load coefficient vectors 0, 4, ..., 28 and clear them in
; the buffer. m8-m14 and [rsp] are zeroed before the call.
; NOTE(review): [rsp] is presumably where .main expects its last input --
; confirm against idct_16x16_internal_8bpc.
4850*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+32*0, 32*4
4851*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4852*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
4853*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
4854*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
4855*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
4856*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
; m1 is fetched from the stack before being stored.
; NOTE(review): presumably .main leaves that output at [rsp+32*1] -- confirm.
4857*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
; Store the sixteen result vectors as two groups of eight around tmp1q.
4858*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
4859*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
4860*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
4861*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
4862*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
4863*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
4864*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
4865*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
4866*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4867*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
4868*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
4869*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
4870*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
4871*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
4872*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
4873*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
4874*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
; Pass 1, idct32 odd half: coefficient vectors 2, 6, ..., 30, cleared after
; loading. tmp2q is placed eight vectors above tmp1q for the helper.
4875*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+32*2, 32*4
4876*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4877*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
4878*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4879*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
4880*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; Pass 1, idct64 odd part: two .main_part1 calls, each fed with eight odd
; coefficient vectors, followed by .main_part2_pass1. m15 = pd_2048 for the
; helpers.
4881*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
4882*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
4883*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
4884*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 1]
4885*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*31]
4886*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*17]
4887*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*15]
4888*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32* 9]
4889*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*23]
4890*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*25]
4891*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32* 7]
4892*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4893*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
; Bias r6 for .main_part1; the second call uses a further +8 bias.
; .main_part2_pass1 subtracts "o_idct64_offset + 8" again.
4894*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
4895*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4896*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
4897*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
4898*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
4899*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+32* 5]
4900*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+32*27]
4901*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+32*21]
4902*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+32*11]
4903*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+32*13]
4904*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+32*19]
4905*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+32*29]
4906*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+32* 3]
4907*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4908*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
4909*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
4910*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
; ---- Pass 2 ----
; Move tmp1q back by 36 vectors for the reads below.
; NOTE(review): presumably to the start of the pass 1 output -- confirm.
; r2 = stride*3; tmp2d = loop counter (4 iterations, each advancing the
; destination by 16 bytes).
4911*c0909341SAndroid Build Coastguard Worker    sub               tmp1q, 32*36
4912*c0909341SAndroid Build Coastguard Worker    lea                  r2, [strideq*3]
4913*c0909341SAndroid Build Coastguard Worker    mov               tmp2d, 4
4914*c0909341SAndroid Build Coastguard Worker.pass2_loop:
; Gather sixteen vectors. The low 128-bit lane of each comes from the group of
; eight vectors at r3 (= tmp1q - 32*8) and the high lane from the group at
; tmp1q. m0-m7 take bytes 0-15 of each stored vector, m8-m15 take bytes 16-31.
4915*c0909341SAndroid Build Coastguard Worker    lea                  r3, [tmp1q-32*8]
4916*c0909341SAndroid Build Coastguard Worker    mova                xm0, [r3   -32*4]
4917*c0909341SAndroid Build Coastguard Worker    mova                xm1, [r3   -32*3]
4918*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tmp1q-32*4], 1
4919*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tmp1q-32*3], 1
4920*c0909341SAndroid Build Coastguard Worker    mova                xm2, [r3   -32*2]
4921*c0909341SAndroid Build Coastguard Worker    mova                xm3, [r3   -32*1]
4922*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tmp1q-32*2], 1
4923*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tmp1q-32*1], 1
4924*c0909341SAndroid Build Coastguard Worker    mova                xm4, [r3   +32*0]
4925*c0909341SAndroid Build Coastguard Worker    mova                xm5, [r3   +32*1]
4926*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [tmp1q+32*0], 1
4927*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [tmp1q+32*1], 1
4928*c0909341SAndroid Build Coastguard Worker    mova                xm6, [r3   +32*2]
4929*c0909341SAndroid Build Coastguard Worker    mova                xm7, [r3   +32*3]
4930*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [tmp1q+32*2], 1
4931*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [tmp1q+32*3], 1
4932*c0909341SAndroid Build Coastguard Worker    mova                xm8, [r3   -32*4+16]
4933*c0909341SAndroid Build Coastguard Worker    mova                xm9, [r3   -32*3+16]
4934*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [tmp1q-32*4+16], 1
4935*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [tmp1q-32*3+16], 1
4936*c0909341SAndroid Build Coastguard Worker    mova               xm10, [r3   -32*2+16]
4937*c0909341SAndroid Build Coastguard Worker    mova               xm11, [r3   -32*1+16]
4938*c0909341SAndroid Build Coastguard Worker    vinserti128         m10, [tmp1q-32*2+16], 1
4939*c0909341SAndroid Build Coastguard Worker    vinserti128         m11, [tmp1q-32*1+16], 1
4940*c0909341SAndroid Build Coastguard Worker    mova               xm12, [r3   +32*0+16]
4941*c0909341SAndroid Build Coastguard Worker    mova               xm13, [r3   +32*1+16]
4942*c0909341SAndroid Build Coastguard Worker    vinserti128         m12, [tmp1q+32*0+16], 1
4943*c0909341SAndroid Build Coastguard Worker    vinserti128         m13, [tmp1q+32*1+16], 1
4944*c0909341SAndroid Build Coastguard Worker    mova               xm14, [r3   +32*2+16]
4945*c0909341SAndroid Build Coastguard Worker    mova               xm15, [r3   +32*3+16]
4946*c0909341SAndroid Build Coastguard Worker    vinserti128         m14, [tmp1q+32*2+16], 1
4947*c0909341SAndroid Build Coastguard Worker    vinserti128         m15, [tmp1q+32*3+16], 1
; m6/m7 are spilled so that m7 can hold the pw_8192 rounding multiplier for
; the transpose helper.
4948*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m6
4949*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*1], m7
4950*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_8192)]
4951*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
4952*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
; m15 is parked on the stack so the register can hold the pw_2048 output
; scale. It is multiplied straight from memory further down.
4953*c0909341SAndroid Build Coastguard Worker    mova         [rsp+32*0], m15
4954*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pw_2048)]
; m1 is absent from this list; it is produced by the pmulhrsw from
; [rsp+32*1] two lines below.
4955*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7
4956*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            2,  3,  1,  2, strideq*2, r2
4957*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m15, [rsp+32*1]
4958*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            0,  1,  2,  3, strideq*0, strideq*1
; From here on "dstq" textually means r3, so the WRITE_16X2 invocations below
; address rows relative to r3, which is stepped down four rows at a time. The
; real destination register is left untouched until the "add r0, 16" at the
; end of the loop body.
4959*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+strideq*4]
4960*c0909341SAndroid Build Coastguard Worker    %define dstq r3
4961*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            4,  5,  2,  3, strideq*0, strideq*1
4962*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            6,  7,  2,  3, strideq*2, r2
4963*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
4964*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+strideq*4]
4965*c0909341SAndroid Build Coastguard Worker    WRITE_16X2            8,  9,  2,  3, strideq*0, strideq*1
4966*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           10, 11,  2,  3, strideq*2, r2
; Scale the parked m15 value: m15 = pmulhrsw(pw_2048, saved value).
4967*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, [rsp+32*0]
4968*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+strideq*4]
4969*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           12, 13,  2,  3, strideq*0, strideq*1
4970*c0909341SAndroid Build Coastguard Worker    WRITE_16X2           14, 15,  2,  3, strideq*2, r2
; Next group of sixteen intermediate vectors, and 16 bytes further along the
; destination row.
; NOTE(review): r0 is presumably the register behind the original dst
; argument; it is named directly because dstq is redefined above -- confirm.
4971*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
4972*c0909341SAndroid Build Coastguard Worker    add                  r0, 16
4973*c0909341SAndroid Build Coastguard Worker    dec               tmp2d
4974*c0909341SAndroid Build Coastguard Worker    jg .pass2_loop
4975*c0909341SAndroid Build Coastguard Worker    RET
4976*c0909341SAndroid Build Coastguard Worker
4977*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob
4978*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
4979*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
4980*c0909341SAndroid Build Coastguard Worker    jnz .normal
4981*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
4982*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
4983*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
4984*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
4985*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
4986*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
4987*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly
4988*c0909341SAndroid Build Coastguard Worker.normal:
4989*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2
4990*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
4991*c0909341SAndroid Build Coastguard Worker    lea                r10d, [eobq-136]
4992*c0909341SAndroid Build Coastguard Worker    sar                r10d, 31
4993*c0909341SAndroid Build Coastguard Worker.pass1_loop:
4994*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*16]
4995*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*1, 64*2, 1
4996*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
4997*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15
4998*c0909341SAndroid Build Coastguard Worker    test               r10b, r10b
4999*c0909341SAndroid Build Coastguard Worker    jnz .fast
5000*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H   cq+64*17, 64*2, 2
5001*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
5002*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H   cq+64*16, 64*2, 1
5003*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
5004*c0909341SAndroid Build Coastguard Worker    pxor                m15, m15
5005*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \
5006*c0909341SAndroid Build Coastguard Worker                                24, 25, 26, 27, 28, 29, 30, 31
5007*c0909341SAndroid Build Coastguard Worker    jmp .idct16
5008*c0909341SAndroid Build Coastguard Worker.fast:
5009*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5010*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5011*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5012*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
5013*c0909341SAndroid Build Coastguard Worker.idct16:
5014*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*0, 64*2, 1
5015*c0909341SAndroid Build Coastguard Worker    pxor                m15, m15
5016*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14
5017*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
5018*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end
5019*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_16384)]
5020*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
5021*c0909341SAndroid Build Coastguard Worker    lea                  r3, [tmp1q+32*48]
5022*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp]
5023*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*4], m0
5024*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*3], m2
5025*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*2], m4
5026*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*1], m6
5027*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*0], m8
5028*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*1], m10
5029*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*2], m12
5030*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*3], m14
5031*c0909341SAndroid Build Coastguard Worker    add                  r3, 32*24
5032*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*4], m1
5033*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*3], m3
5034*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*2], m5
5035*c0909341SAndroid Build Coastguard Worker    mova          [r3-32*1], m7
5036*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*0], m9
5037*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*1], m11
5038*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*2], m13
5039*c0909341SAndroid Build Coastguard Worker    mova          [r3+32*3], m15
5040*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m9, [o(pw_16384)]
5041*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m9, [tmp1q-32*4]
5042*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9, [tmp1q-32*3]
5043*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m9, [tmp1q-32*2]
5044*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9, [tmp1q-32*1]
5045*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, [tmp1q+32*0]
5046*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, [tmp1q+32*1]
5047*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9, [tmp1q+32*2]
5048*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, [tmp1q+32*3]
5049*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5050*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5051*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m9, [tmp2q-32*4]
5052*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m1
5053*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9, [tmp2q-32*3]
5054*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m2
5055*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m9, [tmp2q-32*2]
5056*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m3
5057*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m9, [tmp2q-32*1]
5058*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m4
5059*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m9, [tmp2q+32*0]
5060*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m5
5061*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m9, [tmp2q+32*1]
5062*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m6
5063*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m9, [tmp2q+32*2]
5064*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m7
5065*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m9, [tmp2q+32*3]
5066*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
5067*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m0
5068*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m1
5069*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m2
5070*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m3
5071*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m4
5072*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m5
5073*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m6
5074*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m7
5075*c0909341SAndroid Build Coastguard Worker    add                  cq, 32
5076*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5077*c0909341SAndroid Build Coastguard Worker    add                r10d, 0x80000000
5078*c0909341SAndroid Build Coastguard Worker    jnc .pass1_loop
5079*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+32*55]
5080*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r2+32*24]
5081*c0909341SAndroid Build Coastguard Worker.pass2_loop:
5082*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2+32*8]
5083*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r7+32*8]
5084*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*4]
5085*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r2-32*2]
5086*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r2+32*0]
5087*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*2]
5088*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5089*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
5090*c0909341SAndroid Build Coastguard Worker    test               r10b, r10b
5091*c0909341SAndroid Build Coastguard Worker    jnz .fast2
5092*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-32*4]
5093*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*2]
5094*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*0]
5095*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+32*2]
5096*c0909341SAndroid Build Coastguard Worker.fast2:
5097*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
5098*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*39]
5099*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
5100*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
5101*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5102*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
5103*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
5104*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
5105*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
5106*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
5107*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
5108*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
5109*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5110*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
5111*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
5112*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
5113*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
5114*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
5115*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
5116*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
5117*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
5118*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*3]
5119*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r2-32*1]
5120*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r2+32*1]
5121*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*3]
5122*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5123*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
5124*c0909341SAndroid Build Coastguard Worker    test               r10b, r10b
5125*c0909341SAndroid Build Coastguard Worker    jnz .fast3
5126*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-32*3]
5127*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*1]
5128*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*1]
5129*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+32*3]
5130*c0909341SAndroid Build Coastguard Worker.fast3:
5131*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5132*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
5133*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5134*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
5135*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5136*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
5137*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r7-32*4]
5138*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r7+32*3]
5139*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r7+32*0]
5140*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r7-32*1]
5141*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5142*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
5143*c0909341SAndroid Build Coastguard Worker    test               r10b, r10b
5144*c0909341SAndroid Build Coastguard Worker    jnz .fast4
5145*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r8+32*3]
5146*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r8-32*4]
5147*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r8-32*1]
5148*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r8+32*0]
5149*c0909341SAndroid Build Coastguard Worker.fast4:
5150*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
5151*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5152*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
5153*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5154*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
5155*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r7-32*2]
5156*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r7+32*1]
5157*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r7+32*2]
5158*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r7-32*3]
5159*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5160*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
5161*c0909341SAndroid Build Coastguard Worker    test               r10b, r10b
5162*c0909341SAndroid Build Coastguard Worker    jnz .fast5
5163*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r8+32*1]
5164*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r8-32*2]
5165*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r8-32*3]
5166*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r8+32*2]
5167*c0909341SAndroid Build Coastguard Worker.fast5:
5168*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5169*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
5170*c0909341SAndroid Build Coastguard Worker    add                r10d, 0x80000000
5171*c0909341SAndroid Build Coastguard Worker    jc .ret
5172*c0909341SAndroid Build Coastguard Worker    lea                  r2, [rsp+32*7]
5173*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r2+32*16]
5174*c0909341SAndroid Build Coastguard Worker    sub                dstq, r8
5175*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4+16]
5176*c0909341SAndroid Build Coastguard Worker    jmp .pass2_loop
5177*c0909341SAndroid Build Coastguard Worker.ret:
5178*c0909341SAndroid Build Coastguard Worker    RET
5179*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; inv_txfm_add_dct_dct_64x32_8bpc(dst, stride, c, eob)
;
; Inverse DCT_DCT transform-and-add entry point for the 64x32, 8bpc case.
;   eob == 0 : DC-only shortcut; the DC term is scaled here and the work
;              is finished by the shared 64x16 .dconly tail.
;   eob != 0 : two-pass transform through a 32*131-byte stack buffer.
;              Pass 1 runs once (eob < 136) or twice (eob >= 136) and
;              consumes/clears the coefficients at cq; pass 2 runs four
;              times, advancing dst by 16 bytes per iteration.
; NOTE(review): the register/stack contracts of the helpers called below
; (idct_16x16 .main, 16x32 .main_oddhalf[_fast] / .pass2_end, 16x64
; .main_part1 / .main_part2_pass1) are defined elsewhere in the file;
; comments that depend on them are hedged.
;-----------------------------------------------------------------------
5180*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob
    ; r6 = base pointer for the o(...) constant references used below.
5181*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
    ; eob == 0 falls through to the DC-only path.
5182*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5183*c0909341SAndroid Build Coastguard Worker    jnz .normal
    ; DC-only: xm0 = pmulhrsw(pmulhrsw([cq], pw_2896x8), pw_2896x8),
    ; xm2 = pw_16384 is handed to .dconly (presumably its rounding factor).
5184*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
5185*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
5186*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_16384)]
    ; eobd is known to be zero here, so this clears the first dword at cq.
5187*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
5188*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1
    ; NOTE(review): r3 is presumably the eob register (4th named argument),
    ; which is zero on this path, making this "r3d = 32" - presumably the
    ; row count expected by .dconly; confirm against the 64x16 function.
5189*c0909341SAndroid Build Coastguard Worker    or                  r3d, 32
5190*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
5191*c0909341SAndroid Build Coastguard Worker.normal:
    ; Full transform: re-declare the frame with 9 GPRs, 16 vector registers
    ; and 32*131 bytes of stack; tmp1..tmp4 and base become named registers.
5192*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \
5193*c0909341SAndroid Build Coastguard Worker                                            base, tmp3, tmp4
    ; tmp1q = output cursor into the stack buffer.
5194*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
    ; tmp4d = eob-136 doubles as the pass-1 loop control and the pass-2 fast
    ; flag: it is negative (bit 31 set) exactly when eob < 136.
5195*c0909341SAndroid Build Coastguard Worker    lea               tmp4d, [eobq-136]
5196*c0909341SAndroid Build Coastguard Worker.pass1_loop:
    ; --- even part: rows 0,4,...,28 through the 16-point IDCT ---
    ; LOAD_8ROWS is defined elsewhere; presumably it loads the 8 rows at
    ; cq+64*0 with a 64*4 byte step into m0-m7, the trailing 1 presumably
    ; selecting the rect2 pre-scale - confirm against the macro.
5197*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*0, 64*4, 1
    ; Clear the coefficient rows just consumed and zero the remaining
    ; IDCT16 inputs (m8-m14 in registers, one more vector at [rsp]).
5198*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5199*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
5200*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5201*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
5202*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; m1 is reloaded from the stack before the store; presumably .main
    ; leaves that output at [rsp+32*1] - confirm.
5203*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
    ; Store the 16 result vectors m0-m15 as two groups of 8 rows.
5204*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5205*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
5206*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
5207*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
5208*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
5209*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
5210*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
5211*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
5212*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5213*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
5214*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
5215*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
5216*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
5217*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
5218*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
5219*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
5220*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
    ; --- rows 2,6,...,30: odd half of the 32-point stage ---
5221*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*2, 64*4, 1
5222*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5223*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
    ; tmp1q/tmp2q are set to two consecutive 8-row groups for the callee.
5224*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5225*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
5226*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; --- odd rows, first group (1,31,17,15,9,23,25,7) ---
    ; m15 = pd_2048, presumably the rounding constant used by main_part1.
5227*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
5228*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5229*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
    ; These rows are loaded directly, so the pw_2896x8 pre-scale is applied
    ; explicitly with pmulhrsw; m7 is both the factor and the last output.
5230*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2896x8)]
5231*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, [cq+64* 1]
5232*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7, [cq+64*31]
5233*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7, [cq+64*17]
5234*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7, [cq+64*15]
5235*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, [cq+64* 9]
5236*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, [cq+64*23]
5237*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, [cq+64*25]
5238*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7,     [cq+64* 7]
5239*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5240*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
    ; Rebase r6 by o_idct64_offset for the callee.
5241*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
5242*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
    ; --- odd rows, second group (5,27,21,11,13,19,29,3) ---
    ; r6 is still rebased, so the offset is subtracted inside o(...) to
    ; address pw_2896x8.
5243*c0909341SAndroid Build Coastguard Worker    vpbroadcastd         m7, [o(pw_2896x8-(o_idct64_offset))]
5244*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
5245*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5246*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
5247*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7, [cq+64* 5]
5248*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7, [cq+64*27]
5249*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m7, [cq+64*21]
5250*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m7, [cq+64*11]
5251*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7, [cq+64*13]
5252*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7, [cq+64*19]
5253*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7, [cq+64*29]
5254*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7,     [cq+64* 3]
5255*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5256*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
5257*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5258*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
    ; NOTE(review): o(...) is used un-rebased again below, so r6 is
    ; presumably restored by one of the callees above - confirm.
    ; Rewind tmp1q, then round by pw_16384 and transpose the pass-1 output
    ; in place (helper consumes tmp1q and m10).
5259*c0909341SAndroid Build Coastguard Worker    sub               tmp1q, 32*44
5260*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_16384)]
5261*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
    ; Advance to the next 32-byte column group of coefficients.
5262*c0909341SAndroid Build Coastguard Worker    add                  cq, 32
    ; Adding 2^31 carries iff bit 31 was set: with eob < 136 the first add
    ; carries (one iteration); otherwise the first add only sets bit 31 and
    ; the second add carries (two iterations).
5263*c0909341SAndroid Build Coastguard Worker    add               tmp4d, 0x80000000
5264*c0909341SAndroid Build Coastguard Worker    jnc .pass1_loop
    ; --- pass 2 setup ---
    ; r2 = dst + 19*stride and r3 = 3*stride; both registers previously
    ; held arguments that are no longer needed (r2 presumably c, r3 eob).
5265*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*15]
5266*c0909341SAndroid Build Coastguard Worker    imul                 r2, strideq, 19
5267*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
5268*c0909341SAndroid Build Coastguard Worker    add                  r2, dstq
    ; Low byte of tmp4 = iteration counter (4); the upper bits, including
    ; the bit-30 fast flag tested below, are left untouched.
5269*c0909341SAndroid Build Coastguard Worker    mov               tmp4b, 4
5270*c0909341SAndroid Build Coastguard Worker.pass2_loop:
    ; tmp2q addresses the data 32*64 bytes above tmp1q (the region written
    ; by the second pass-1 iteration, when there was one).
5271*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*64]
5272*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp1q-32*4, 32
    ; Bit 30 of tmp4d is set iff eob < 136 (eob-136 was negative and only
    ; bit 31 was flipped by the single add above): take the fast path that
    ; treats the upper-half inputs as zero.
5273*c0909341SAndroid Build Coastguard Worker    test              tmp4d, 0x40000000
5274*c0909341SAndroid Build Coastguard Worker    jnz .fast
    ; Full path: also feed the upper-half rows (presumably into m8-m15 via
    ; LOAD_8ROWS_H) to the odd-half and IDCT16 stages.
5275*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H tmp2q-32*4, 32
5276*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
5277*c0909341SAndroid Build Coastguard Worker    lea               tmp3q, [tmp2q-32*8]
5278*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS_H tmp3q-32*4, 32
    ; The last input vector is passed to idct16 .main through [rsp].
5279*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m15
5280*c0909341SAndroid Build Coastguard Worker    jmp .idct16
5281*c0909341SAndroid Build Coastguard Worker.fast:
5282*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
    ; Upper-half IDCT16 inputs are zero on the fast path.
5283*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5284*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5285*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
5286*c0909341SAndroid Build Coastguard Worker.idct16:
    ; Even part: the 8 rows stored just below the tmp1q group.
5287*c0909341SAndroid Build Coastguard Worker    lea               tmp3q, [tmp1q-32*8]
5288*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS   tmp3q-32*4, 32
5289*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
    ; pass2_end presumably combines the halves and adds the result to the
    ; destination, advancing dstq/r2 as it goes - confirm in the 16x32 code.
5290*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end
    ; Next 16-row group of the buffer; step both output pointers back by
    ; 3*stride and right by 16 bytes for the next iteration.
5291*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5292*c0909341SAndroid Build Coastguard Worker    sub                dstq, r3
5293*c0909341SAndroid Build Coastguard Worker    lea                  r2, [r2+r3+16]
5294*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
    ; Four iterations in total (counter initialised to 4 above).
5295*c0909341SAndroid Build Coastguard Worker    dec               tmp4b
5296*c0909341SAndroid Build Coastguard Worker    jg .pass2_loop
5297*c0909341SAndroid Build Coastguard Worker    RET
;-----------------------------------------------------------------------
; .transpose_round_interleave
; Local helper, also called from the 64x64 function. Rounds and
; transposes, in place, four consecutive 16-row groups of 32-byte rows.
;   In:    tmp1q = pointer to row 4 of the first group (rows are
;                  addressed as tmp1q-32*4 .. tmp1q+32*3, and the same
;                  offsets from tmp1q+32*8 for the second 8 rows)
;          m10   = pmulhrsw rounding factor chosen by the caller
;                  (pw_16384 for 64x32, pw_8192 for 64x64)
;   Out:   tmp1q advanced by 4 * 32*16 bytes
;   Clobb: tmp2q, tmp3d, m0-m9, flags, plus whatever the shared
;          transpose8x8 routine clobbers (not visible here)
;-----------------------------------------------------------------------
5298*c0909341SAndroid Build Coastguard WorkerALIGN function_align
5299*c0909341SAndroid Build Coastguard Worker.transpose_round_interleave:
    ; tmp3d = number of 16-row groups to process.
5300*c0909341SAndroid Build Coastguard Worker    mov               tmp3d, 4
5301*c0909341SAndroid Build Coastguard Worker.loop:
    ; tmp2q = second 8 rows of the current group.
5302*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
    ; Gather the low 16 bytes of each row: tmp1q rows go to the low lane,
    ; the matching tmp2q rows to the high lane of m0-m7.
5303*c0909341SAndroid Build Coastguard Worker    mova                xm0, [tmp1q-32*4]
5304*c0909341SAndroid Build Coastguard Worker    mova                xm1, [tmp1q-32*3]
5305*c0909341SAndroid Build Coastguard Worker    vinserti128          m0, [tmp2q-32*4], 1
5306*c0909341SAndroid Build Coastguard Worker    vinserti128          m1, [tmp2q-32*3], 1
5307*c0909341SAndroid Build Coastguard Worker    mova                xm2, [tmp1q-32*2]
5308*c0909341SAndroid Build Coastguard Worker    mova                xm3, [tmp1q-32*1]
5309*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tmp2q-32*2], 1
5310*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tmp2q-32*1], 1
5311*c0909341SAndroid Build Coastguard Worker    mova                xm4, [tmp1q+32*0]
5312*c0909341SAndroid Build Coastguard Worker    mova                xm5, [tmp1q+32*1]
5313*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [tmp2q+32*0], 1
5314*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [tmp2q+32*1], 1
5315*c0909341SAndroid Build Coastguard Worker    mova                xm6, [tmp1q+32*2]
5316*c0909341SAndroid Build Coastguard Worker    mova                xm7, [tmp1q+32*3]
5317*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [tmp2q+32*2], 1
5318*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [tmp2q+32*3], 1
    ; Round by the caller-supplied factor, then transpose m0-m7.
5319*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
5320*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    ; The upper 16 bytes (+16) of rows -4/-3 are read into m8/m9 BEFORE the
    ; full 32-byte stores below overwrite those rows; m0/m1 cannot be used
    ; yet because they still hold transposed results awaiting store.
5321*c0909341SAndroid Build Coastguard Worker    mova                xm8, [tmp1q-32*4+16]
5322*c0909341SAndroid Build Coastguard Worker    mova                xm9, [tmp1q-32*3+16]
5323*c0909341SAndroid Build Coastguard Worker    vinserti128          m8, [tmp2q-32*4+16], 1
5324*c0909341SAndroid Build Coastguard Worker    vinserti128          m9, [tmp2q-32*3+16], 1
    ; Store the first transposed block, alternating between the tmp1q and
    ; tmp2q halves (m0,m2,m4,m6 -> tmp1q rows; m1,m3,m5,m7 -> tmp2q rows).
5325*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5326*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*4], m1
5327*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m2
5328*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*3], m3
    ; Same read-before-overwrite ordering for rows -2/-1.
5329*c0909341SAndroid Build Coastguard Worker    mova                xm2, [tmp1q-32*2+16]
5330*c0909341SAndroid Build Coastguard Worker    mova                xm3, [tmp1q-32*1+16]
5331*c0909341SAndroid Build Coastguard Worker    vinserti128          m2, [tmp2q-32*2+16], 1
5332*c0909341SAndroid Build Coastguard Worker    vinserti128          m3, [tmp2q-32*1+16], 1
5333*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m4
5334*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*2], m5
5335*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m6
5336*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q-32*1], m7
    ; Rows 0..3 are only overwritten at the end, so their +16 halves can be
    ; loaded straight into m4-m7.
5337*c0909341SAndroid Build Coastguard Worker    mova                xm4, [tmp1q+32*0+16]
5338*c0909341SAndroid Build Coastguard Worker    mova                xm5, [tmp1q+32*1+16]
5339*c0909341SAndroid Build Coastguard Worker    vinserti128          m4, [tmp2q+32*0+16], 1
5340*c0909341SAndroid Build Coastguard Worker    vinserti128          m5, [tmp2q+32*1+16], 1
5341*c0909341SAndroid Build Coastguard Worker    mova                xm6, [tmp1q+32*2+16]
5342*c0909341SAndroid Build Coastguard Worker    mova                xm7, [tmp1q+32*3+16]
5343*c0909341SAndroid Build Coastguard Worker    vinserti128          m6, [tmp2q+32*2+16], 1
5344*c0909341SAndroid Build Coastguard Worker    vinserti128          m7, [tmp2q+32*3+16], 1
    ; Round the second block; m8/m9 are moved into m0/m1 by the 3-operand
    ; multiply so the transpose again operates on m0-m7.
5345*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m8, m10
5346*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m9, m10
5347*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
5348*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8
    ; Store the second transposed block into rows 0..3, interleaved the
    ; same way as the first.
5349*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m0
5350*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*0], m1
5351*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m2
5352*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*1], m3
5353*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m4
5354*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*2], m5
5355*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m6
5356*c0909341SAndroid Build Coastguard Worker    mova       [tmp2q+32*3], m7
    ; Next 16-row group.
5357*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5358*c0909341SAndroid Build Coastguard Worker    dec               tmp3d
5359*c0909341SAndroid Build Coastguard Worker    jg .loop
5360*c0909341SAndroid Build Coastguard Worker    ret
5361*c0909341SAndroid Build Coastguard Worker
5362*c0909341SAndroid Build Coastguard Workercglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob
5363*c0909341SAndroid Build Coastguard Worker    lea                  r6, [o_base]
5364*c0909341SAndroid Build Coastguard Worker    test               eobd, eobd
5365*c0909341SAndroid Build Coastguard Worker    jnz .normal
5366*c0909341SAndroid Build Coastguard Worker    movd                xm1, [o(pw_2896x8)]
5367*c0909341SAndroid Build Coastguard Worker    pmulhrsw            xm0, xm1, [cq]
5368*c0909341SAndroid Build Coastguard Worker    movd                xm2, [o(pw_8192)]
5369*c0909341SAndroid Build Coastguard Worker    mov                [cq], eobd
5370*c0909341SAndroid Build Coastguard Worker    or                  r3d, 64
5371*c0909341SAndroid Build Coastguard Worker    jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
5372*c0909341SAndroid Build Coastguard Worker.normal:
5373*c0909341SAndroid Build Coastguard Worker    PROLOGUE              0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2
5374*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*71]
5375*c0909341SAndroid Build Coastguard Worker    lea                r10d, [eobq-136]
5376*c0909341SAndroid Build Coastguard Worker.pass1_loop:
5377*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*0, 64*4
5378*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5379*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28
5380*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m8}, m9, m10, m11, m12, m13, m14
5381*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m8
5382*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
5383*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
5384*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5385*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
5386*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
5387*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
5388*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
5389*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
5390*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
5391*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
5392*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5393*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
5394*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
5395*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
5396*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
5397*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
5398*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
5399*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
5400*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
5401*c0909341SAndroid Build Coastguard Worker    LOAD_8ROWS      cq+64*2, 64*4
5402*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5403*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30
5404*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5405*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
5406*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5407*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
5408*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5409*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
5410*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 1]
5411*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*31]
5412*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*17]
5413*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*15]
5414*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64* 9]
5415*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*23]
5416*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*25]
5417*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64* 7]
5418*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5419*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7
5420*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
5421*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5422*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
5423*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5424*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
5425*c0909341SAndroid Build Coastguard Worker    mova                 m0, [cq+64* 5]
5426*c0909341SAndroid Build Coastguard Worker    mova                 m1, [cq+64*27]
5427*c0909341SAndroid Build Coastguard Worker    mova                 m2, [cq+64*21]
5428*c0909341SAndroid Build Coastguard Worker    mova                 m3, [cq+64*11]
5429*c0909341SAndroid Build Coastguard Worker    mova                 m4, [cq+64*13]
5430*c0909341SAndroid Build Coastguard Worker    mova                 m5, [cq+64*19]
5431*c0909341SAndroid Build Coastguard Worker    mova                 m6, [cq+64*29]
5432*c0909341SAndroid Build Coastguard Worker    mova                 m7, [cq+64* 3]
5433*c0909341SAndroid Build Coastguard Worker    pxor                 m8, m8
5434*c0909341SAndroid Build Coastguard Worker    REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3
5435*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5436*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1
5437*c0909341SAndroid Build Coastguard Worker    sub               tmp1q, 32*44
5438*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m10, [o(pw_8192)]
5439*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave
5440*c0909341SAndroid Build Coastguard Worker    add                  cq, 32
5441*c0909341SAndroid Build Coastguard Worker    add                r10d, 0x80000000
5442*c0909341SAndroid Build Coastguard Worker    jnc .pass1_loop
5443*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [rsp+32*7]
5444*c0909341SAndroid Build Coastguard Worker    mov                r10b, 4
5445*c0909341SAndroid Build Coastguard Worker.pass2_loop:
5446*c0909341SAndroid Build Coastguard Worker    lea                  r2, [tmp1q+32*64]
5447*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*4]
5448*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r2-32*2]
5449*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r2+32*0]
5450*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*2]
5451*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5452*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
5453*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m4
5454*c0909341SAndroid Build Coastguard Worker    test               r10d, 0x40000000
5455*c0909341SAndroid Build Coastguard Worker    jnz .fast
5456*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2+32*64]
5457*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-32*4]
5458*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*2]
5459*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*0]
5460*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+32*2]
5461*c0909341SAndroid Build Coastguard Worker.fast:
5462*c0909341SAndroid Build Coastguard Worker    call m(idct_16x16_internal_8bpc).main
5463*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+32*1]
5464*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m0
5465*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m1
5466*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m2
5467*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m3
5468*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m4
5469*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m5
5470*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m6
5471*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m7
5472*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5473*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*4], m8
5474*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*3], m9
5475*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*2], m10
5476*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q-32*1], m11
5477*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*0], m12
5478*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*1], m13
5479*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*2], m14
5480*c0909341SAndroid Build Coastguard Worker    mova       [tmp1q+32*3], m15
5481*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*3]
5482*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r2-32*1]
5483*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r2+32*1]
5484*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*3]
5485*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
5486*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m4}, m5, m6, m7
5487*c0909341SAndroid Build Coastguard Worker    test               r10d, 0x40000000
5488*c0909341SAndroid Build Coastguard Worker    jnz .fast2
5489*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r3-32*3]
5490*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*1]
5491*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*1]
5492*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r3+32*3]
5493*c0909341SAndroid Build Coastguard Worker.fast2:
5494*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5495*c0909341SAndroid Build Coastguard Worker    lea               tmp2q, [tmp1q+32*8]
5496*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
5497*c0909341SAndroid Build Coastguard Worker    vpbroadcastd        m15, [o(pd_2048)]
5498*c0909341SAndroid Build Coastguard Worker    add                  r2, 32*8
5499*c0909341SAndroid Build Coastguard Worker    add                  r3, 32*8
5500*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*16
5501*c0909341SAndroid Build Coastguard Worker    add               tmp2q, 32*32
5502*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*4] ;  1
5503*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*3] ; 15
5504*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r2+32*0] ;  9
5505*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r2-32*1] ;  7
5506*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5507*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
5508*c0909341SAndroid Build Coastguard Worker    test               r10d, 0x40000000
5509*c0909341SAndroid Build Coastguard Worker    jnz .fast3
5510*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+32*3] ; 31
5511*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3-32*4] ; 17
5512*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*1] ; 23
5513*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*0] ; 25
5514*c0909341SAndroid Build Coastguard Worker.fast3:
5515*c0909341SAndroid Build Coastguard Worker    add                  r6, o_idct64_offset
5516*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5517*c0909341SAndroid Build Coastguard Worker    add                  r6, 8
5518*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 32*8
5519*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, 32*8
5520*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r2-32*2] ;  5
5521*c0909341SAndroid Build Coastguard Worker    mova                 m3, [r2+32*1] ; 11
5522*c0909341SAndroid Build Coastguard Worker    mova                 m4, [r2+32*2] ; 13
5523*c0909341SAndroid Build Coastguard Worker    mova                 m7, [r2-32*3] ;  3
5524*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
5525*c0909341SAndroid Build Coastguard Worker    REPX       {mova x, m1}, m2, m5, m6
5526*c0909341SAndroid Build Coastguard Worker    test               r10d, 0x40000000
5527*c0909341SAndroid Build Coastguard Worker    jnz .fast4
5528*c0909341SAndroid Build Coastguard Worker    mova                 m1, [r3+32*1] ; 27
5529*c0909341SAndroid Build Coastguard Worker    mova                 m2, [r3-32*2] ; 21
5530*c0909341SAndroid Build Coastguard Worker    mova                 m5, [r3-32*3] ; 19
5531*c0909341SAndroid Build Coastguard Worker    mova                 m6, [r3+32*2] ; 29
5532*c0909341SAndroid Build Coastguard Worker.fast4:
5533*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
5534*c0909341SAndroid Build Coastguard Worker    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2
5535*c0909341SAndroid Build Coastguard Worker    sub               tmp1q, 32*28
5536*c0909341SAndroid Build Coastguard Worker    sub                dstq, r8
5537*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4+16]
5538*c0909341SAndroid Build Coastguard Worker    dec                r10b
5539*c0909341SAndroid Build Coastguard Worker    jg .pass2_loop
5540*c0909341SAndroid Build Coastguard Worker    RET
5541*c0909341SAndroid Build Coastguard Worker
5542*c0909341SAndroid Build Coastguard Worker%endif ; ARCH_X86_64
5543