; xref: /aosp_15_r20/external/libdav1d/src/x86/mc_sse.asm (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Build-time configuration first, then the x86inc macro layer that the rest of
; this file is written against.
%include "config.asm"
%include "ext/x86/x86inc.asm"

; Everything up to "SECTION .text" is constant data. The argument is the
; alignment requested for the read-only section.
SECTION_RODATA 16

; dav1d_obmc_masks[] with 64-x interleaved
;
; Stored as byte pairs; the two bytes of every non-padding pair add up to 64.
; The "N @M" notes give the nominal mask count N of a group and the byte
; offset M (relative to obmc_masks) at which that group starts.
obmc_masks: db  0,  0,  0,  0
            ; 2 @4
            db 45, 19, 64,  0
            ; 4 @8
            db 39, 25, 50, 14, 59,  5, 64,  0
            ; 8 @16
            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
            ; 16 @32
            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
            ; 32 @64
            ; NOTE(review): only 24 pairs are stored for this group; presumably
            ; the remaining ones are never read -- confirm at the use sites.
            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2

; Index vectors. Every label covers 16 bytes (subpel_h_shuf4 covers 32).
; The db tables hold byte positions in the range 0..15, presumably consumed
; as byte-shuffle control masks -- confirm at the use sites.
; rescale_mul is the exception: four dwords holding 0, 1, 2, 3.
warp_8x8_shufA: db 0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
warp_8x8_shufB: db 4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
warp_8x8_shufC: db 2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
warp_8x8_shufD: db 6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
subpel_h_shuf4: db 0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
                db 2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
subpel_h_shufB: db 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufD: db 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8
subpel_h_shufE: db 2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
subpel_h_shufF: db 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10, 10, 11, 11, 12
subpel_s_shuf2: db 0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
subpel_s_shuf8: db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
bilin_h_shuf4:  db 0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7

; Two-part 16-byte constants: each label is followed by a second run that is
; reachable as label+8 (or label+4, +8, +12 for bdct_lb_dw).
;
; wm_420_sign: four words of 258, then four words of 257.
; wm_422_sign: eight bytes of 128, then eight bytes of 127.
; NOTE(review): the names suggest the second half is selected by the w_mask
; "sign" argument -- confirm in the w_mask_420 / w_mask_422 functions.
wm_420_sign:    times 4 dw 258
                times 4 dw 257
wm_422_sign:    times 8 db 128
                times 8 db 127

; pb_8x0_8x8: eight bytes of 0 followed by eight bytes of 8.
pb_8x0_8x8: times 8 db 0
            times 8 db 8
; bdct_lb_dw: byte values 0, 4, 8, 12, each repeated four times.
bdct_lb_dw: times 4 db 0
            times 4 db 4
            times 4 db 8
            times 4 db 12

; Broadcast constants, 16 bytes each.
; Naming: p<b|w|d|q>_<value> = byte / word / dword / qword elements, all set
; to <value> ("m" prefixes a negative value, "0x" a hexadecimal one).
pb_64:    times 16 db 64
pw_m256:  times 8 dw -256
pw_1:     times 8 dw 1
pw_2:     times 8 dw 2
pw_8:     times 8 dw 8
pw_15:    times 8 dw 15
pw_26:    times 8 dw 26
pw_34:    times 8 dw 34
pw_512:   times 8 dw 512
pw_1024:  times 8 dw 1024
pw_2048:  times 8 dw 2048
pw_6903:  times 8 dw 6903
pw_8192:  times 8 dw 8192
pd_32:    times 4 dd 32
pd_63:    times 4 dd 63
pd_512:   times 4 dd 512
; Was "dd 16484": a digit slip that contradicted the label. The value has to
; be 16384 (1 << 14) to match the name, like every other constant here.
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
pd_0x3ff: times 4 dd 0x3ff
pd_0x4000:times 4 dd 0x4000
pq_0x40000000: times 2 dq 0x40000000

; Warp filter coefficients as signed bytes, eight per filter entry and two
; entries per source line. The table has three sections of 64 entries each,
; labelled with the interval they cover, followed by one extra 8-byte entry.
; In the entries spot-checked during review the eight coefficients sum to 128.
const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage
    ; [-1, 0)
    db 0, 127,   0, 0,   0,   1, 0, 0, 0, 127,   0, 0,  -1,   2, 0, 0
    db 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1, 0
    db 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1, 0
    db 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1, 0
    db 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1, 0
    db 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2, 0
    db 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2, 0
    db 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2, 0
    db 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3, 0
    db 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3, 0
    db 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3, 0
    db 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4, 0
    db 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4, 0
    db 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4, 0
    db 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4, 0
    db 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4, 0
    db 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4, 0
    db 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4, 0
    db 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4, 0
    db 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4, 0
    db 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4, 0
    db 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4, 0
    db 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4, 0
    db 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3, 0
    db 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3, 0
    db 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3, 0
    db 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2, 0
    db 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2, 0
    db 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2, 0
    db 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1, 0
    db 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1, 0
    db 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0, 0
    ; [0, 1)
    db  0,   0,   1, 0, 0, 127,   0,  0,  0,  -1,   2, 0, 0, 127,   0,  0
    db  0,  -3,   4, 1, 1, 127,  -2,  0,  0,  -5,   6, 1, 1, 127,  -2,  0
    db  0,  -6,   8, 1, 2, 126,  -3,  0, -1,  -7,  11, 2, 2, 126,  -4, -1
    db -1,  -8,  13, 2, 3, 125,  -5, -1, -1, -10,  16, 3, 3, 124,  -6, -1
    db -1, -11,  18, 3, 4, 123,  -7, -1, -1, -12,  20, 3, 4, 122,  -7, -1
    db -1, -13,  23, 3, 4, 121,  -8, -1, -2, -14,  25, 4, 5, 120,  -9, -1
    db -1, -15,  27, 4, 5, 119, -10, -1, -1, -16,  30, 4, 5, 118, -11, -1
    db -2, -17,  33, 5, 6, 116, -12, -1, -2, -17,  35, 5, 6, 114, -12, -1
    db -2, -18,  38, 5, 6, 113, -13, -1, -2, -19,  41, 6, 7, 111, -14, -2
    db -2, -19,  43, 6, 7, 110, -15, -2, -2, -20,  46, 6, 7, 108, -15, -2
    db -2, -20,  49, 6, 7, 106, -16, -2, -2, -21,  51, 7, 7, 104, -16, -2
    db -2, -21,  54, 7, 7, 102, -17, -2, -2, -21,  56, 7, 8, 100, -18, -2
    db -2, -22,  59, 7, 8,  98, -18, -2, -2, -22,  62, 7, 8,  96, -19, -2
    db -2, -22,  64, 7, 8,  94, -19, -2, -2, -22,  67, 8, 8,  91, -20, -2
    db -2, -22,  69, 8, 8,  89, -20, -2, -2, -22,  72, 8, 8,  87, -21, -2
    db -2, -21,  74, 8, 8,  84, -21, -2, -2, -22,  77, 8, 8,  82, -21, -2
    db -2, -21,  79, 8, 8,  79, -21, -2, -2, -21,  82, 8, 8,  77, -22, -2
    db -2, -21,  84, 8, 8,  74, -21, -2, -2, -21,  87, 8, 8,  72, -22, -2
    db -2, -20,  89, 8, 8,  69, -22, -2, -2, -20,  91, 8, 8,  67, -22, -2
    db -2, -19,  94, 8, 7,  64, -22, -2, -2, -19,  96, 8, 7,  62, -22, -2
    db -2, -18,  98, 8, 7,  59, -22, -2, -2, -18, 100, 8, 7,  56, -21, -2
    db -2, -17, 102, 7, 7,  54, -21, -2, -2, -16, 104, 7, 7,  51, -21, -2
    db -2, -16, 106, 7, 6,  49, -20, -2, -2, -15, 108, 7, 6,  46, -20, -2
    db -2, -15, 110, 7, 6,  43, -19, -2, -2, -14, 111, 7, 6,  41, -19, -2
    db -1, -13, 113, 6, 5,  38, -18, -2, -1, -12, 114, 6, 5,  35, -17, -2
    db -1, -12, 116, 6, 5,  33, -17, -2, -1, -11, 118, 5, 4,  30, -16, -1
    db -1, -10, 119, 5, 4,  27, -15, -1, -1,  -9, 120, 5, 4,  25, -14, -2
    db -1,  -8, 121, 4, 3,  23, -13, -1, -1,  -7, 122, 4, 3,  20, -12, -1
    db -1,  -7, 123, 4, 3,  18, -11, -1, -1,  -6, 124, 3, 3,  16, -10, -1
    db -1,  -5, 125, 3, 2,  13,  -8, -1, -1,  -4, 126, 2, 2,  11,  -7, -1
    db  0,  -3, 126, 2, 1,   8,  -6,  0,  0,  -2, 127, 1, 1,   6,  -5,  0
    db  0,  -2, 127, 1, 1,   4,  -3,  0,  0,   0, 127, 0, 0,   2,  -1,  0
    ; [1, 2)
    db 0, 0, 127,   0, 0,   1,   0, 0, 0, 0, 127,   0, 0,  -1,   2, 0
    db 0, 1, 127,  -1, 0,  -3,   4, 0, 0, 1, 126,  -2, 0,  -4,   6, 1
    db 0, 1, 126,  -3, 0,  -5,   8, 1, 0, 1, 125,  -4, 0,  -6,  11, 1
    db 0, 1, 124,  -4, 0,  -7,  13, 1, 0, 2, 123,  -5, 0,  -8,  15, 1
    db 0, 2, 122,  -6, 0,  -9,  18, 1, 0, 2, 121,  -6, 0, -10,  20, 1
    db 0, 2, 120,  -7, 0, -11,  22, 2, 0, 2, 119,  -8, 0, -12,  25, 2
    db 0, 3, 117,  -8, 0, -13,  27, 2, 0, 3, 116,  -9, 0, -13,  29, 2
    db 0, 3, 114, -10, 0, -14,  32, 3, 0, 3, 113, -10, 0, -15,  35, 2
    db 0, 3, 111, -11, 0, -15,  37, 3, 0, 3, 109, -11, 0, -16,  40, 3
    db 0, 3, 108, -12, 0, -16,  42, 3, 0, 4, 106, -13, 0, -17,  45, 3
    db 0, 4, 104, -13, 0, -17,  47, 3, 0, 4, 102, -14, 0, -17,  50, 3
    db 0, 4, 100, -14, 0, -17,  52, 3, 0, 4,  98, -15, 0, -18,  55, 4
    db 0, 4,  96, -15, 0, -18,  58, 3, 0, 4,  94, -16, 0, -18,  60, 4
    db 0, 4,  91, -16, 0, -18,  63, 4, 0, 4,  89, -16, 0, -18,  65, 4
    db 0, 4,  87, -17, 0, -18,  68, 4, 0, 4,  85, -17, 0, -18,  70, 4
    db 0, 4,  82, -17, 0, -18,  73, 4, 0, 4,  80, -17, 0, -18,  75, 4
    db 0, 4,  78, -18, 0, -18,  78, 4, 0, 4,  75, -18, 0, -17,  80, 4
    db 0, 4,  73, -18, 0, -17,  82, 4, 0, 4,  70, -18, 0, -17,  85, 4
    db 0, 4,  68, -18, 0, -17,  87, 4, 0, 4,  65, -18, 0, -16,  89, 4
    db 0, 4,  63, -18, 0, -16,  91, 4, 0, 4,  60, -18, 0, -16,  94, 4
    db 0, 3,  58, -18, 0, -15,  96, 4, 0, 4,  55, -18, 0, -15,  98, 4
    db 0, 3,  52, -17, 0, -14, 100, 4, 0, 3,  50, -17, 0, -14, 102, 4
    db 0, 3,  47, -17, 0, -13, 104, 4, 0, 3,  45, -17, 0, -13, 106, 4
    db 0, 3,  42, -16, 0, -12, 108, 3, 0, 3,  40, -16, 0, -11, 109, 3
    db 0, 3,  37, -15, 0, -11, 111, 3, 0, 2,  35, -15, 0, -10, 113, 3
    db 0, 3,  32, -14, 0, -10, 114, 3, 0, 2,  29, -13, 0,  -9, 116, 3
    db 0, 2,  27, -13, 0,  -8, 117, 3, 0, 2,  25, -12, 0,  -8, 119, 2
    db 0, 2,  22, -11, 0,  -7, 120, 2, 0, 1,  20, -10, 0,  -6, 121, 2
    db 0, 1,  18,  -9, 0,  -6, 122, 2, 0, 1,  15,  -8, 0,  -5, 123, 2
    db 0, 1,  13,  -7, 0,  -4, 124, 1, 0, 1,  11,  -6, 0,  -4, 125, 1
    db 0, 1,   8,  -5, 0,  -3, 126, 1, 0, 1,   6,  -4, 0,  -2, 126, 1
    db 0, 0,   4,  -3, 0,  -1, 127, 1, 0, 0,   2,  -1, 0,   0, 127, 0
    ; Extra entry: a copy of the last filter above (entry index 191).
    ; NOTE(review): presumably padding so that an index one past the end
    ; still reads valid coefficients -- confirm against the warp code.
    db 0, 0,   2,  -1, 0,   0, 127, 0

; Two words of 258 (4 bytes only, unlike the 16-byte pw_* constants above).
pw_258:  times 2 dw 258

; Sub-pixel filter table defined outside this file.
cextern mc_subpel_filters
; The local alias points 8 bytes before the external table.
; NOTE(review): presumably each filter is 8 bytes and indices are 1-based, so
; the bias lets code index without subtracting one -- confirm at use sites.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

; BIDIR_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits one 32-bit entry per listed width for the function whose mangled name
; is private_prefix + _<fn>_8bpc_<isa>. Each entry holds
;     <function>.w<width> - <fn>_<isa>_table
; so adding an entry to the address of <fn>_<isa>_table gives the address of
; the matching .w<width> label.
;
; <fn>_<isa>_table is defined as the first entry minus 2*w0 bytes. For the
; first widths used below (2 and 4) that equals 4*log2(w0), which presumably
; lets callers index the table with tzcnt(w)*4 -- confirm at the use sites.
;
; The macro is declared as taking 2 or more arguments but always reads the
; third one, so at least one width has to be supplied.
%macro BIDIR_JMP_TABLE 2-*
    ;evaluated at definition time (in loop below)
    ; (the bias therefore always uses the first width, even though the loop
    ; rotates the argument list)
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2)
    ; dynamically generated label
    %%table:
    %rep %0 - 2 ; repeat for num args
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg, ssse3,        4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, ssse3,      4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, ssse3,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, ssse3,      4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32
; blend_h lists 16 four times, so its last four entries all point at .w16.
BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16

; BASE_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits one 16-bit entry per listed width:
;     <fn>_<isa>_w<width> - <fn>_<isa>
; <fn>_<isa> must already be defined as a code label (see put_ssse3 and
; prep_ssse3 below); the _w<width> suffix is pasted onto it, so put_ssse3
; resolves to the .put label and put_ssse3_w2 to the .put_w2 label.
;
; <fn>_<isa>_table is the first entry minus w0 bytes. With 2-byte entries and
; a first width of 2 or 4 that equals 2*log2(w0), matching the
; "tzcnt w" + "[t0+wq*2+table_offset(put,)]" lookup in put_bilin.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

; Anchor labels for the two tables: the .put / .prep entry points inside the
; bilinear put / prep functions.
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

; HV_JMP_TABLE fn, filter, isa, types, w0, w1, ...
;
; Emits up to three tables of 16-bit entries for the function
; private_prefix + _<fn>_<filter>_8bpc_<isa>. "types" is a bit mask:
;     bit 0 (1) -> <fn>_<filter>_h_<isa>_table,  entries for .h_w<width>
;     bit 1 (2) -> <fn>_<filter>_v_<isa>_table,  entries for .v_w<width>
;     bit 2 (4) -> <fn>_<filter>_hv_<isa>_table, entries for .hv_w<width>
; Every entry is relative to <fn>_<isa> (for example put_ssse3), and every
; table symbol is biased by w0 bytes in the same way as BASE_JMP_TABLE.
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        ; the loop rotated by (argument count - 4); four more complete the
        ; cycle and put the arguments back in their original positions
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        ; restore the original argument order, as above
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro

; prep 8tap requests only the h table (types = 1); both bilin variants
; request h, v and hv (types = 7).
HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128

; SCALED_JMP_TABLE fn, isa, w0, w1, ...
;
; Emits three consecutive tables of 16-bit entries for the function
; private_prefix + _<fn>_8bpc_<isa>, all relative to the function's entry:
;     <fn>_<isa>_table      -> .w<width>
;     <fn>_<isa>_dy1_table  -> .dy1_w<width>
;     <fn>_<isa>_dy2_table  -> .dy2_w<width>
; Each table symbol is biased by w0 bytes, as in BASE_JMP_TABLE.
; NOTE(review): the local label names suggest dy1 / dy2 are the dy == 1024 and
; dy == 2048 special cases -- confirm in the scaled functions.
;
; The macro is declared as taking 2 or more arguments but always reads the
; third one, so at least one width has to be supplied.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    ; the loop rotated by (argument count - 2); two more complete the cycle
    ; and put the arguments back in their original positions
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    ; restore the original argument order, as above
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,   4, 8, 16, 32, 64, 128

; table_offset(type, fn) pastes its arguments with SUFFIX and expands to
;     <type><fn><SUFFIX>_table - <type><SUFFIX>
; i.e. the distance from an anchor label to one of the jump tables above.
; Example, assuming SUFFIX is _ssse3: table_offset(put,) becomes
; put_ssse3_table - put_ssse3.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

SECTION .text

INIT_XMM ssse3

; t0 is the scratch register that put_bilin loads with the address of
; put_ssse3: r1 on x86-32, r7 on x86-64.
; "base" is t0-put_ssse3 on x86-32 and 0 on x86-64.
; NOTE(review): presumably used as [base+symbol] so that 32-bit builds address
; constants relative to t0 -- confirm at the use sites.
%if ARCH_X86_32
 DECLARE_REG_TMP 1
 %define base t0-put_ssse3
%else
 DECLARE_REG_TMP 7
 %define base 0
%endif

; RESTORE_DSQ_32 reg
;
; On x86-32, reloads reg from dsm; on x86-64 it expands to nothing.
; put_bilin invokes it with t0, which is r1 on x86-32 -- the same register as
; its second argument (ds) -- after t0 has been used as the table base.
%macro RESTORE_DSQ_32 1
 %if ARCH_X86_32
   mov                  %1, dsm ; restore dsq
 %endif
%endmacro

332*c0909341SAndroid Build Coastguard Workercglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy
333*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r6m ; mx
334*c0909341SAndroid Build Coastguard Worker    LEA                  t0, put_ssse3
335*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
336*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
337*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
338*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
339*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
340*c0909341SAndroid Build Coastguard Worker    jnz .h
341*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
342*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
343*c0909341SAndroid Build Coastguard Worker    jnz .v
344*c0909341SAndroid Build Coastguard Worker.put:
345*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t0+wq*2+table_offset(put,)]
346*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
347*c0909341SAndroid Build Coastguard Worker    RESTORE_DSQ_32       t0
348*c0909341SAndroid Build Coastguard Worker    jmp                  wq
349*c0909341SAndroid Build Coastguard Worker.put_w2:
350*c0909341SAndroid Build Coastguard Worker    movzx               r4d, word [srcq+ssq*0]
351*c0909341SAndroid Build Coastguard Worker    movzx               r6d, word [srcq+ssq*1]
352*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
353*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r4w
354*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w
355*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
356*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
357*c0909341SAndroid Build Coastguard Worker    jg .put_w2
358*c0909341SAndroid Build Coastguard Worker    RET
359*c0909341SAndroid Build Coastguard Worker.put_w4: ; copy loop: 4 bytes per row through general-purpose registers, two rows per iteration
360*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [srcq+ssq*0] ; row 0 (4 bytes)
361*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [srcq+ssq*1] ; row 1 (4 bytes)
362*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
363*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r4d
364*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6d
365*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
366*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
367*c0909341SAndroid Build Coastguard Worker    jg .put_w4 ; loop while rows remain
368*c0909341SAndroid Build Coastguard Worker    RET
369*c0909341SAndroid Build Coastguard Worker.put_w8: ; copy loop: 8 bytes per row via movq, two rows per iteration
370*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; row 0 (8 bytes)
371*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1] ; row 1 (8 bytes)
372*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
373*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
374*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*1], m1
375*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
376*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
377*c0909341SAndroid Build Coastguard Worker    jg .put_w8 ; loop while rows remain
378*c0909341SAndroid Build Coastguard Worker    RET
379*c0909341SAndroid Build Coastguard Worker.put_w16: ; copy loop: one full vector register per row, two rows per iteration
380*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; unaligned load: src carries no alignment requirement here
381*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
382*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
383*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m0 ; aligned store form: dst rows are expected to be vector-aligned
384*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m1
385*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
386*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
387*c0909341SAndroid Build Coastguard Worker    jg .put_w16 ; loop while rows remain
388*c0909341SAndroid Build Coastguard Worker    RET
389*c0909341SAndroid Build Coastguard Worker.put_w32: ; copy loop: 2 x 16 bytes per row, two rows per iteration
390*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0+16*0] ; row 0, bytes 0-15
391*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0+16*1] ; row 0, bytes 16-31
392*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1+16*0] ; row 1, bytes 0-15
393*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1+16*1] ; row 1, bytes 16-31
394*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
395*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+16*0], m0 ; aligned stores: dst rows are expected to be vector-aligned
396*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*0+16*1], m1
397*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+16*0], m2
398*c0909341SAndroid Build Coastguard Worker    mova  [dstq+dsq*1+16*1], m3
399*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
400*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
401*c0909341SAndroid Build Coastguard Worker    jg .put_w32 ; loop while rows remain
402*c0909341SAndroid Build Coastguard Worker    RET
403*c0909341SAndroid Build Coastguard Worker.put_w64: ; copy loop: 4 x 16 bytes, one row per iteration
404*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*0]
405*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*1]
406*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*2]
407*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*3]
408*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row
409*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0 ; aligned stores: dst rows are expected to be vector-aligned
410*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
411*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m2
412*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m3
413*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
414*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
415*c0909341SAndroid Build Coastguard Worker    jg .put_w64 ; loop while rows remain
416*c0909341SAndroid Build Coastguard Worker    RET
417*c0909341SAndroid Build Coastguard Worker.put_w128: ; copy loop: 8 x 16 bytes per row, moved as two batches of four registers, one row per iteration
418*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*0] ; first batch: bytes 0-63
419*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*1]
420*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*2]
421*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*3]
422*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
423*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
424*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m2
425*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m3
426*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+16*4] ; second batch: bytes 64-127, reusing m0-m3
427*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+16*5]
428*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+16*6]
429*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+16*7]
430*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
431*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m1
432*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m2
433*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m3
434*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row
435*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
436*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
437*c0909341SAndroid Build Coastguard Worker    jg .put_w128 ; loop while rows remain
438*c0909341SAndroid Build Coastguard Worker    RET
439*c0909341SAndroid Build Coastguard Worker.h: ; horizontal filter setup: build the coefficient vector, then choose between h-only and hv
440*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
441*c0909341SAndroid Build Coastguard Worker    ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
442*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 0x00ff00ff ; both 16-bit halves = mx * 255
443*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+subpel_h_shufD] ; byte shuffle consumed by the w2 and w8+ paths (table defined elsewhere)
444*c0909341SAndroid Build Coastguard Worker    mova                 m0, [base+bilin_h_shuf4] ; byte shuffle consumed by .h_w4
445*c0909341SAndroid Build Coastguard Worker    add                mxyd, 0x00100010 ; mx*255 + 16 = (mx << 8) + (16 - mx): low byte = 16-mx, high byte = mx
446*c0909341SAndroid Build Coastguard Worker    movd                 m5, mxyd
447*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r7m ; my
448*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0000 ; broadcast the coefficient pair to every word of m5
449*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
450*c0909341SAndroid Build Coastguard Worker    jnz .hv ; my != 0 as well: combined horizontal + vertical path
451*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_h)] ; 16-bit jump-table entry for the h-only handlers
452*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pw_2048] ; pmulhrsw multiplier; with 2048 per word it computes (x + 8) >> 4 -- assumes pw_2048 is words of 2048
453*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; table entries are relative to t0
454*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp ; x86inc: load dsq from its memory home only if it is not already in a register
455*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue at one of the .h_w* handlers
456*c0909341SAndroid Build Coastguard Worker.h_w2: ; horizontal filter, 2 output pixels per row, two rows per iteration
457*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q3120 ; swap the two middle dwords of the shuffle mask so the low 8 mask bytes cover row 0 (data bytes 0-3) and row 1 (data bytes 4-7). NOTE(review): the earlier comment here read "m4 = {1, 0, 2, 1, 5, 4, 6, 5}", which does not match the (16-mx, mx) byte order built in .h -- verify against the subpel_h_shufD definition
458*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
459*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 0: 4 source bytes
460*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*1] ; row 1: 4 source bytes
461*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
462*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1 ; m0 = row 0 bytes | row 1 bytes
463*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; arrange neighbouring source bytes into pairs
464*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per pair: even byte * (16-mx) + odd byte * mx
465*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding shift (see the m3 setup in .h)
466*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0 ; words -> bytes with unsigned saturation
467*c0909341SAndroid Build Coastguard Worker    movd                r6d, m0 ; low word = row 0 pixels, high word = row 1 pixels
468*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
469*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
470*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w
471*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
472*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
473*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop ; loop while rows remain
474*c0909341SAndroid Build Coastguard Worker    RET
475*c0909341SAndroid Build Coastguard Worker.h_w4: ; horizontal filter, 4 output pixels per row, two rows per iteration; m4 is reused as the data register here
476*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0] ; row 0: 8 source bytes into the low half
477*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ssq*1] ; row 1: 8 source bytes into the high half
478*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
479*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0 ; m0 = bilin_h_shuf4 (loaded in .h): pair up neighbouring bytes of each row
480*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5 ; per pair: even byte * (16-mx) + odd byte * mx
481*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m3 ; rounding shift (see the m3 setup in .h)
482*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4 ; bytes 0-3 = row 0, bytes 4-7 = row 1
483*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m4
484*c0909341SAndroid Build Coastguard Worker    psrlq                m4, 32 ; bring the row 1 pixels down to the low dword
485*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m4
486*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
487*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
488*c0909341SAndroid Build Coastguard Worker    jg .h_w4 ; loop while rows remain
489*c0909341SAndroid Build Coastguard Worker    RET
490*c0909341SAndroid Build Coastguard Worker.h_w8: ; horizontal filter, 8 output pixels per row, two rows per iteration
491*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 0 source bytes
492*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1] ; row 1 source bytes
493*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
494*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
495*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
496*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per pair: even byte * (16-mx) + odd byte * mx
497*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
498*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding shift (see the m3 setup in .h)
499*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
500*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; low 8 bytes = row 0, high 8 bytes = row 1
501*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
502*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
503*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
504*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows consumed
505*c0909341SAndroid Build Coastguard Worker    jg .h_w8 ; loop while rows remain
506*c0909341SAndroid Build Coastguard Worker    RET
507*c0909341SAndroid Build Coastguard Worker.h_w16: ; horizontal filter, 16 output pixels, one row per iteration
508*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0] ; source bytes feeding output pixels 0-7
509*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1] ; overlapping load at +8 feeding output pixels 8-15
510*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row
511*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
512*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
513*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per pair: even byte * (16-mx) + odd byte * mx
514*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
515*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding shift (see the m3 setup in .h)
516*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
517*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; 16 output bytes
518*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0 ; aligned store: dst rows are expected to be vector-aligned
519*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
520*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
521*c0909341SAndroid Build Coastguard Worker    jg .h_w16 ; loop while rows remain
522*c0909341SAndroid Build Coastguard Worker    RET
523*c0909341SAndroid Build Coastguard Worker.h_w32: ; horizontal filter, 32 output pixels as two 16-pixel halves, one row per iteration
524*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+mmsize*0+8*0] ; first half: same load/filter pattern as .h_w16
525*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+mmsize*0+8*1]
526*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
527*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
528*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
529*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
530*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3
531*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
532*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; m0 = output bytes 0-15
533*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+mmsize*1+8*0] ; second half, one vector width further along the row
534*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+mmsize*1+8*1]
535*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row (all loads for this row are done)
536*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
537*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
538*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
539*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
540*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
541*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3
542*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2 ; m1 = output bytes 16-31
543*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0 ; aligned stores: dst rows are expected to be vector-aligned
544*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m1
545*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
546*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
547*c0909341SAndroid Build Coastguard Worker    jg .h_w32 ; loop while rows remain
548*c0909341SAndroid Build Coastguard Worker    RET
549*c0909341SAndroid Build Coastguard Worker.h_w64: ; horizontal filter, 64 output pixels per row; inner loop emits 16 pixels at a time
550*c0909341SAndroid Build Coastguard Worker    mov                  r6, -16*3 ; column counter runs -48, -32, -16, 0 (4 chunks); also re-initialised for every row
551*c0909341SAndroid Build Coastguard Worker.h_w64_loop:
552*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+16*3+8*0] ; +16*3 cancels the negative bias, so the effective offset is 0, 16, 32, 48
553*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+16*3+8*1]
554*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
555*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
556*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per pair: even byte * (16-mx) + odd byte * mx
557*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
558*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding shift (see the m3 setup in .h)
559*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
560*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
561*c0909341SAndroid Build Coastguard Worker    mova     [dstq+r6+16*3], m0
562*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
563*c0909341SAndroid Build Coastguard Worker    jle .h_w64_loop ; continue while the counter is still <= 0 (the chunk at 0 is the last one)
564*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row
565*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
566*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
567*c0909341SAndroid Build Coastguard Worker    jg .h_w64 ; next row: jumps to the top so r6 is reset
568*c0909341SAndroid Build Coastguard Worker    RET
569*c0909341SAndroid Build Coastguard Worker.h_w128: ; horizontal filter, 128 output pixels per row; inner loop emits 16 pixels at a time
570*c0909341SAndroid Build Coastguard Worker    mov                  r6, -16*7 ; column counter runs -112 .. 0 in steps of 16 (8 chunks); re-initialised for every row
571*c0909341SAndroid Build Coastguard Worker.h_w128_loop:
572*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+16*7+8*0] ; +16*7 cancels the negative bias, so the effective offset is 0 .. 112
573*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+16*7+8*1]
574*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
575*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
576*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; per pair: even byte * (16-mx) + odd byte * mx
577*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
578*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m3 ; rounding shift (see the m3 setup in .h)
579*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3
580*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
581*c0909341SAndroid Build Coastguard Worker    mova     [dstq+r6+16*7], m0
582*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
583*c0909341SAndroid Build Coastguard Worker    jle .h_w128_loop ; continue while the counter is still <= 0 (the chunk at 0 is the last one)
584*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; src += 1 row
585*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst += 1 row
586*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one row consumed
587*c0909341SAndroid Build Coastguard Worker    jg .h_w128 ; next row: jumps to the top so r6 is reset
588*c0909341SAndroid Build Coastguard Worker    RET
589*c0909341SAndroid Build Coastguard Worker.v: ; vertical-only filter setup (mxyd holds my here): build the coefficient vector and dispatch on width
590*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)] ; 16-bit jump-table entry for the v-only handlers
591*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 0x00ff00ff ; both 16-bit halves = my * 255
592*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_2048] ; pmulhrsw multiplier; with 2048 per word it computes (x + 8) >> 4 -- assumes pw_2048 is words of 2048
593*c0909341SAndroid Build Coastguard Worker    add                mxyd, 0x00100010 ; my*255 + 16 = (my << 8) + (16 - my): low byte = 16-my, high byte = my
594*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; table entries are relative to t0
595*c0909341SAndroid Build Coastguard Worker    movd                 m4, mxyd
596*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0000 ; broadcast the coefficient pair to every word of m4
597*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp ; x86inc: load dsq from its memory home only if it is not already in a register
598*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue at one of the .v_w* handlers
599*c0909341SAndroid Build Coastguard Worker.v_w2: ; vertical filter, 2 pixels per row, two output rows per iteration; m0 carries the newest source row between iterations
600*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; prime with the first source row (word 0)
601*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
602*c0909341SAndroid Build Coastguard Worker    pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
603*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
604*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q2301 ; swap word pairs: m1 = rows 1 0
605*c0909341SAndroid Build Coastguard Worker    pinsrw               m0, [srcq+ssq*0], 0 ; 2 1
606*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0 ; byte pairs (row1,row2) in words 0-1, then (row0,row1) in words 2-3
607*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per pair: upper row * (16-my) + lower row * my
608*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; rounding shift (see the m5 setup in .v)
609*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
610*c0909341SAndroid Build Coastguard Worker    movd                r6d, m1 ; low word = second output row, high word = first output row
611*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w ; hence the dsq*1 store comes first
612*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
613*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
614*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
615*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
616*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop ; loop while rows remain
617*c0909341SAndroid Build Coastguard Worker    RET
618*c0909341SAndroid Build Coastguard Worker.v_w4: ; vertical filter, 4 pixels per row, two output rows per iteration; m0 carries the newest source row between iterations
619*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; prime with the first source row
620*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
621*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*1] ; source row 1
622*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
623*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; source row 0 (kept from the previous iteration)
624*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; source row 2, becomes row 0 of the next iteration
625*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2 ; 0 1
626*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0 ; 1 2
627*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2 ; byte pairs (row0,row1) x4 followed by (row1,row2) x4
628*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per pair: upper row * (16-my) + lower row * my
629*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; rounding shift (see the m5 setup in .v)
630*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1 ; bytes 0-3 = first output row, bytes 4-7 = second output row
631*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m1
632*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32 ; bring the second output row down to the low dword
633*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
634*c0909341SAndroid Build Coastguard Worker    ;
635*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
636*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
637*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop ; loop while rows remain
638*c0909341SAndroid Build Coastguard Worker    RET
639*c0909341SAndroid Build Coastguard Worker.v_w8: ; vertical filter, 8 pixels per row, two output rows per iteration; m0 carries the newest source row between iterations
640*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; prime with the first source row
641*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
642*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1] ; source row 1
643*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
644*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; source row 0 (kept from the previous iteration)
645*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; source row 2, becomes row 0 of the next iteration
646*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2 ; byte pairs (row0,row1)
647*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m0 ; byte pairs (row1,row2)
648*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per pair: upper row * (16-my) + lower row * my
649*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
650*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; rounding shift (see the m5 setup in .v)
651*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
652*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2 ; low 8 bytes = first output row, high 8 bytes = second output row
653*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1
654*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1
655*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
656*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
657*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop ; loop while rows remain
658*c0909341SAndroid Build Coastguard Worker    RET
; PUT_BILIN_V_W16: vertical bilinear filter over one 16-byte-wide column, two output rows per iteration.
;   In:      srcq/ssq = source pointer/stride, dstq/dsq = destination pointer/stride, hd = row count,
;            m4 = broadcast (16-my, my) byte pairs, m5 = pmulhrsw rounding multiplier.
;   Out:     srcq and dstq advanced by the rows processed; hd <= 0.
;   Clobbers m0-m3 and flags. Reads one source row beyond the rows it outputs (the lower filter tap).
;   Stores use mova, so dst rows are expected to be vector-aligned.
659*c0909341SAndroid Build Coastguard Worker%macro PUT_BILIN_V_W16 0
660*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; prime with the first source row
661*c0909341SAndroid Build Coastguard Worker%%loop:
662*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1] ; source row 1
663*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
664*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0 ; two copies of source row 0: one for each half of the interleave
665*c0909341SAndroid Build Coastguard Worker    mova                 m2, m0
666*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; source row 2, becomes row 0 of the next iteration
667*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3 ; (row0,row1) byte pairs, low 8 pixels
668*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3 ; (row0,row1) byte pairs, high 8 pixels
669*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; per pair: upper row * (16-my) + lower row * my
670*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
671*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; rounding shift
672*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
673*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m2 ; m1 = first output row (16 bytes)
674*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0 ; (row1,row2) low 8 pixels; three-operand form, presumably expanded by x86inc to copy + op on non-AVX builds
675*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0 ; (row1,row2) high 8 pixels
676*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m4
677*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m4
678*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5
679*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m5
680*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3 ; m2 = second output row (16 bytes)
681*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*0], m1
682*c0909341SAndroid Build Coastguard Worker    mova       [dstq+dsq*1], m2
683*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
684*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
685*c0909341SAndroid Build Coastguard Worker    jg %%loop ; loop while rows remain
686*c0909341SAndroid Build Coastguard Worker%endmacro
687*c0909341SAndroid Build Coastguard Worker.v_w16: ; vertical filter for a single 16-byte-wide column: one macro expansion covers the whole block
688*c0909341SAndroid Build Coastguard Worker    PUT_BILIN_V_W16
689*c0909341SAndroid Build Coastguard Worker    RET
690*c0909341SAndroid Build Coastguard Worker.v_w128: ; widths above 16: filter the block as a series of 16-byte-wide columns
691*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(7<<16)] ; r6d = h + (extra columns << 16); 7 extra -> 8 columns
692*c0909341SAndroid Build Coastguard Worker    jmp .v_w16gt
693*c0909341SAndroid Build Coastguard Worker.v_w64:
694*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(3<<16)] ; 3 extra -> 4 columns
695*c0909341SAndroid Build Coastguard Worker    jmp .v_w16gt
696*c0909341SAndroid Build Coastguard Worker.v_w32:
697*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(1<<16)] ; 1 extra -> 2 columns; falls through
698*c0909341SAndroid Build Coastguard Worker.v_w16gt:
699*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq ; r4 = source base of the current column
700*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
701*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq ; r7 = destination base of the current column (32-bit builds use dstmp instead)
702*c0909341SAndroid Build Coastguard Worker%endif
703*c0909341SAndroid Build Coastguard Worker.v_w16gt_loop:
704*c0909341SAndroid Build Coastguard Worker    PUT_BILIN_V_W16 ; filter one full-height column; advances srcq/dstq and consumes hd
705*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
706*c0909341SAndroid Build Coastguard Worker    add                  r4, 16 ; step both column bases 16 bytes to the right
707*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
708*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore the row count from the low byte of r6 (relies on h fitting in 8 bits)
709*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
710*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
711*c0909341SAndroid Build Coastguard Worker%else
712*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstmp ; reload the column's destination base from the dst argument's memory slot
713*c0909341SAndroid Build Coastguard Worker    add                  r4, 16 ; step the source column base 16 bytes to the right
714*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w ; restore the row count from the low word of r6
715*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
716*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
717*c0909341SAndroid Build Coastguard Worker    mov               dstmp, dstq ; remember the new destination base for the next column
718*c0909341SAndroid Build Coastguard Worker%endif
719*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16 ; one column done; the result goes negative after the last column
720*c0909341SAndroid Build Coastguard Worker    jg .v_w16gt_loop ; srcq/dstq already equal the saved column bases, so re-saving them at .v_w16gt is unnecessary
721*c0909341SAndroid Build Coastguard Worker    RET
722*c0909341SAndroid Build Coastguard Worker.hv: ; combined horizontal + vertical setup; entered from .h with m4 = h shuffle, m5 = h coefficients, mxyd = my
723*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
724*c0909341SAndroid Build Coastguard Worker    ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
725*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)] ; 16-bit jump-table entry for the hv handlers
726*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       8 ; macro defined elsewhere; presumably preserves callee-saved xmm registers on Win64 now that m6/m7 are used -- TODO confirm
727*c0909341SAndroid Build Coastguard Worker    shl                mxyd, 11 ; can't shift by 12 due to signed overflow
728*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_15] ; operand for the pavgw rounding step in the hv loops (constant defined elsewhere)
729*c0909341SAndroid Build Coastguard Worker    movd                 m6, mxyd
730*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; table entries are relative to t0
731*c0909341SAndroid Build Coastguard Worker    pshuflw              m6, m6, q0000 ; broadcast my << 11 across the low four words ...
732*c0909341SAndroid Build Coastguard Worker    paddb                m5, m5 ; double the horizontal coefficients; presumably compensates for shifting by 11 rather than 12
733*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6 ; ... and then into all eight words
734*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; continue at one of the .hv_w* handlers
735*c0909341SAndroid Build Coastguard Worker.hv_w2: ; hv filter, 2 pixels per row, two output rows per iteration; m0 carries the h-filtered rows between iterations
736*c0909341SAndroid Build Coastguard Worker    RESTORE_DSQ_32       t0 ; macro defined elsewhere; presumably reloads dsq on 32-bit builds -- TODO confirm
737*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; first source row (4 bytes)
738*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m0 ; duplicate it so the filtered row also lands in the upper half, where shufps below picks it up
739*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
740*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; horizontal pass with the doubled coefficients set up in .hv
741*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
742*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*1]
743*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
744*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0]
745*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2 ; m1 = source row 1 bytes | source row 2 bytes
746*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
747*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5             ; 1 _ 2 _
748*c0909341SAndroid Build Coastguard Worker    shufps               m2, m0, m1, q1032  ; 0 _ 1 _
749*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1 ; keep rows 1/2 for the next iteration
750*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2   ; 2 * (src[x + src_stride] - src[x])
751*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6   ; (my * (src[x + src_stride] - src[x]) >> 4
752*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m7   ; src[x] + 8
753*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2   ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8
754*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
755*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1 ; words -> bytes with unsigned saturation
756*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
757*c0909341SAndroid Build Coastguard Worker    movq                 r6, m1 ; first output row in bits 0-15, second output row in bits 32-47
758*c0909341SAndroid Build Coastguard Worker%else
759*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q2020 ; gather words 0 and 2 so both output rows fit in one 32-bit register
760*c0909341SAndroid Build Coastguard Worker    movd                r6d, m1
761*c0909341SAndroid Build Coastguard Worker%endif
762*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
763*c0909341SAndroid Build Coastguard Worker    shr                  r6, gprsize*4 ; shift by half the GPR width to expose the second output row in r6w
764*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w
765*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
766*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
767*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop ; loop while rows remain
768*c0909341SAndroid Build Coastguard Worker    RET
769*c0909341SAndroid Build Coastguard Worker.hv_w4: ; hv filter, 4 pixels per row, two output rows per iteration; m0 carries the h-filtered rows between iterations
770*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+bilin_h_shuf4] ; replace the shuffle loaded in .h with the 4-pixel, two-row variant
771*c0909341SAndroid Build Coastguard Worker    movddup              m0, [srcq+ssq*0] ; first source row (8 bytes) duplicated into both halves
772*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp ; x86inc: load dsq from its memory home only if it is not already in a register
773*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4 ; pair up neighbouring bytes
774*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; horizontal pass with the doubled coefficients set up in .hv
775*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
776*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1] ; source row 1 into the low half
777*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
778*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*0] ; source row 2 into the high half
779*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
780*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5            ; 1 2
781*c0909341SAndroid Build Coastguard Worker    shufps               m2, m0, m1, q1032 ; 0 1
782*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1 ; keep rows 1/2 for the next iteration
783*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; difference between vertically adjacent h-filtered rows
784*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6 ; scale by my (m6 = my << 11), keeping the high 16 bits
785*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m7 ; halve the upper row and add the rounding bias
786*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
787*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
788*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1 ; bytes 0-3 = first output row, bytes 4-7 = second output row
789*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m1
790*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32 ; bring the second output row down to the low dword
791*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
792*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst += 2 rows
793*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows produced
794*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop ; loop while rows remain
795*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: 8-pixel-wide case of the combined horizontal+vertical filter path.
; Uses the byte shuffle mask already present in m4 (it is not loaded in this
; block) together with m5/m6/m7, all prepared before this view.
; Each iteration filters two new source rows and stores two 8-byte
; destination rows (hd is decremented by 2 per iteration).
796*c0909341SAndroid Build Coastguard Worker.hv_w8:
    ; m0 = horizontally filtered row 0
797*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
798*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
799*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
800*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
801*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
    ; m2 = horizontally filtered row 1
802*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
803*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
804*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
805*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
    ; m1 = vertical blend of row 0 (m0) and row 1 (m2)
806*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2, m0
807*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m6
808*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m7
809*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
    ; m0 = horizontally filtered row 2; it is row 0 of the next iteration
810*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
811*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
812*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
    ; m3 = vertical blend of row 1 (m2) and row 2 (m0)
813*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0, m2
814*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m6
815*c0909341SAndroid Build Coastguard Worker    pavgw                m2, m7
816*c0909341SAndroid Build Coastguard Worker    paddw                m3, m2
817*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 4
818*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 4
    ; pack to bytes: low 8 bytes = first output row, high 8 bytes = second
819*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3
820*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m1
821*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m1
822*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
823*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
824*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
825*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w128 / .hv_w64 / .hv_w32 / .hv_w16: combined horizontal+vertical filter
; path for widths >= 16, processed as 16-pixel-wide columns, one output row
; per inner-loop iteration.
; For w32/w64/w128, r6d = h + ((columns - 1) << 16): the low bits keep the row
; count so hd can be reloaded for every column, while bit 16 and up count the
; columns that remain. r4 holds the source pointer of the current column top;
; the destination column top is kept in r7 on x86-64 and in dstm on x86-32.
; NOTE(review): the .hv_w16 entry skips the r6d/r4/r7 setup, so it relies on
; the r6d value left by code outside this view making the outer loop exit
; after a single column; confirm.
826*c0909341SAndroid Build Coastguard Worker.hv_w128:
827*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(7<<16)]
828*c0909341SAndroid Build Coastguard Worker    jmp .hv_w16_start
829*c0909341SAndroid Build Coastguard Worker.hv_w64:
830*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(3<<16)]
831*c0909341SAndroid Build Coastguard Worker    jmp .hv_w16_start
832*c0909341SAndroid Build Coastguard Worker.hv_w32:
833*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+(1<<16)]
834*c0909341SAndroid Build Coastguard Worker.hv_w16_start:
835*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
    ; x86-32 has no xmm8: m8 is aliased to the memory at dstq, so the row that
    ; is parked in "m8" below temporarily lives in the destination buffer and
    ; is read back before the final store overwrites it. The %define is
    ; textual, so it also applies when entering at .hv_w16.
836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
837*c0909341SAndroid Build Coastguard Worker    %define m8 [dstq]
838*c0909341SAndroid Build Coastguard Worker%else
839*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
840*c0909341SAndroid Build Coastguard Worker%endif
841*c0909341SAndroid Build Coastguard Worker.hv_w16:
842*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
    ; Win64 treats xmm8 as callee-saved: park it in the r4m memory slot and
    ; restore it just before RET
843*c0909341SAndroid Build Coastguard Worker%if WIN64
844*c0909341SAndroid Build Coastguard Worker    movaps              r4m, m8
845*c0909341SAndroid Build Coastguard Worker%endif
846*c0909341SAndroid Build Coastguard Worker.hv_w16_loop0:
    ; top row of the column: two 16-byte loads at byte offsets 0 and 8,
    ; m0 = filtered pixels 0-7, m1 = filtered pixels 8-15
847*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+8*0]
848*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+8*1]
849*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
850*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
851*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
852*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
853*c0909341SAndroid Build Coastguard Worker.hv_w16_loop:
    ; next row: m2/m3 = horizontally filtered pixels 0-7 / 8-15
854*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
855*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+8*0]
856*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+8*1]
857*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
858*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
859*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
860*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
    ; keep the unblended low half of this row; it becomes m0 for the next row
861*c0909341SAndroid Build Coastguard Worker    mova                 m8, m2
    ; vertical blend of pixels 0-7 (previous row in m0, current row in m2)
862*c0909341SAndroid Build Coastguard Worker    psubw                m2, m0
863*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m6
864*c0909341SAndroid Build Coastguard Worker    pavgw                m0, m7
865*c0909341SAndroid Build Coastguard Worker    paddw                m2, m0
    ; m0 is free now: use it to carry the unblended high half (m3) into m1
866*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3
    ; vertical blend of pixels 8-15 (previous row in m1, current row in m3)
867*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1
868*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m6
869*c0909341SAndroid Build Coastguard Worker    pavgw                m1, m7
870*c0909341SAndroid Build Coastguard Worker    paddw                m3, m1
871*c0909341SAndroid Build Coastguard Worker    mova                 m1, m0
872*c0909341SAndroid Build Coastguard Worker    mova                 m0, m8
873*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 4
874*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 4
    ; pack to 16 bytes and store one output row with an aligned store
875*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
876*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m2
877*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
878*c0909341SAndroid Build Coastguard Worker    dec                  hd
879*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop
    ; step both column-top pointers 16 bytes to the right and reload the row
    ; count from the low bits of r6 (16 bits on x86-32, 8 bits on x86-64)
880*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
881*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
882*c0909341SAndroid Build Coastguard Worker    add                  r4, 16
883*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w
884*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
885*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
886*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
887*c0909341SAndroid Build Coastguard Worker%else
888*c0909341SAndroid Build Coastguard Worker    add                  r4, 16
889*c0909341SAndroid Build Coastguard Worker    add                  r7, 16
890*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
891*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
892*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
893*c0909341SAndroid Build Coastguard Worker%endif
    ; one column done; continue while the result stays positive
894*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
895*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_loop0
896*c0909341SAndroid Build Coastguard Worker%if WIN64
897*c0909341SAndroid Build Coastguard Worker    movaps               m8, r4m
898*c0909341SAndroid Build Coastguard Worker%endif
899*c0909341SAndroid Build Coastguard Worker    RET
900*c0909341SAndroid Build Coastguard Worker
; `base` is the displacement prefix used by the [base+symbol] operands of the
; function that follows. On x86-32 constants are addressed relative to r6,
; which that function loads with the address of prep_ssse3 at entry; on
; x86-64 the symbol is used directly (base = 0).
; NOTE(review): SUFFIX is defined outside this view; presumably it expands so
; that prep%+SUFFIX is the same symbol as prep_ssse3. Confirm.
901*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
902*c0909341SAndroid Build Coastguard Worker    %define base r6-prep%+SUFFIX
903*c0909341SAndroid Build Coastguard Worker%else
904*c0909341SAndroid Build Coastguard Worker    %define base 0
905*c0909341SAndroid Build Coastguard Worker%endif
906*c0909341SAndroid Build Coastguard Worker
; prep_bilin_8bpc: entry point and dispatch.
; Named arguments/registers: tmp, src, stride, w, h, mxy, with stride3 as an
; extra scratch register. Per the x86inc cglobal convention the numbers 3, 7, 0
; are: arguments loaded into registers, GPRs used, XMM registers declared.
; mx is read from argument 5 (r5m) and my from argument 6 (r6m).
; Dispatch:
;   mx != 0            -> .h  (which continues to .hv if my != 0)
;   mx == 0, my != 0   -> .v
;   mx == 0, my == 0   -> .prep (no filtering, pixels are only scaled)
; wd = tzcnt(w) is the index into tables of 16-bit offsets relative to r6.
; NOTE(review): this indexing assumes w is a power of two; confirm with callers.
907*c0909341SAndroid Build Coastguard Workercglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
908*c0909341SAndroid Build Coastguard Worker    movifnidn          mxyd, r5m ; mx
    ; r6 = table base; it is also the register named stride3q, so it stays
    ; the base only until stride3q is written
909*c0909341SAndroid Build Coastguard Worker    LEA                  r6, prep_ssse3
910*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
911*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
912*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
913*c0909341SAndroid Build Coastguard Worker    jnz .h
914*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
915*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
916*c0909341SAndroid Build Coastguard Worker    jnz .v
917*c0909341SAndroid Build Coastguard Worker.prep:
    ; load the 16-bit table entry for this width and turn it into an address
918*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    ; m4 = 0, used by the .prep_w* routines to zero-extend bytes to words
919*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
920*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
921*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
922*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .prep_w4: unfiltered path for 4-pixel-wide blocks.
; Per iteration: reads 4 rows of 4 bytes, zero-extends each byte to a word,
; multiplies by 16 (shift left by 4) and writes 32 bytes to tmp.
; m4 is zero here (cleared at .prep before the dispatch jump).
923*c0909341SAndroid Build Coastguard Worker.prep_w4:
924*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*0]
925*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+strideq*1]
926*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+strideq*2]
927*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+stride3q ]
928*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; m0 = rows 0 and 1, m2 = rows 2 and 3 (4 bytes each)
929*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
930*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
    ; bytes -> words by interleaving with zero
931*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m4
932*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
933*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
934*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
935*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
936*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
937*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
938*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
939*c0909341SAndroid Build Coastguard Worker    jg .prep_w4
940*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w8: unfiltered path for 8-pixel-wide blocks.
; Per iteration: reads 4 rows of 8 bytes, zero-extends each byte to a word,
; multiplies by 16 (shift left by 4) and writes 4 x 16 bytes to tmp.
; m4 is zero here (cleared at .prep before the dispatch jump).
941*c0909341SAndroid Build Coastguard Worker.prep_w8:
942*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
943*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*1]
944*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+strideq*2]
945*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+stride3q ]
946*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; bytes -> words by interleaving with zero
947*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m4
948*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m4
949*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4
950*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4
951*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
952*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
953*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
954*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
    ; one 16-byte store per row
955*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
956*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
957*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
958*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
959*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
960*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
961*c0909341SAndroid Build Coastguard Worker    jg .prep_w8
962*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w16: unfiltered path for 16-pixel-wide blocks.
; Per iteration: reads 2 rows of 16 bytes, zero-extends each byte to a word,
; multiplies by 16 (shift left by 4) and writes 4 x 16 bytes to tmp.
; m4 is zero here (cleared at .prep before the dispatch jump).
963*c0909341SAndroid Build Coastguard Worker.prep_w16:
964*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0]
965*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1]
966*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
    ; m0/m1 = pixels 0-7 / 8-15 of row 0, m2/m3 = the same for row 1
967*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4
968*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
969*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
970*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
971*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
972*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
973*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
974*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
975*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
976*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
977*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
978*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
979*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
980*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
981*c0909341SAndroid Build Coastguard Worker    jg .prep_w16
982*c0909341SAndroid Build Coastguard Worker    RET
; .prep_w128 / .prep_w64 / .prep_w32: unfiltered path for widths >= 32.
; r3 = -width. srcq is advanced by width once, so [srcq+r6] with r6 counting
; from -width up towards 0 walks the row from left to right. Each hloop
; iteration converts 32 pixels (bytes -> words, times 16) into 64 bytes of
; tmp; tmpq advances continuously across rows.
; r3 and r6 are reused as scratch here: after the dispatch jump the width
; index and the table base they held are no longer needed.
; m4 is zero here (cleared at .prep before the dispatch jump).
983*c0909341SAndroid Build Coastguard Worker.prep_w128:
984*c0909341SAndroid Build Coastguard Worker    mov                  r3, -128
985*c0909341SAndroid Build Coastguard Worker    jmp .prep_w32_start
986*c0909341SAndroid Build Coastguard Worker.prep_w64:
987*c0909341SAndroid Build Coastguard Worker    mov                  r3, -64
988*c0909341SAndroid Build Coastguard Worker    jmp .prep_w32_start
989*c0909341SAndroid Build Coastguard Worker.prep_w32:
990*c0909341SAndroid Build Coastguard Worker    mov                  r3, -32
991*c0909341SAndroid Build Coastguard Worker.prep_w32_start:
    ; r3 is negative, so this moves srcq to the end of the first row
992*c0909341SAndroid Build Coastguard Worker    sub                srcq, r3
993*c0909341SAndroid Build Coastguard Worker.prep_w32_vloop:
    ; restart the column offset at -width for every row
994*c0909341SAndroid Build Coastguard Worker    mov                  r6, r3
995*c0909341SAndroid Build Coastguard Worker.prep_w32_hloop:
996*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+16*0]
997*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6+16*1]
998*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m4
999*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m4
1000*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m4
1001*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4
1002*c0909341SAndroid Build Coastguard Worker    psllw                m0, 4
1003*c0909341SAndroid Build Coastguard Worker    psllw                m1, 4
1004*c0909341SAndroid Build Coastguard Worker    psllw                m2, 4
1005*c0909341SAndroid Build Coastguard Worker    psllw                m3, 4
1006*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
1007*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
1008*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
1009*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1010*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
    ; loop while the column offset is still negative (row not finished)
1011*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
1012*c0909341SAndroid Build Coastguard Worker    jl .prep_w32_hloop
1013*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1014*c0909341SAndroid Build Coastguard Worker    dec                  hd
1015*c0909341SAndroid Build Coastguard Worker    jg .prep_w32_vloop
1016*c0909341SAndroid Build Coastguard Worker    RET
; .h: setup for horizontal filtering (reached when mx != 0).
; Builds the pmaddubsw weight pair in m5, loads the byte shuffle mask into m4,
; then either continues at .hv (my != 0) or dispatches on width through the
; prep _bilin_h table.
1017*c0909341SAndroid Build Coastguard Worker.h:
1018*c0909341SAndroid Build Coastguard Worker    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
1019*c0909341SAndroid Build Coastguard Worker    ; = (16 - mx) * src[x] + mx * src[x + 1]
    ; Each 16-bit half of mxyd becomes mx*255 + 16 = (mx << 8) + (16 - mx),
    ; i.e. the byte pair (16 - mx, mx) when mx is in 1..15.
    ; NOTE(review): the 1..15 range of mx is assumed, not checked here.
1020*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 0x00ff00ff
    ; NOTE(review): table contents are outside this view; presumably the mask
    ; pairs each source byte with its right-hand neighbour. Confirm.
1021*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+subpel_h_shufD]
1022*c0909341SAndroid Build Coastguard Worker    add                mxyd, 0x00100010
1023*c0909341SAndroid Build Coastguard Worker    movd                 m5, mxyd
1024*c0909341SAndroid Build Coastguard Worker    mov                mxyd, r6m ; my
    ; broadcast the weight pair to every dword of m5
1025*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0000
1026*c0909341SAndroid Build Coastguard Worker    test               mxyd, mxyd
1027*c0909341SAndroid Build Coastguard Worker    jnz .hv
    ; horizontal only: jump through the 16-bit offset table indexed by wq
1028*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
1029*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1030*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w4: horizontal-only filter for 4-pixel-wide blocks.
; Per iteration: 4 rows are read (8 source bytes each, two rows per register),
; shuffled with the bilin_h_shuf4 mask, multiplied-and-added against the
; weight pair in m5 (built at .h), and 32 bytes of words are written to tmp.
; NOTE(review): bilin_h_shuf4 is defined outside this view; presumably it
; forms the (x, x+1) byte pairs for 4 pixels of each of the two rows.
1031*c0909341SAndroid Build Coastguard Worker.h_w4:
    ; the mask is loaded first because r6, the base used on x86-32, is also
    ; the register named stride3q and is overwritten by the next instruction
1032*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+bilin_h_shuf4]
1033*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1034*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
    ; m0 = rows 0 (low half) and 1 (high half), m1 = rows 2 and 3
1035*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
1036*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+strideq*1]
1037*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*2]
1038*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+stride3q ]
1039*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1040*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1041*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1042*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1043*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1044*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+0 ], m0
1045*c0909341SAndroid Build Coastguard Worker    mova          [tmpq+16], m1
1046*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1047*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1048*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
1049*c0909341SAndroid Build Coastguard Worker    RET
; .h_w8: horizontal-only filter for 8-pixel-wide blocks.
; Per iteration: 4 rows are read with one unaligned 16-byte load each,
; shuffled with the mask in m4 and multiplied-and-added against the weight
; pair in m5 (both set up at .h); 4 x 16 bytes of words are written to tmp.
1050*c0909341SAndroid Build Coastguard Worker.h_w8:
1051*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1052*c0909341SAndroid Build Coastguard Worker.h_w8_loop:
1053*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1054*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
1055*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*2]
1056*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+stride3q ]
1057*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; REPX applies the braced instruction to each listed register in turn
1058*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m0, m1, m2, m3
1059*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
1060*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
1061*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
1062*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
1063*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1064*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
1065*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1066*c0909341SAndroid Build Coastguard Worker    jg .h_w8_loop
1067*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16: horizontal-only filter for 16-pixel-wide blocks.
; Per iteration: 2 rows are processed. Every row is read with two unaligned
; 16-byte loads at byte offsets 0 and 8, each yielding 8 output words after
; the shuffle (m4) and multiply-add (m5), both set up at .h.
; 4 x 16 bytes of words are written to tmp.
1068*c0909341SAndroid Build Coastguard Worker.h_w16:
    ; m0/m1 = row 0 at offsets 0/8, m2/m3 = row 1 at offsets 0/8
1069*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+8*0]
1070*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+8*1]
1071*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+8*0]
1072*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1+8*1]
1073*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1074*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m0, m1, m2, m3
1075*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
1076*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
1077*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
1078*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
1079*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1080*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
1081*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1082*c0909341SAndroid Build Coastguard Worker    jg .h_w16
1083*c0909341SAndroid Build Coastguard Worker    RET
; .h_w128 / .h_w64 / .h_w32: horizontal-only filter for widths >= 32.
; r3 = -width. srcq is advanced by width once, so [srcq+r6] with r6 counting
; from -width up towards 0 walks the row from left to right. Each hloop
; iteration filters 32 pixels using four unaligned 16-byte loads at byte
; offsets 0, 8, 16 and 24 and writes 64 bytes of words to tmp; tmpq advances
; continuously across rows. m4 and m5 are the mask and weights set up at .h.
; r3 and r6 are reused as scratch here: after the dispatch jump the width
; index and the table base they held are no longer needed.
1084*c0909341SAndroid Build Coastguard Worker.h_w128:
1085*c0909341SAndroid Build Coastguard Worker    mov                  r3, -128
1086*c0909341SAndroid Build Coastguard Worker    jmp .h_w32_start
1087*c0909341SAndroid Build Coastguard Worker.h_w64:
1088*c0909341SAndroid Build Coastguard Worker    mov                  r3, -64
1089*c0909341SAndroid Build Coastguard Worker    jmp .h_w32_start
1090*c0909341SAndroid Build Coastguard Worker.h_w32:
1091*c0909341SAndroid Build Coastguard Worker    mov                  r3, -32
1092*c0909341SAndroid Build Coastguard Worker.h_w32_start:
    ; r3 is negative, so this moves srcq to the end of the first row
1093*c0909341SAndroid Build Coastguard Worker    sub                srcq, r3
1094*c0909341SAndroid Build Coastguard Worker.h_w32_vloop:
    ; restart the column offset at -width for every row
1095*c0909341SAndroid Build Coastguard Worker    mov                  r6, r3
1096*c0909341SAndroid Build Coastguard Worker.h_w32_hloop:
1097*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+8*0]
1098*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+8*1]
1099*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+r6+8*2]
1100*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+r6+8*3]
1101*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m4}, m0, m1, m2, m3
1102*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m5}, m0, m1, m2, m3
1103*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
1104*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
1105*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
1106*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1107*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
    ; loop while the column offset is still negative (row not finished)
1108*c0909341SAndroid Build Coastguard Worker    add                  r6, 32
1109*c0909341SAndroid Build Coastguard Worker    jl .h_w32_hloop
1110*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
1111*c0909341SAndroid Build Coastguard Worker    dec                  hd
1112*c0909341SAndroid Build Coastguard Worker    jg .h_w32_vloop
1113*c0909341SAndroid Build Coastguard Worker    RET
; .v: setup for vertical-only filtering (reached when mx == 0 and my != 0;
; mxyd holds my at this point).
; Builds the pmaddubsw weight pair in m5 and dispatches on width through the
; prep _bilin_v table.
1114*c0909341SAndroid Build Coastguard Worker.v:
    ; read the table entry first: r6 (table base) is also the register named
    ; stride3q and is overwritten below once wq has been computed
1115*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    ; Each 16-bit half of mxyd becomes my*255 + 16 = (my << 8) + (16 - my),
    ; i.e. the byte pair (16 - my, my) when my is in 1..15.
    ; NOTE(review): the 1..15 range of my is assumed, not checked here.
1116*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 0x00ff00ff
1117*c0909341SAndroid Build Coastguard Worker    add                mxyd, 0x00100010
1118*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1119*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
1120*c0909341SAndroid Build Coastguard Worker    movd                 m5, mxyd
    ; broadcast the weight pair to every dword of m5
1121*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0000
1122*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .v_w4: vertical-only filter for 4-pixel-wide blocks.
; m0 carries the last source row from one iteration to the next. Per
; iteration: rows 1-4 are read, vertically adjacent rows are byte-interleaved
; and multiplied-and-added against the weight pair in m5 (built at .v), and
; 4 output rows (32 bytes of words) are written to tmp.
1123*c0909341SAndroid Build Coastguard Worker.v_w4:
1124*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*0]
1125*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1126*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+strideq*1]
1127*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+strideq*2]
1128*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+stride3q ]
1129*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; m0 = rows 0,1 and m1 = rows 1,2 (4 bytes per row)
1130*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m1
1131*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2
    ; interleave so that every byte pair is (upper row, lower row)
1132*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; 01 12
1133*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1134*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
    ; srcq has already advanced by 4 rows: this is row 4, which is also
    ; row 0 of the next iteration
1135*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*0]
1136*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m3
1137*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0
1138*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3 ; 23 34
1139*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1140*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1141*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
1142*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1143*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1144*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8: vertical-only filter for 8-pixel-wide blocks.
; m0 carries the last source row from one iteration to the next. Per
; iteration: rows 1-4 are read (8 bytes each), vertically adjacent rows are
; byte-interleaved and multiplied-and-added against the weight pair in m5
; (built at .v), and 4 output rows (4 x 16 bytes of words) are written to tmp.
1145*c0909341SAndroid Build Coastguard Worker.v_w8:
1146*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
1147*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1148*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*1]
1149*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+strideq*2]
1150*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+stride3q ]
1151*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; every byte pair becomes (upper row, lower row)
1152*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; 01
1153*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2 ; 12
1154*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1155*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1156*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
    ; srcq has already advanced by 4 rows: this is row 4, which is also
    ; row 0 of the next iteration
1157*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0]
1158*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3 ; 23
1159*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m0 ; 34
1160*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1161*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
1162*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1163*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m2
1164*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1165*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
1166*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1167*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
1168*c0909341SAndroid Build Coastguard Worker    RET
; .v_w16: vertical-only filter for 16-pixel-wide blocks.
; m0 carries the last source row from one iteration to the next and m4 is
; used as a scratch register. Per iteration: rows 1-4 are read (16 bytes
; each); for every pair of vertically adjacent rows the low and high 8 pixels
; are byte-interleaved separately and multiplied-and-added against the weight
; pair in m5 (built at .v). 4 output rows (8 x 16 bytes of words) are written
; to tmp.
1169*c0909341SAndroid Build Coastguard Worker.v_w16:
1170*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1171*c0909341SAndroid Build Coastguard Worker.v_w16_loop:
1172*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
1173*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*2]
1174*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+stride3q ]
1175*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
    ; rows 0,1 -> output row 0 (m4 = pixels 0-7, m0 = pixels 8-15)
1176*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m1
1177*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m1
1178*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1179*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1180*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m4
    ; rows 1,2 -> output row 1 (m4 = pixels 0-7, m1 = pixels 8-15)
1181*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m2
1182*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m2
1183*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1184*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m0
    ; srcq has already advanced by 4 rows: this is row 4, which is also
    ; row 0 of the next iteration
1185*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1186*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1187*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m4
    ; rows 2,3 -> output row 2 (m4 = pixels 0-7, m2 = pixels 8-15)
1188*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m2, m3
1189*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m3
1190*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1191*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m1
1192*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1193*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*4], m4
    ; rows 3,4 -> output row 3 (m4 = pixels 0-7, m3 = pixels 8-15)
1194*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m3, m0
1195*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0
1196*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1197*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*5], m2
1198*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1199*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*6], m4
1200*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*7], m3
1201*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*8
1202*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1203*c0909341SAndroid Build Coastguard Worker    jg .v_w16_loop
1204*c0909341SAndroid Build Coastguard Worker    RET
1205*c0909341SAndroid Build Coastguard Worker.v_w128:
1206*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(3<<8)]
1207*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 256
1208*c0909341SAndroid Build Coastguard Worker    jmp .v_w32_start
1209*c0909341SAndroid Build Coastguard Worker.v_w64:
1210*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(1<<8)]
1211*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 128
1212*c0909341SAndroid Build Coastguard Worker    jmp .v_w32_start
1213*c0909341SAndroid Build Coastguard Worker.v_w32:
1214*c0909341SAndroid Build Coastguard Worker    xor                 r3d, r3d
1215*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64
1216*c0909341SAndroid Build Coastguard Worker.v_w32_start:
1217*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1218*c0909341SAndroid Build Coastguard Worker %if WIN64
1219*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
1220*c0909341SAndroid Build Coastguard Worker %endif
1221*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
1222*c0909341SAndroid Build Coastguard Worker%endif
1223*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
1224*c0909341SAndroid Build Coastguard Worker.v_w32_hloop:
1225*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+16*0]
1226*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+16*1]
1227*c0909341SAndroid Build Coastguard Worker.v_w32_vloop:
1228*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+16*0]
1229*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1+16*1]
1230*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1231*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0, m2
1232*c0909341SAndroid Build Coastguard Worker    punpckhbw            m0, m2
1233*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1234*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5
1235*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m4
1236*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m0
1237*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+16*0]
1238*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m1, m3
1239*c0909341SAndroid Build Coastguard Worker    punpckhbw            m1, m3
1240*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1241*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5
1242*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m4
1243*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m1
1244*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+16*1]
1245*c0909341SAndroid Build Coastguard Worker    add                tmpq, r6
1246*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m2, m0
1247*c0909341SAndroid Build Coastguard Worker    punpckhbw            m2, m0
1248*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1249*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5
1250*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m4
1251*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1252*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m3, m1
1253*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m1
1254*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5
1255*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5
1256*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*2], m4
1257*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*3], m3
1258*c0909341SAndroid Build Coastguard Worker    add                tmpq, r6
1259*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1260*c0909341SAndroid Build Coastguard Worker    jg .v_w32_vloop
1261*c0909341SAndroid Build Coastguard Worker    add                  r5, 32
1262*c0909341SAndroid Build Coastguard Worker    movzx                hd, r3b
1263*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
1264*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1265*c0909341SAndroid Build Coastguard Worker    add                  r7, 16*4
1266*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
1267*c0909341SAndroid Build Coastguard Worker%else
1268*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpmp
1269*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*4
1270*c0909341SAndroid Build Coastguard Worker    mov               tmpmp, tmpq
1271*c0909341SAndroid Build Coastguard Worker%endif
1272*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 1<<8
1273*c0909341SAndroid Build Coastguard Worker    jg .v_w32_hloop
1274*c0909341SAndroid Build Coastguard Worker%if WIN64
1275*c0909341SAndroid Build Coastguard Worker    POP                  r7
1276*c0909341SAndroid Build Coastguard Worker%endif
1277*c0909341SAndroid Build Coastguard Worker    RET
1278*c0909341SAndroid Build Coastguard Worker.hv:
1279*c0909341SAndroid Build Coastguard Worker    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
1280*c0909341SAndroid Build Coastguard Worker    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
1281*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
1282*c0909341SAndroid Build Coastguard Worker    imul               mxyd, 0x08000800
1283*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM 8
1284*c0909341SAndroid Build Coastguard Worker    movd                 m6, mxyd
1285*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
1286*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q0000
1287*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1288*c0909341SAndroid Build Coastguard Worker.hv_w4:
1289*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+bilin_h_shuf4]
1290*c0909341SAndroid Build Coastguard Worker    movddup              m0, [srcq+strideq*0]
1291*c0909341SAndroid Build Coastguard Worker    lea                  r3, [strideq*3]
1292*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1293*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5            ; _ 0
1294*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1295*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*1]
1296*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+strideq*2]
1297*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+r3       ]
1298*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
1299*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+strideq*0]
1300*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1301*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1302*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5            ; 1 2
1303*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5            ; 3 4
1304*c0909341SAndroid Build Coastguard Worker    shufpd               m0, m1, 0x01      ; 0 1
1305*c0909341SAndroid Build Coastguard Worker    shufpd               m3, m1, m2, 0x01  ; 2 3
1306*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0
1307*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1308*c0909341SAndroid Build Coastguard Worker    paddw                m1, m0
1309*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1310*c0909341SAndroid Build Coastguard Worker    psubw                m2, m3
1311*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1312*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1313*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m1
1314*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1315*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
1316*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
1317*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1318*c0909341SAndroid Build Coastguard Worker    RET
1319*c0909341SAndroid Build Coastguard Worker.hv_w8:
1320*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0]
1321*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1322*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; 0
1323*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
1324*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*1]
1325*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1326*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*0]
1327*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1328*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1329*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5 ; 1
1330*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5 ; 2
1331*c0909341SAndroid Build Coastguard Worker    psubw                m3, m1, m0
1332*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1333*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1334*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
1335*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1
1336*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1337*c0909341SAndroid Build Coastguard Worker    paddw                m2, m1
1338*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m3
1339*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1340*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16*2
1341*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1342*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
1343*c0909341SAndroid Build Coastguard Worker    RET
1344*c0909341SAndroid Build Coastguard Worker.hv_w128:
1345*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(7<<8)]
1346*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 256
1347*c0909341SAndroid Build Coastguard Worker    jmp .hv_w16_start
1348*c0909341SAndroid Build Coastguard Worker.hv_w64:
1349*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(3<<8)]
1350*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 128
1351*c0909341SAndroid Build Coastguard Worker    jmp .hv_w16_start
1352*c0909341SAndroid Build Coastguard Worker.hv_w32:
1353*c0909341SAndroid Build Coastguard Worker    lea                 r3d, [hq+(1<<8)]
1354*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 64
1355*c0909341SAndroid Build Coastguard Worker    jmp .hv_w16_start
1356*c0909341SAndroid Build Coastguard Worker.hv_w16:
1357*c0909341SAndroid Build Coastguard Worker    xor                 r3d, r3d
1358*c0909341SAndroid Build Coastguard Worker    mov                 r5d, 32
1359*c0909341SAndroid Build Coastguard Worker.hv_w16_start:
1360*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
1361*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1362*c0909341SAndroid Build Coastguard Worker %if WIN64
1363*c0909341SAndroid Build Coastguard Worker    PUSH                 r7
1364*c0909341SAndroid Build Coastguard Worker %endif
1365*c0909341SAndroid Build Coastguard Worker    mov                  r7, tmpq
1366*c0909341SAndroid Build Coastguard Worker%endif
1367*c0909341SAndroid Build Coastguard Worker.hv_w16_hloop:
1368*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+8*0]
1369*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+8*1]
1370*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1371*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1372*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; 0a
1373*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5 ; 0b
1374*c0909341SAndroid Build Coastguard Worker.hv_w16_vloop:
1375*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+strideq*1+8*0]
1376*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m4
1377*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m5 ; 1a
1378*c0909341SAndroid Build Coastguard Worker    psubw                m3, m2, m0
1379*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
1380*c0909341SAndroid Build Coastguard Worker    paddw                m3, m0
1381*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m3
1382*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+strideq*1+8*1]
1383*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
1384*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m4
1385*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m5 ; 1b
1386*c0909341SAndroid Build Coastguard Worker    psubw                m0, m3, m1
1387*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6
1388*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1
1389*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m0
1390*c0909341SAndroid Build Coastguard Worker    add                tmpq, r5
1391*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+strideq*0+8*0]
1392*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4
1393*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m5 ; 2a
1394*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, m2
1395*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
1396*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2
1397*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m1
1398*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+strideq*0+8*1]
1399*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m4
1400*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m5 ; 2b
1401*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, m3
1402*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
1403*c0909341SAndroid Build Coastguard Worker    paddw                m2, m3
1404*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
1405*c0909341SAndroid Build Coastguard Worker    add                tmpq, r5
1406*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1407*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_vloop
1408*c0909341SAndroid Build Coastguard Worker    movzx                hd, r3b
1409*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1410*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
1411*c0909341SAndroid Build Coastguard Worker    add                  r7, 2*16
1412*c0909341SAndroid Build Coastguard Worker    mov                srcq, r6
1413*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r7
1414*c0909341SAndroid Build Coastguard Worker%else
1415*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpm
1416*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
1417*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*16
1418*c0909341SAndroid Build Coastguard Worker    mov                srcq, r6
1419*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
1420*c0909341SAndroid Build Coastguard Worker%endif
1421*c0909341SAndroid Build Coastguard Worker    sub                 r3d, 1<<8
1422*c0909341SAndroid Build Coastguard Worker    jg .hv_w16_hloop
1423*c0909341SAndroid Build Coastguard Worker%if WIN64
1424*c0909341SAndroid Build Coastguard Worker    POP                  r7
1425*c0909341SAndroid Build Coastguard Worker%endif
1426*c0909341SAndroid Build Coastguard Worker    RET
1427*c0909341SAndroid Build Coastguard Worker
1428*c0909341SAndroid Build Coastguard Worker; int8_t subpel_filters[5][15][8]
; Packed filter-set selectors: each constant carries two row offsets
; (multiples of 15, i.e. one filter set per 15 rows), one in bits 16 and up
; and one in the low byte.
; put_6tap_8bpc adds the subpel position replicated into bytes 0..2
; (imul by 0x010101), tests bits 8-11 (0xf00) to see whether the position is
; non-zero, and in its vertical path indexes the table with the upper half,
; switching to the low byte when h < 6.
; NOTE(review): the low halves (3*15 / 4*15) presumably select the
; reduced-tap ("4tap") rows of the table -- confirm against subpel_filters.
1429*c0909341SAndroid Build Coastguard Worker%assign FILTER_REGULAR (0*15 << 16) | 3*15
1430*c0909341SAndroid Build Coastguard Worker%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
1431*c0909341SAndroid Build Coastguard Worker%assign FILTER_SHARP   (2*15 << 16) | 3*15
1432*c0909341SAndroid Build Coastguard Worker
; FN prefix, type, type_h, type_v [, jmp_to]
; Emits the entry stub `<prefix>_<type>_8bpc`. The stub only loads the packed
; filter selectors (FILTER_<type_h> into t0d, FILTER_<type_v> into t1d) and
; then hands over to a shared implementation:
;   - with jmp_to:    jumps to the (mangled) function named by jmp_to
;   - without jmp_to: emits no jump, so execution continues into whatever is
;                     assembled directly after the macro invocation
; t0/t1 must have been set up with DECLARE_REG_TMP before the macro is used.
1433*c0909341SAndroid Build Coastguard Worker%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
1434*c0909341SAndroid Build Coastguard Workercglobal %1_%2_8bpc
1435*c0909341SAndroid Build Coastguard Worker    mov                 t0d, FILTER_%3
    ; same filter in both directions: copy the register instead of loading
    ; the immediate a second time
1436*c0909341SAndroid Build Coastguard Worker%ifidn %3, %4
1437*c0909341SAndroid Build Coastguard Worker    mov                 t1d, t0d
1438*c0909341SAndroid Build Coastguard Worker%else
1439*c0909341SAndroid Build Coastguard Worker    mov                 t1d, FILTER_%4
1440*c0909341SAndroid Build Coastguard Worker%endif
1441*c0909341SAndroid Build Coastguard Worker%if %0 == 5 ; jmp_to given: branch to the shared body; the last stub omits it and falls through
1442*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
1443*c0909341SAndroid Build Coastguard Worker%endif
1444*c0909341SAndroid Build Coastguard Worker%endmacro
1445*c0909341SAndroid Build Coastguard Worker
; Scratch registers t0/t1 used by the FN entry stubs to pass the packed
; FILTER_* selectors to the shared function body. The register numbers differ
; per ABI.
; NOTE(review): the choice presumably avoids registers that hold live
; arguments at function entry on each ABI -- confirm against x86inc.
1446*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1447*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2
1448*c0909341SAndroid Build Coastguard Worker%elif WIN64
1449*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 4, 5
1450*c0909341SAndroid Build Coastguard Worker%else
1451*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7, 8
1452*c0909341SAndroid Build Coastguard Worker%endif
1453*c0909341SAndroid Build Coastguard Worker
; base_reg holds the address of put_ssse3 (loaded with LEA in the function
; body). On x86-32 constants are addressed relative to it via `base`; on
; x86-64 `base` is 0, so [base+sym] is just [sym].
; base_reg shares a register with t0 on x86-32 (r1) and with t1 on non-Windows
; x86-64 (r8): put_6tap_8bpc consumes t0d/t1d before it loads base_reg.
1454*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1455*c0909341SAndroid Build Coastguard Worker %define base_reg r1
1456*c0909341SAndroid Build Coastguard Worker %define base base_reg-put_ssse3
1457*c0909341SAndroid Build Coastguard Worker%else
1458*c0909341SAndroid Build Coastguard Worker %define base_reg r8
1459*c0909341SAndroid Build Coastguard Worker %define base 0
1460*c0909341SAndroid Build Coastguard Worker%endif
1461*c0909341SAndroid Build Coastguard Worker
; Entry stubs for the filter combinations handled by put_6tap_8bpc.
; The first three jump there explicitly; `regular` passes no jump target, so
; it must stay last and be immediately followed by put_6tap_8bpc, into which
; it falls through.
; NOTE(review): presumably these REGULAR/SMOOTH combinations only need six
; taps -- confirm against the subpel_filters table.
1462*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_FN FN put_8tap,
1463*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_8bpc
1464*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_8bpc
1465*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_8bpc
1466*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular,        REGULAR, REGULAR
1467*c0909341SAndroid Build Coastguard Worker
1468*c0909341SAndroid Build Coastguard Workercglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ns
1469*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
1470*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
1471*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1472*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
1473*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
1474*c0909341SAndroid Build Coastguard Worker%else
1475*c0909341SAndroid Build Coastguard Worker    imul                ssd, mym, 0x010101
1476*c0909341SAndroid Build Coastguard Worker    add                 ssd, t1d ; 8tap_v, my, 4tap_v
1477*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
1478*c0909341SAndroid Build Coastguard Worker%endif
1479*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
1480*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
1481*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, put_ssse3
1482*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
1483*c0909341SAndroid Build Coastguard Worker    jnz .h
1484*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1485*c0909341SAndroid Build Coastguard Worker    test                ssd, 0xf00
1486*c0909341SAndroid Build Coastguard Worker%else
1487*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1488*c0909341SAndroid Build Coastguard Worker%endif
1489*c0909341SAndroid Build Coastguard Worker    jnz .v
1490*c0909341SAndroid Build Coastguard Worker.put:
1491*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
1492*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
1493*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
1494*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
1495*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
1496*c0909341SAndroid Build Coastguard Worker%if WIN64
1497*c0909341SAndroid Build Coastguard Worker    pop                  r8
1498*c0909341SAndroid Build Coastguard Worker%endif
1499*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
1500*c0909341SAndroid Build Coastguard Worker    jmp                  wq
1501*c0909341SAndroid Build Coastguard Worker.h:
1502*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1503*c0909341SAndroid Build Coastguard Worker    test                ssd, 0xf00
1504*c0909341SAndroid Build Coastguard Worker%else
1505*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
1506*c0909341SAndroid Build Coastguard Worker%endif
1507*c0909341SAndroid Build Coastguard Worker    jnz .hv
1508*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
1509*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
1510*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1511*c0909341SAndroid Build Coastguard Worker    jle mangle(private_prefix %+ _put_8tap_8bpc %+ SUFFIX).h_w4
1512*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
1513*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1514*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+subpel_h_shufD]
1515*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufE]
1516*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+subpel_h_shufF]
1517*c0909341SAndroid Build Coastguard Worker%endif
1518*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
1519*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
1520*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
1521*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m7
1522*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000
1523*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
1524*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q2222
1525*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
1526*c0909341SAndroid Build Coastguard Worker    jge .h_w16
; PUT_6TAP_H dst/src, tmp1, tmp2
; Horizontal 6-tap filter of the 16 source bytes in %1; the result is left in
; %1 as words and %2/%3 are clobbered.
; Expects, as set up by the surrounding .h code:
;   m4, m6, m7 = filter coefficient pairs (1st/2nd, 3rd/4th, 5th/6th of the
;                six loaded taps), broadcast to every word
;   m5         = pw_34 rounding bias
;   m8-m10     = subpel_h_shufD/E/F (x86-64 only; x86-32 reads the shuffle
;                masks from memory instead)
1527*c0909341SAndroid Build Coastguard Worker%macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
    ; gather the byte pairs that line up with each coefficient pair
1528*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1529*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, [base+subpel_h_shufD]
1530*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, [base+subpel_h_shufE]
1531*c0909341SAndroid Build Coastguard Worker    pshufb               %1, [base+subpel_h_shufF]
1532*c0909341SAndroid Build Coastguard Worker%else
1533*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, m8
1534*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, m9
1535*c0909341SAndroid Build Coastguard Worker    pshufb               %1, m10
1536*c0909341SAndroid Build Coastguard Worker%endif
    ; multiply-accumulate each byte pair with its coefficient pair
1537*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, m4
1538*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, m6
1539*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, m7
    ; add the bias and the three partial sums, then scale down by 6 bits
1540*c0909341SAndroid Build Coastguard Worker    paddw                %2, m5
1541*c0909341SAndroid Build Coastguard Worker    paddw                %2, %3
1542*c0909341SAndroid Build Coastguard Worker    paddw                %1, %2
1543*c0909341SAndroid Build Coastguard Worker    psraw                %1, 6
1544*c0909341SAndroid Build Coastguard Worker%endmacro
1545*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1546*c0909341SAndroid Build Coastguard Worker    mov                  r4, dsm
1547*c0909341SAndroid Build Coastguard Worker%endif
1548*c0909341SAndroid Build Coastguard Worker.h_w8:
1549*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
1550*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
1551*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1552*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H           m0, m2, m3
1553*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H           m1, m2, m3
1554*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1555*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1556*c0909341SAndroid Build Coastguard Worker    movq        [dstq+r4*0], m0
1557*c0909341SAndroid Build Coastguard Worker    movhps      [dstq+r4*1], m0
1558*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+r4*2]
1559*c0909341SAndroid Build Coastguard Worker%else
1560*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
1561*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
1562*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1563*c0909341SAndroid Build Coastguard Worker%endif
1564*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1565*c0909341SAndroid Build Coastguard Worker    jg .h_w8
1566*c0909341SAndroid Build Coastguard Worker    RET
1567*c0909341SAndroid Build Coastguard Worker.h_w16:
1568*c0909341SAndroid Build Coastguard Worker    add                srcq, wq
1569*c0909341SAndroid Build Coastguard Worker    add                dstq, wq
1570*c0909341SAndroid Build Coastguard Worker    neg                  wq
1571*c0909341SAndroid Build Coastguard Worker.h_w16_loop_v:
1572*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq
1573*c0909341SAndroid Build Coastguard Worker.h_w16_loop_h:
1574*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+8*0]
1575*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+8*1]
1576*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H           m0, m2, m3
1577*c0909341SAndroid Build Coastguard Worker    PUT_6TAP_H           m1, m2, m3
1578*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
1579*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r6], m0
1580*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
1581*c0909341SAndroid Build Coastguard Worker    jle .h_w16_loop_h
1582*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
1583*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
1584*c0909341SAndroid Build Coastguard Worker    dec                  hd
1585*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop_v
1586*c0909341SAndroid Build Coastguard Worker    RET
1587*c0909341SAndroid Build Coastguard Worker.v:
1588*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1589*c0909341SAndroid Build Coastguard Worker    %define             dsq  r4
1590*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_512]
1591*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb
1592*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16
1593*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1594*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd
1595*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
1596*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssm
1597*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m7
1598*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q0000
1599*c0909341SAndroid Build Coastguard Worker    mov                  r6, ssq
1600*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
1601*c0909341SAndroid Build Coastguard Worker    neg                  r6
1602*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q2222
1603*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1604*c0909341SAndroid Build Coastguard Worker    jge .v_w4
1605*c0909341SAndroid Build Coastguard Worker%else
1606*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9, 12
1607*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1608*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1609*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1610*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1611*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg-put_ssse3+subpel_filters+1+myq*8]
1612*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_512]
1613*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m7
1614*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q0000
1615*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1616*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
1617*c0909341SAndroid Build Coastguard Worker    neg                 nsq
1618*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q2222
1619*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1620*c0909341SAndroid Build Coastguard Worker    je .v_w4
1621*c0909341SAndroid Build Coastguard Worker    jg .v_w8
1622*c0909341SAndroid Build Coastguard Worker%endif
; Vertical 6-tap filter for 2-pixel-wide blocks, two output rows per
; iteration. Row numbers in the comments count from two rows above srcq
; (row 0 = srcq - 2*stride) and refer to the first iteration.
; Expects m5/m6/m7 = broadcast coefficient pairs and m8 = pw_512, as set up
; in .v; the negated source stride is in r6 (x86-32) or nsq (x86-64).
1623*c0909341SAndroid Build Coastguard Worker.v_w2:
1624*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1625*c0909341SAndroid Build Coastguard Worker    mov                 dsq, dsm
1626*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+r6 *2]
1627*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+r6 *1]
1628*c0909341SAndroid Build Coastguard Worker%else
1629*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+nsq*2]
1630*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+nsq*1]
1631*c0909341SAndroid Build Coastguard Worker%endif
1632*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0]
1633*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1]
1634*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1635*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
    ; pair up consecutive rows, then interleave their bytes so that each word
    ; holds two vertically adjacent pixels for pmaddubsw
1636*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3     ; 0 1
1637*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2     ; 1 2
1638*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4     ; 2 3
1639*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0     ; 3 4
1640*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3     ; 01 12
1641*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4     ; 23 34
    ; loop state: m1 = 01 12, m2 = 23 34, m0 = newest row loaded so far
1642*c0909341SAndroid Build Coastguard Worker.v_w2_loop:
1643*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1]
1644*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1645*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m5 ; a0 b0
1646*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
1647*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6     ; a1 b1
1648*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
1649*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m0, m3 ; 4 5
1650*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
1651*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0     ; 5 6
1652*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 45 56
1653*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m7 ; a2 b2
1654*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
    ; pmulhrsw by 512 is a rounding shift right by 6 (assuming pw_512 holds
    ; 512 in every word)
1655*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8
1656*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
    ; store two bytes per row; r6 is free as scratch here because the negated
    ; stride it held on x86-32 was only needed for the loads before the loop
1657*c0909341SAndroid Build Coastguard Worker    movd                r6d, m4
1658*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
1659*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
1660*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w
1661*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1662*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
1663*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
1664*c0909341SAndroid Build Coastguard Worker    RET
; .v_w4: vertical filter operating on 4-pixel-wide columns, two output rows per
; inner-loop iteration. On x86-32 an outer loop (.v_w4_loop0) walks the block in
; 4-pixel strips; the x86-64 build makes a single pass and returns.
; m5/m6/m7 (coefficient pairs) and m8 (pmulhrsw multiplier) are set up earlier.
1665*c0909341SAndroid Build Coastguard Worker.v_w4:
1666*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; r6d becomes h + (w << 14) - (1 << 16): h sits in the low 16 bits and, for w a
; multiple of 4, the upper bits count the strips still to do after this one.
; The incoming r6 is consumed by the srcq adjustment before being overwritten.
; NOTE(review): incoming r6 is presumably the negated source stride, making the
; adjustment a move two rows up - confirm against the setup code.
1667*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14
1668*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+r6*2]
1669*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-(1<<16)]
1670*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq   ; remember this strip's start for the outer loop
1671*c0909341SAndroid Build Coastguard Worker    mov                 dsq, dsm
1672*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
1673*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*0] ; row 0
1674*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1] ; row 1
1675*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1676*c0909341SAndroid Build Coastguard Worker%else
1677*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+nsq*2]
1678*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+nsq*1]
1679*c0909341SAndroid Build Coastguard Worker%endif
; m2, m4, m0 = rows 2, 3, 4. punpckldq pairs the 4 pixels of adjacent rows and
; punpcklbw interleaves them bytewise, giving one row pair per 16-bit lane.
1680*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0]
1681*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1]
1682*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1683*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
1684*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3     ; 0 1
1685*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m2     ; 1 2
1686*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4     ; 2 3
1687*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0     ; 3 4
1688*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3     ; 01 12
1689*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4     ; 23 34
; Loop invariant on entry: m1 = 01 12, m2 = 23 34, m0 = row 4 (raw bytes).
1690*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
1691*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1] ; row 5
1692*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1693*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m5 ; a0 b0
1694*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2     ; 23 34 becomes the next iteration's 01 12
1695*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6     ; a1 b1
1696*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
1697*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0, m3 ; 4 5
1698*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 6 (row 4 of the next iteration)
1699*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0     ; 5 6
1700*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 45 56
1701*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m7 ; a2 b2
1702*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
1703*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8     ; rounded scaling of the 16-bit sums
1704*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4     ; bytes: a0..a3 b0..b3
1705*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m4     ; output row a (4 bytes)
1706*c0909341SAndroid Build Coastguard Worker    psrlq                m4, 32
1707*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m4     ; output row b (4 bytes)
1708*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1709*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2      ; two rows written per iteration
1710*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
1711*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Next strip: reload the saved strip pointers, restore h from the low 16 bits of
; r6d, step both pointers 4 bytes to the right and save them again, then
; decrement the strip counter held in the upper bits of r6d.
1712*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
1713*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
1714*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w
1715*c0909341SAndroid Build Coastguard Worker    add                srcq, 4
1716*c0909341SAndroid Build Coastguard Worker    add                dstq, 4
1717*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
1718*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
1719*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
1720*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
1721*c0909341SAndroid Build Coastguard Worker%endif
1722*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8 (x86-64 only): vertical filter processed in 8-pixel-wide strips, two
; output rows per inner-loop iteration. The outer loop (.v_w8_loop0) advances
; src/dst by 8 bytes per strip; r4 and r7 are the per-strip row cursors.
; m5/m6/m7 (coefficient pairs) and m8 (pmulhrsw multiplier) are set up earlier.
1723*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
1724*c0909341SAndroid Build Coastguard Worker.v_w8:
; x86inc macro; m9-m11 are used below.
; NOTE(review): presumably extends the Win64 callee-saved XMM save area to cover
; 12 registers - confirm against x86inc.asm.
1725*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
; r6d = h + (w << 5) - 256: h sits in the low byte and, for w a multiple of 8,
; the bits above it count the strips still to do after the current one.
1726*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5
1727*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-256]
1728*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
; m1, m2, m3, m4, m0 = rows 0..4 (8 pixels each); nsq addresses the two rows
; above srcq. punpcklbw interleaves adjacent rows bytewise into row pairs.
1729*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+nsq*2]
1730*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+nsq*1]
1731*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2] ; r4 = source row cursor for this strip
1732*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
1733*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
1734*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq         ; r7 = destination row cursor for this strip
1735*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r4  +ssq*0]
1736*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2     ; 01
1737*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 12
1738*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4     ; 23
1739*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0     ; 34
; Loop invariant on entry: m1 = 01, m2 = 12, m3 = 23, m4 = 34, m0 = row 4.
; m10 accumulates output row a, m11 output row b.
1740*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
1741*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m1, m5 ; a0
1742*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3     ; 23 becomes the next iteration's 01
1743*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m2, m5 ; b0
1744*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4     ; 34 becomes the next iteration's 12
1745*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6     ; a1
1746*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6     ; b1
1747*c0909341SAndroid Build Coastguard Worker    paddw               m10, m3
1748*c0909341SAndroid Build Coastguard Worker    paddw               m11, m4
1749*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r4+ssq*1] ; row 5
1750*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
1751*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m0, m4 ; 45
1752*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r4+ssq*0] ; row 6 (row 4 of the next iteration)
1753*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0     ; 56
1754*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m3, m7 ; a2
1755*c0909341SAndroid Build Coastguard Worker    paddw               m10, m9
1756*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m4, m7 ; b2
1757*c0909341SAndroid Build Coastguard Worker    paddw               m11, m9
1758*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m8     ; rounded scaling of the 16-bit sums
1759*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m8
1760*c0909341SAndroid Build Coastguard Worker    packuswb            m10, m11    ; low 8 bytes = row a, high 8 bytes = row b
1761*c0909341SAndroid Build Coastguard Worker    movq         [r7+dsq*0], m10
1762*c0909341SAndroid Build Coastguard Worker    movhps       [r7+dsq*1], m10
1763*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
1764*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2      ; two rows written per iteration
1765*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; Next strip: step 8 bytes to the right, restore h from the low byte of r6d and
; decrement the strip counter held above it.
1766*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
1767*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
1768*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
1769*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
1770*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
1771*c0909341SAndroid Build Coastguard Worker    RET
1772*c0909341SAndroid Build Coastguard Worker%endif ;ARCH_X86_64
; .hv: entry for combined horizontal + vertical filtering. Widths above 4 branch
; to .hv_w8. Otherwise this code loads a 4-byte horizontal filter into m1,
; broadcasts the three vertical coefficient pairs (as 16-bit words) into
; m8/m9/m10, and dispatches to .hv_w4 (w == 4) or falls through to .hv_w2.
1773*c0909341SAndroid Build Coastguard Worker.hv:
1774*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1775*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1776*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
; Only the low bits of mx (7 bits on x86-32, the low byte on x86-64) select the
; horizontal filter-table row on this path.
1777*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1778*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
1779*c0909341SAndroid Build Coastguard Worker%else
1780*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
1781*c0909341SAndroid Build Coastguard Worker%endif
1782*c0909341SAndroid Build Coastguard Worker    dec                srcq         ; start one byte to the left of the block
; 4 bytes taken from offset 2 of the selected 8-byte table row.
1783*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base_reg-put_ssse3+subpel_filters+2+mxq*8]
1784*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Vertical filter index: the low byte is used when h - 6 is negative (h < 6),
; otherwise the value in bits 16 and up. mxd is reused as a temporary.
; NOTE(review): ssd presumably carries the packed vertical selector at this
; point on x86-32 (it plays the role myd has in the x86-64 branch) - confirm.
1785*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb
1786*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16
1787*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1788*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd
; 8 bytes starting at offset 1 of the selected table row; only the first three
; byte pairs are broadcast below.
1789*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
1790*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssmp   ; ss was used as scratch above; reload it from its memory operand
; x86inc stack allocation of mmsize*4 bytes. Slots 0-2 hold the vertical
; coefficient pairs (aliased as m8-m10); slot 3 is used by .hv_w4.
; NOTE(review): the negative size selects an x86inc allocation mode - see
; x86inc.asm for its exact meaning.
1791*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   -mmsize*4
1792*c0909341SAndroid Build Coastguard Worker    %define              m8  [rsp+mmsize*0]
1793*c0909341SAndroid Build Coastguard Worker    %define              m9  [rsp+mmsize*1]
1794*c0909341SAndroid Build Coastguard Worker    %define             m10  [rsp+mmsize*2]
; punpcklbw + psraw 8 widens the signed byte coefficients to words. The two
; sub instructions move srcq up two rows, so the first row is at [srcq+ssq*0].
1795*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1796*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
1797*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1798*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
1799*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
1800*c0909341SAndroid Build Coastguard Worker    mova                 m8, m2     ; first coefficient pair, broadcast
1801*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
1802*c0909341SAndroid Build Coastguard Worker    mova                 m9, m2     ; second coefficient pair, broadcast
1803*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q2222
1804*c0909341SAndroid Build Coastguard Worker    mova                m10, m2     ; third coefficient pair, broadcast
1805*c0909341SAndroid Build Coastguard Worker%else
; Same selection as the x86-32 branch, but on myd and with m8-m10 in registers.
1806*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1807*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1808*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1809*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1810*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+myq*8]
1811*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11, 14
1812*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1813*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
1814*c0909341SAndroid Build Coastguard Worker    neg                 nsq         ; nsq = -ssq, used to address the rows above srcq
1815*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
1816*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
1817*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
1818*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
1819*c0909341SAndroid Build Coastguard Worker%endif
1820*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
1821*c0909341SAndroid Build Coastguard Worker    je .hv_w4
; .hv_w2: horizontal + vertical filtering of a 2-pixel-wide block, two output
; rows per loop iteration. Each source row is first filtered horizontally
; (pshufb / pmaddubsw / phaddw, bias m6, >> 2) into 16-bit intermediates, which
; are then combined vertically with pmaddwd against m8/m9/m10.
; On entry m1 holds the 4-byte horizontal filter loaded in .hv.
1822*c0909341SAndroid Build Coastguard Worker.hv_w2:
1823*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+subpel_h_shuf4] ; byte shuffle applied before pmaddubsw
1824*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pw_34]          ; bias added before the >> 2
1825*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m1, q0000             ; horizontal coefficients, broadcast
1826*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; srcq was already moved up two rows in .hv, so rows 0/1 are read at ssq*0/1.
1827*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*0]
1828*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ssq*1]
1829*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; NOTE(review): presumably loads the destination stride from its stack
; argument slot - confirm the offset against the function prototype.
1830*c0909341SAndroid Build Coastguard Worker    mov                 dsq, [rstk+stack_offset+gprsize*2]
1831*c0909341SAndroid Build Coastguard Worker%else
1832*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+nsq*2]
1833*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+nsq*1] ; 0 1
1834*c0909341SAndroid Build Coastguard Worker%endif
1835*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
1836*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*1] ; 2 3
1837*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1838*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; 4
1839*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m5}, m2, m1, m0
1840*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m7}, m2, m1, m0
; phaddw completes the horizontal sums: one dword (two 16-bit pixels) per row.
1841*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m1
1842*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m0
1843*c0909341SAndroid Build Coastguard Worker    paddw                m2, m6
1844*c0909341SAndroid Build Coastguard Worker    paddw                m0, m6
1845*c0909341SAndroid Build Coastguard Worker    psraw                m2, 2            ; 0 1 2 3
1846*c0909341SAndroid Build Coastguard Worker    psraw                m0, 2
1847*c0909341SAndroid Build Coastguard Worker    palignr              m0, m2, 4        ; 1 2 3 4
1848*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m0       ; 01 12
1849*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0           ; 23 34
; Loop invariant on entry: m1 = 01 12, m2 = 23 34, and the top dword of m0 is
; the most recent filtered row (row 4).
1850*c0909341SAndroid Build Coastguard Worker.hv_w2_loop:
1851*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*1]
1852*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1853*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+ssq*0] ; 5 6
1854*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5
1855*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7
1856*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m8, m1       ; a0 b0
1857*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2           ; 23 34 becomes the next iteration's 01 12
1858*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9           ; a1 b1
1859*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m3           ; 5 6 5 6
1860*c0909341SAndroid Build Coastguard Worker    paddw                m3, m6
1861*c0909341SAndroid Build Coastguard Worker    psraw                m3, 2
1862*c0909341SAndroid Build Coastguard Worker    paddd                m4, m2
1863*c0909341SAndroid Build Coastguard Worker    palignr              m2, m3, m0, 12   ; 4 5
1864*c0909341SAndroid Build Coastguard Worker    mova                 m0, m3           ; its top dword (row 6) is row 4 of the next iteration
1865*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3           ; 45 56
1866*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m10, m2      ; a2 b2
1867*c0909341SAndroid Build Coastguard Worker    paddd                m4, m3
; No separate rounding add precedes this shift.
; NOTE(review): presumably the rounding is folded into the pw_34 bias - confirm.
1868*c0909341SAndroid Build Coastguard Worker    psrad                m4, 10
; Only the low four words matter: the m5 operand merely fills the upper half
; of the pack, which is never stored.
1869*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
1870*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4           ; bytes: a0 a1 b0 b1
1871*c0909341SAndroid Build Coastguard Worker    movd                r6d, m4
1872*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w          ; output row a (2 bytes)
1873*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
1874*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w          ; output row b (2 bytes)
1875*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1876*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2            ; two rows written per iteration
1877*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
1878*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w4: horizontal + vertical filtering of a 4-pixel-wide block, two output
; rows per loop iteration. Rows are filtered horizontally into 16-bit
; intermediates (shuffle m12, coefficients m13, bias m11, >> 2) and then
; combined vertically with pmaddwd against m8/m9/m10.
; On entry m1 holds the 4-byte horizontal filter loaded in .hv.
1879*c0909341SAndroid Build Coastguard Worker.hv_w4:
1880*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; srcq was already moved up two rows in .hv, so rows 0/1 are read at ssq*0/1.
1881*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
1882*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
1883*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; NOTE(review): presumably loads the destination stride from its stack
; argument slot - confirm the offset against the function prototype.
1884*c0909341SAndroid Build Coastguard Worker    mov                 dsq, [rstk+stack_offset+gprsize*2]
; On x86-32, m11/m12 alias constant-table memory operands and m13 aliases the
; fourth stack slot allocated in .hv; m13 receives the broadcast horizontal
; coefficients.
1885*c0909341SAndroid Build Coastguard Worker    %define             m11  [base+pw_34]
1886*c0909341SAndroid Build Coastguard Worker    %define             m12  [base+subpel_h_shufA]
1887*c0909341SAndroid Build Coastguard Worker    %define             m13  [rsp+mmsize*3]
1888*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q0000
1889*c0909341SAndroid Build Coastguard Worker    mova                m13, m1
1890*c0909341SAndroid Build Coastguard Worker%else
; x86inc macro; m11-m13 are real registers in this branch.
1891*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       14
1892*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+nsq*2]
1893*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+nsq*1]
1894*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m1, q0000
1895*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+subpel_h_shufA]
1896*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pw_34]
1897*c0909341SAndroid Build Coastguard Worker%endif
; m3, m4, m0, m1, m2 = source rows 0..4.
1898*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
1899*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1]
1900*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1901*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*0]
1902*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; m5/m6 are free during this prologue, so the shuffle and coefficients are
; copied into them from their memory aliases; m5 is then reloaded with the bias.
1903*c0909341SAndroid Build Coastguard Worker    mova                 m5, m12
1904*c0909341SAndroid Build Coastguard Worker    mova                 m6, m13
1905*c0909341SAndroid Build Coastguard Worker    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
1906*c0909341SAndroid Build Coastguard Worker    mova                 m5, m11
1907*c0909341SAndroid Build Coastguard Worker    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
1908*c0909341SAndroid Build Coastguard Worker%else
1909*c0909341SAndroid Build Coastguard Worker    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
1910*c0909341SAndroid Build Coastguard Worker    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
1911*c0909341SAndroid Build Coastguard Worker%endif
; phaddw completes the horizontal sums and packs two rows per register
; (4 words each): first operand's row in the low half, second's in the high.
1912*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0      ; 0 2
1913*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m1      ; 1 3
1914*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2      ; 2 4
1915*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1916*c0909341SAndroid Build Coastguard Worker    REPX     {paddw x, m5 }, m3, m4, m0
1917*c0909341SAndroid Build Coastguard Worker%else
1918*c0909341SAndroid Build Coastguard Worker    REPX     {paddw x, m11}, m3, m4, m0
1919*c0909341SAndroid Build Coastguard Worker%endif
1920*c0909341SAndroid Build Coastguard Worker    REPX     {psraw x, 2  }, m3, m4, m0
1921*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
1922*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
1923*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0  ; 12
1924*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 34
; Loop invariant on entry: m1 = 01, m2 = 12, m3 = 23, m4 = 34, and the high
; half of m0 is the most recent filtered row (row 4).
; m5 accumulates output row a, m6 output row b.
1925*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
1926*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+ssq*1] ; row 5
1927*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
1928*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*0] ; row 6
1929*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m12
1930*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
1931*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m13
1932*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m13
1933*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0
1934*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3      ; 23 becomes the next iteration's 01
1935*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m6      ; 5 6
1936*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m2  ; b0
1937*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4      ; 34 becomes the next iteration's 12
1938*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
1939*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
1940*c0909341SAndroid Build Coastguard Worker    paddw                m7, m11
1941*c0909341SAndroid Build Coastguard Worker    psraw                m7, 2
1942*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
1943*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
1944*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m7, 0x01 ; 4 5
1945*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7      ; its high half (row 6) is row 4 of the next iteration
1946*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m7  ; 45
1947*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m7      ; 56
1948*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m3 ; a2
1949*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
1950*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m4 ; b2
1951*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
1952*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
1953*c0909341SAndroid Build Coastguard Worker    psrad                m6, 10
1954*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6      ; words: a0..a3 b0..b3
1955*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5      ; saturate to bytes
1956*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5      ; output row a (4 bytes)
1957*c0909341SAndroid Build Coastguard Worker    psrlq                m5, 32
1958*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5      ; output row b (4 bytes)
1959*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
1960*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2       ; two rows written per iteration
1961*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
1962*c0909341SAndroid Build Coastguard Worker    RET
; .hv_w8: setup for combined horizontal + vertical filtering of blocks wider
; than 4 pixels (reached from .hv via jg). Selects the horizontal and vertical
; filter rows, allocates stack, builds the packed strip/height counter in r6d
; and stores six broadcast coefficient vectors at [rsp+16*0 .. 16*5]:
;   16*0..16*2 - horizontal coefficient pairs, kept as bytes (for pmaddubsw)
;   16*3..16*5 - vertical coefficient pairs, sign-extended to words
; m7 is left holding the pw_34 bias.
1963*c0909341SAndroid Build Coastguard Worker.hv_w8:
1964*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
1965*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16     ; this path takes the horizontal index from bits 16 and up
1966*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2      ; start two bytes to the left of the block
1967*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
1968*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
; Vertical index: low byte when h - 6 is negative (h < 6), else bits 16 and up.
; NOTE(review): ssd presumably carries the packed vertical selector here, as
; myd does in the x86-64 branch - confirm.
1969*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb
1970*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16
1971*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1972*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd
1973*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+ssq*8]
; r6d = h + (w << 13) - (1 << 16): h sits in the low 16 bits and, for w a
; multiple of 8, the upper bits count the 8-pixel strips still to do after the
; current one.
1974*c0909341SAndroid Build Coastguard Worker    shl                  wd, 13
1975*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssm    ; ss was used as scratch above; reload it from its memory operand
1976*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-(1<<16)]
; NOTE(review): regs_used is lowered to 5 only around ALLOC_STACK and then set
; to 7 - presumably to steer x86inc's stack bookkeeping; confirm against
; x86inc.asm.
1977*c0909341SAndroid Build Coastguard Worker%assign regs_used 5
1978*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*16
1979*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
; NOTE(review): presumably loads the destination stride from its stack
; argument slot - confirm the offset against the function prototype.
1980*c0909341SAndroid Build Coastguard Worker    mov                 dsq, [rstk+stack_offset+gprsize*2]
; Move srcq up two rows so that the first row is at [srcq+ssq*0].
1981*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
1982*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
1983*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
; With a stack alignment below 16, srcm/dstm are redirected to the last 16-byte
; slot of the area allocated above, and the current dst pointer is stored there.
1984*c0909341SAndroid Build Coastguard Worker    %define            srcm  [esp+mmsize*15+gprsize*0]
1985*c0909341SAndroid Build Coastguard Worker    %define            dstm  [esp+mmsize*15+gprsize*1]
1986*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
1987*c0909341SAndroid Build Coastguard Worker%endif
1988*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq   ; save the adjusted source pointer
1989*c0909341SAndroid Build Coastguard Worker%else
; x86inc stack allocation: 16*6 bytes for the six coefficient vectors.
; NOTE(review): the second argument is presumably the XMM register count -
; confirm against x86inc.asm.
1990*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*6, 16
1991*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-put_ssse3+subpel_filters+1+mxq*8]
; Vertical index: low byte when h - 6 is negative (h < 6), else bits 16 and up.
1992*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
1993*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
1994*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
1995*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
1996*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg-put_ssse3+subpel_filters+1+myq*8]
1997*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
1998*c0909341SAndroid Build Coastguard Worker    shl                  wd, 13
1999*c0909341SAndroid Build Coastguard Worker    neg                 nsq         ; nsq = -ssq
; Same packed counter as in the x86-32 branch.
2000*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-(1<<16)]
2001*c0909341SAndroid Build Coastguard Worker%endif
2002*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_34]
; m0 (horizontal): each byte pair is duplicated so pshufd can broadcast it.
; m1 (vertical): bytes are sign-extended to words before broadcasting.
2003*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m0
2004*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
2005*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
2006*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
2007*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m2     ; horizontal pair 0
2008*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
2009*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2     ; horizontal pair 1
2010*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q2222
2011*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0     ; horizontal pair 2
2012*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q0000
2013*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m2     ; vertical pair 0
2014*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q1111
2015*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m2     ; vertical pair 1
2016*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q2222
2017*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m1     ; vertical pair 2
; HV_H_6TAP: horizontal filter step used by the .hv_w8 code.
;   %1     - source bytes on entry; 16-bit result on exit
;   %2, %3 - temporaries (clobbered)
;   %4, %5 - byte shuffles applied to the source (default: subpel_h_shufD/F)
;   %6-%8  - the three pmaddubsw multipliers (default: [rsp+16*0..2])
; Computes %1 = (pmaddubsw(A, %6) + pmaddubsw(B, %7) + pmaddubsw(C, %8) + m7) >> 2
; where A = shuffle(%1, %4), C = shuffle(%1, %5), and B is assembled from
; dwords 1-2 of A (low half) and dwords 1-2 of C (high half).
; The caller must have the bias loaded in m7; it is not a macro parameter.
2018*c0909341SAndroid Build Coastguard Worker%macro HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
2019*c0909341SAndroid Build Coastguard Worker                     [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
2020*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, %4 ; A
2021*c0909341SAndroid Build Coastguard Worker    pshufb               %1, %5     ; C
2022*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, %2, %6 ; A * mul1
2023*c0909341SAndroid Build Coastguard Worker    shufps               %2, %1, q2121 ; B = A[1], A[2], C[1], C[2] (dwords)
2024*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, %8     ; C * mul3
2025*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, %7     ; B * mul2
2026*c0909341SAndroid Build Coastguard Worker    paddw                %3, m7     ; add the bias held in m7
2027*c0909341SAndroid Build Coastguard Worker    paddw                %1, %3
2028*c0909341SAndroid Build Coastguard Worker    paddw                %1, %2
2029*c0909341SAndroid Build Coastguard Worker    psraw                %1, 2
2030*c0909341SAndroid Build Coastguard Worker%endmacro
; ---------------------------------------------------------------------------
; .hv_w8_loop0 -- outer (column-strip) loop of the horizontal+vertical 6-tap
; path for widths handled in 8-pixel strips.
;
; Each pass filters one 8-pixel-wide strip; the inner .hv_w8_loop produces two
; output rows (8 bytes each) per iteration. r6d is a packed counter: its low
; bits hold the row count that is reloaded into hd after every strip, and
; 1<<16 is subtracted per strip until the value is no longer positive.
;
; NOTE(review): the contents of [rsp+16*0 .. 16*5] (used below as pmaddubsw /
; pmaddwd multipliers), the rounding term in m7 used by HV_H_6TAP, and the
; initial value of r6d are all set up before this label, outside the visible
; chunk -- confirm against the function prologue.
; ---------------------------------------------------------------------------
2031*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
2032*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+subpel_h_shufD]
2033*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+subpel_h_shufF]
2034*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*0]
2035*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 variant: only m0-m7 are used, so the horizontally filtered rows
    ; are kept on the stack as interleaved row pairs (slots 6-13, low/high
    ; halves) with the most recent row in slot 14.
    ; HV_H_6TAP is called with six operands here (row, 2 temps, 2 shuffles,
    ; first multiplier); its remaining operands take the macro defaults, which
    ; are declared outside this chunk.
2036*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
2037*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
2038*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2039*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m0, m5, m6, m2, m3, m4
2040*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m1, m5, m6, m2, m3, m4
2041*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0]
2042*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m1   ; 01
2043*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
2044*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 6], m6
2045*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 7], m0
2046*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m5, m0, m6, m2, m3, m4
2047*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1]
2048*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2049*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m1, m5   ; 12
2050*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5
2051*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m6
2052*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m1
2053*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m0, m1, m6, m2, m3, m4
2054*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
2055*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m0   ; 23
2056*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
2057*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m6
2058*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m5
2059*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m1, m5, m6, m2, m3, m4
    ; Slot 14 keeps the newest filtered row so the loop can pair it with the
    ; next row it reads.
2060*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m1
2061*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m1   ; 34
2062*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
2063*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m6
2064*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m0
    ; Row loop (x86-32): "a" accumulates the first output row, "b" the second;
    ; the primed names are the high halves. Stack slots 3, 4 and 5 supply the
    ; pmaddwd multipliers for the three row-pair stages. While computing, the
    ; row-pair window is slid down by two rows: slots 10-13 are copied into
    ; slots 6-9 and then refilled with the new pairs 45 / 56.
2065*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2066*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16* 3]
2067*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3, [rsp+16* 6] ; a0
2068*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
2069*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3, [rsp+16* 8] ; b0
2070*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [rsp+16* 9]     ; b0'
2071*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16* 4]
2072*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*10]
2073*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*11]
2074*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 6], m4
2075*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6       ; a1
2076*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 7], m5
2077*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6       ; a1'
2078*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
2079*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*12]
2080*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
2081*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*13]
2082*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m4
2083*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6       ; b1
2084*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m5
2085*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6       ; b1'
2086*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1]
2087*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2088*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
2089*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
    ; Three-operand HV_H_6TAP calls rely entirely on the macro's default
    ; shuffle/multiplier operands (declared outside this chunk).
2090*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m6, m4, m5
2091*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*14]
2092*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6   ; 45
2093*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6
2094*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m4
2095*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m5
2096*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [rsp+16*5] ; a2
2097*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [rsp+16*5] ; a2'
2098*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
2099*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
2100*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
    ; First output row: shift the 32-bit sums right by 10 and pack to words.
2101*c0909341SAndroid Build Coastguard Worker    psrad                m0, 10
2102*c0909341SAndroid Build Coastguard Worker    psrad                m2, 10
2103*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
2104*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m4, m2, m5
2105*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*5]
2106*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m4   ; 56
2107*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m4
2108*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m4
2109*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m5
2110*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2       ; b2
2111*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m6
2112*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2       ; b2'
2113*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2114*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
2115*c0909341SAndroid Build Coastguard Worker    psrad                m1, 10
2116*c0909341SAndroid Build Coastguard Worker    psrad                m3, 10
2117*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
    ; Saturate both rows to unsigned bytes: row 0 ends up in the low 8 bytes,
    ; row 1 in the high 8 bytes.
2118*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2119*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
2120*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
2121*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2122*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2123*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
    ; Strip done (x86-32): srcq/dstq were advanced by the row loop, so reload
    ; the saved pointers from srcm/dstm, step 8 pixels to the right and write
    ; them back for the next strip. hd is restored from the low 16 bits of r6.
2124*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
2125*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
2126*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w
2127*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2128*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2129*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
2130*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
2131*c0909341SAndroid Build Coastguard Worker%else
    ; x86-64 variant: everything stays in registers. r4 walks the source rows
    ; and r7 the destination rows, so srcq/dstq keep pointing at the top of
    ; the current strip. m8-m15 hold the interleaved row pairs 01/12/23/34
    ; (low/high halves) and m6 the newest filtered row.
    ; NOTE(review): nsq is defined outside this chunk -- presumably the negated
    ; source stride, making these the two rows above srcq; confirm.
2132*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+nsq*2]
2133*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+nsq*1]
2134*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ssq*2]
2135*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ssq*0]
2136*c0909341SAndroid Build Coastguard Worker    movu                m15, [srcq+ssq*1]
2137*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
2138*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r4  +ssq*0]
    ; m4/m5/m8 = stack slots 0/1/2, passed as the three pmaddubsw multipliers
    ; of HV_H_6TAP (m8 is free again once the five rows are filtered).
2139*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*1]
2140*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+16*2]
2141*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m9, m0, m1, m2, m3, m4, m5, m8
2142*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m11, m0, m1, m2, m3, m4, m5, m8
2143*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m13, m0, m1, m2, m3, m4, m5, m8
2144*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m0, m1, m2, m3, m4, m5, m8
2145*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m6, m0, m1, m2, m3, m4, m5, m8
2146*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
2147*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11
2148*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
2149*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13
2150*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
2151*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
2152*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m6  ; 34
2153*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m6
    ; Row loop (x86-64): same a/b accumulation as the x86-32 loop. The register
    ; copies m8-m11 <- m12-m15 slide the row-pair window down by two rows
    ; before m12-m15 are overwritten with the products and then the new pairs.
2154*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
2155*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*3]
2156*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*4]
2157*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3  ; a0
2158*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12
2159*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3  ; a0'
2160*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
2161*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3 ; b0
2162*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
2163*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11     ; b0'
2164*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
2165*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
2166*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
2167*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
2168*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
2169*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
2170*c0909341SAndroid Build Coastguard Worker    movu                m15, [r4+ssq*1]
2171*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2]
    ; m4/m5 are scratch for the macro here; m4 is reloaded from slot 5 below.
2172*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP           m15, m4, m5
2173*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m6, m15
2174*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m6, m15
2175*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r4+ssq*0]
2176*c0909341SAndroid Build Coastguard Worker    HV_H_6TAP            m6, m4, m5
2177*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*5]
2178*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m6
2179*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m6
2180*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12, m4  ; a2
2181*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
2182*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13, m4  ; a2'
2183*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
2184*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m14, m4  ; b2
2185*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
2186*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m15      ; b2'
2187*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
2188*c0909341SAndroid Build Coastguard Worker    REPX      {psrad x, 10}, m0, m2, m1, m3
2189*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2
2190*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3
2191*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2192*c0909341SAndroid Build Coastguard Worker    movq         [r7+dsq*0], m0
2193*c0909341SAndroid Build Coastguard Worker    movhps       [r7+dsq*1], m0
2194*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2195*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2196*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
    ; Strip done (x86-64): step the strip base pointers 8 pixels to the right
    ; and restore hd from the low byte of r6.
2197*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
2198*c0909341SAndroid Build Coastguard Worker    add                dstq, 8
2199*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
2200*c0909341SAndroid Build Coastguard Worker%endif
    ; One strip consumed: the strip counter sits above bit 16 of r6d.
2201*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
2202*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
2203*c0909341SAndroid Build Coastguard Worker    RET
2204*c0909341SAndroid Build Coastguard Worker
; Filter-combination entry stubs for the 8-tap put function. Every pairing
; listed here involves the SHARP filter in at least one direction.
; NOTE(review): PUT_8TAP_FN is defined outside this chunk. Presumably each
; invocation emits an entry point named after the first argument that loads
; the horizontal/vertical filter selectors into t0d/t1d (consumed at the top
; of put_8tap_8bpc) and transfers control to the function named by the fourth
; argument; the final invocation omits that argument, which suggests it falls
; straight through into the cglobal below -- confirm against the macro.
2205*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_8bpc
2206*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_8bpc
2207*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_8bpc
2208*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_8bpc
2209*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_FN sharp,          SHARP,   SHARP
2210*c0909341SAndroid Build Coastguard Worker
; ---------------------------------------------------------------------------
; put_8tap_8bpc -- entry and dispatch for the 8-bit 8-tap "put" filters.
;
; Named arguments: dst, ds (dst stride), src, ss (src stride), w, h, mx, my;
; ss3 is an extra scratch register name. The cglobal line asks x86inc to load
; only the first argument into a register, so the others are fetched from
; their argument slots (wm, hm, mxm, mym, srcm, ssmp, dsmp) as needed.
;
; mx and my are multiplied by 0x010101 (copying the value into three bytes)
; and added to t0d/t1d, giving one register per direction that carries, from
; high to low, the 8-tap selector, the raw subpel position and the 4-tap
; selector (see the inline comments). Bits 8-11 (mask 0xf00) are then tested
; to decide whether horizontal (.h) and/or vertical (.v) filtering is needed.
;
; NOTE(review): t0d/t1d are set before entry, presumably by the PUT_8TAP_FN
; stubs; the 0xf00 test only reflects mx/my if their middle byte is zero.
; ---------------------------------------------------------------------------
2211*c0909341SAndroid Build Coastguard Workercglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
2212*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
2213*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
2214*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2215*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
2216*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
2217*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: the packed vertical selector is kept in ssd instead of myd, so
    ; the source stride has to be reloaded later wherever it is needed.
2218*c0909341SAndroid Build Coastguard Worker    imul                ssd, mym, 0x010101
2219*c0909341SAndroid Build Coastguard Worker    add                 ssd, t1d ; 8tap_v, my, 4tap_v
2220*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
2221*c0909341SAndroid Build Coastguard Worker%endif
2222*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
2223*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
2224*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, put_ssse3
    ; Non-zero horizontal subpel position -> horizontal (or h+v) filtering.
2225*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
2226*c0909341SAndroid Build Coastguard Worker    jnz .h
2227*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2228*c0909341SAndroid Build Coastguard Worker    test                ssd, 0xf00
2229*c0909341SAndroid Build Coastguard Worker%else
2230*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2231*c0909341SAndroid Build Coastguard Worker%endif
    ; Non-zero vertical subpel position only -> vertical filtering.
2232*c0909341SAndroid Build Coastguard Worker    jnz .v
    ; Neither direction needs filtering: index the "put" jump table with
    ; log2(w) (tzcnt), add the table base and tail-jump to that routine.
    ; NOTE(review): presumably these are the unfiltered copy routines of the
    ; plain put function, which is defined elsewhere in the file.
2233*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
2234*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base_reg+wq*2+table_offset(put,)]
2235*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
2236*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
2237*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
2238*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; NOTE(review): presumably undoes one prologue push so the stack matches
    ; what the jump target's epilogue expects -- confirm against that
    ; function's cglobal register count.
2239*c0909341SAndroid Build Coastguard Worker    pop                  r8
2240*c0909341SAndroid Build Coastguard Worker%endif
    ; r6 = 3 * source stride, prepared before the jump (presumably an input
    ; the target routines expect).
2241*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2242*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h_w2 -- horizontal-only filtering, 2 pixels wide, 4-tap filter.
; Reached only from .h_w4 (w < 4), which has already moved srcq one pixel to
; the left and broadcast the four filter taps into m4; m5 holds pw_34 (loaded
; in .h). Two rows are produced per loop iteration.
2243*c0909341SAndroid Build Coastguard Worker.h_w2:
2244*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+subpel_h_shuf4]
2245*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
2246*c0909341SAndroid Build Coastguard Worker.h_w2_loop:
    ; Row 0 goes into the low 8 bytes of m0, row 1 into the high 8 bytes.
2247*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
2248*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*1]
2249*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
    ; Shuffle the source bytes into tap order, multiply by the taps (pairwise
    ; sums), then add adjacent words to complete each 4-tap sum.
2250*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3
2251*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4
2252*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m0
2253*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5 ; pw34
2254*c0909341SAndroid Build Coastguard Worker    psraw                m0, 6
2255*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
    ; The low dword now holds 2 bytes of row 0 followed by 2 bytes of row 1;
    ; store them 16 bits at a time through r6.
2256*c0909341SAndroid Build Coastguard Worker    movd                r6d, m0
2257*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w
2258*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
2259*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w
2260*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2261*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2262*c0909341SAndroid Build Coastguard Worker    jg .h_w2_loop
2263*c0909341SAndroid Build Coastguard Worker    RET
; .h_w4 -- horizontal-only filtering for w <= 4 using a 4-tap filter.
; Entered from .h with m5 = pw_34 and ssq loaded. The low byte of mxd (the
; 4-tap selector, see the function entry) indexes the 8-byte filter entries;
; only bytes 2..5 of the entry are loaded (the "+2" and the 4-byte movd) and
; broadcast to all dwords of m4. srcq is moved one pixel to the left to match.
; Widths below 4 branch to .h_w2, which reuses srcq, m4 and m5 as set up here.
2264*c0909341SAndroid Build Coastguard Worker.h_w4:
2265*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2266*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
2267*c0909341SAndroid Build Coastguard Worker%else
2268*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
2269*c0909341SAndroid Build Coastguard Worker%endif
2270*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base_reg+mxq*8+subpel_filters-put_ssse3+2]
2271*c0909341SAndroid Build Coastguard Worker    dec                srcq
2272*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0000
2273*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2274*c0909341SAndroid Build Coastguard Worker    jl .h_w2
2275*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+subpel_h_shufA]
2276*c0909341SAndroid Build Coastguard Worker    movifnidn           dsq, dsmp
    ; Two rows per iteration, 4 output pixels each.
2277*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
2278*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0] ; 1
2279*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1] ; 2
2280*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2281*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m3 ; subpel_h_shufA
2282*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m3 ; subpel_h_shufA
2283*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m4 ; subpel_filters
2284*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m4 ; subpel_filters
    ; Horizontal add completes the 4-tap sums: row 0 lands in the low four
    ; words of m0, row 1 in the high four.
2285*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
2286*c0909341SAndroid Build Coastguard Worker    paddw                m0, m5 ; pw34
2287*c0909341SAndroid Build Coastguard Worker    psraw                m0, 6
2288*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
    ; After packing, bytes 0-3 are row 0 and bytes 4-7 are row 1; the 32-bit
    ; right shift moves row 1 down so both rows can be stored with movd.
2289*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
2290*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
2291*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m0
2292*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2293*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2294*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
2295*c0909341SAndroid Build Coastguard Worker    RET
; .h -- horizontal filtering is required. If the vertical subpel position is
; also non-zero, control goes to the 2D path (.hv); otherwise this is the
; horizontal-only setup:
;   m5    = pw_34, the rounding term added before the final >> 6
;   w <= 4 -> .h_w4 (4-tap path)
;   m6/m7 = first / last four of the eight filter bytes, each broadcast to
;           all dwords
;   srcq -= 3 so the 8-tap window starts three pixels left of the output
;   wd   -= 16; w >= 16 branches to .h_w16 (which uses wq = w - 16), smaller
;           widths fall through to .h_w8 (the macro definition in between
;           emits no code by itself).
2296*c0909341SAndroid Build Coastguard Worker.h:
2297*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2298*c0909341SAndroid Build Coastguard Worker    test                ssd, 0xf00
2299*c0909341SAndroid Build Coastguard Worker%else
2300*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
2301*c0909341SAndroid Build Coastguard Worker%endif
2302*c0909341SAndroid Build Coastguard Worker    jnz .hv
2303*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
2304*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_34] ; 2 + (8 << 2)
2305*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2306*c0909341SAndroid Build Coastguard Worker    jle .h_w4
2307*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
2308*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; With 16 XMM registers available the three shuffle masks used by
    ; PUT_8TAP_H are kept resident in m9-m11.
2309*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+subpel_h_shufA]
2310*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+subpel_h_shufB]
2311*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufC]
2312*c0909341SAndroid Build Coastguard Worker%endif
    ; The 8-tap selector sits above bit 16 of the packed mx value.
2313*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
2314*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
2315*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg+mxq*8+subpel_filters-put_ssse3]
2316*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q0000
2317*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q1111
2318*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
2319*c0909341SAndroid Build Coastguard Worker    jge .h_w16
; PUT_8TAP_H -- horizontal 8-tap filter over one 16-byte source vector.
;   %1      in: 16 source bytes; out: 8 words, each (sum + pw_34) >> 6
;   %2-%4   scratch registers, clobbered
; Expects m6 = filter bytes 0-3 and m7 = filter bytes 4-7 (each broadcast) and
; m5 = pw_34, as set up in .h. On x86-64 the three shuffle masks are read from
; m10/m11/m9 (also loaded in .h); on x86-32 they are used as memory operands.
; The source is shuffled three ways (A, B, C); A and B are multiplied by the
; first coefficient half and B and C by the second, and the partial sums
; A0+B4 and C4+B0 are combined with a horizontal add. The result is still in
; words, so callers pack to bytes themselves.
; NOTE(review): the exact byte layouts of subpel_h_shufA/B/C are defined
; elsewhere in the file.
2320*c0909341SAndroid Build Coastguard Worker%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
2321*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
2322*c0909341SAndroid Build Coastguard Worker    pshufb              %2, %1, [base+subpel_h_shufB]
2323*c0909341SAndroid Build Coastguard Worker    pshufb              %3, %1, [base+subpel_h_shufC]
2324*c0909341SAndroid Build Coastguard Worker    pshufb              %1,     [base+subpel_h_shufA]
2325*c0909341SAndroid Build Coastguard Worker %else
2326*c0909341SAndroid Build Coastguard Worker    pshufb              %2, %1, m11; subpel_h_shufB
2327*c0909341SAndroid Build Coastguard Worker    pshufb              %3, %1, m9 ; subpel_h_shufC
2328*c0909341SAndroid Build Coastguard Worker    pshufb              %1, m10    ; subpel_h_shufA
2329*c0909341SAndroid Build Coastguard Worker %endif
2330*c0909341SAndroid Build Coastguard Worker    pmaddubsw           %4, %2, m6 ; subpel +0 B0
2331*c0909341SAndroid Build Coastguard Worker    pmaddubsw           %2, m7     ; subpel +4 B4
2332*c0909341SAndroid Build Coastguard Worker    pmaddubsw           %3, m7     ; C4
2333*c0909341SAndroid Build Coastguard Worker    pmaddubsw           %1, m6     ; A0
2334*c0909341SAndroid Build Coastguard Worker    paddw               %3, %4     ; C4+B0
2335*c0909341SAndroid Build Coastguard Worker    paddw               %1, %2     ; A0+B4
2336*c0909341SAndroid Build Coastguard Worker    phaddw              %1, %3
2337*c0909341SAndroid Build Coastguard Worker    paddw               %1, m5     ; pw34
2338*c0909341SAndroid Build Coastguard Worker    psraw               %1, 6
2339*c0909341SAndroid Build Coastguard Worker%endmacro
; .h_w8 -- horizontal-only 8-tap filtering, 8 pixels wide.
; Reached by falling through from .h when w - 16 is negative (w <= 4 was
; already routed to .h_w4). Two rows are filtered per iteration and packed
; into one register: row 0 in the low 8 bytes, row 1 in the high 8 bytes.
2340*c0909341SAndroid Build Coastguard Worker.h_w8:
2341*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
2342*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
2343*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2344*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H           m0, m2, m3, m4
2345*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H           m1, m2, m3, m4
2346*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
2347*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the destination stride is not held in a register on this path,
    ; so dstq is advanced row by row with the stride read from dsm.
2348*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m0
2349*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
2350*c0909341SAndroid Build Coastguard Worker    movhps           [dstq], m0
2351*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
2352*c0909341SAndroid Build Coastguard Worker%else
2353*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
2354*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
2355*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2356*c0909341SAndroid Build Coastguard Worker%endif
2357*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
2358*c0909341SAndroid Build Coastguard Worker    jg .h_w8
2359*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16 -- horizontal-only 8-tap filtering for w >= 16, one row at a time.
; On entry wq = w - 16 (from the "sub wd, 16" in .h). srcq and dstq are moved
; forward by that amount and wq is negated, so the inner loop can use r6 as a
; negative offset that counts up to 0 in steps of 16; "jle" keeps looping
; while the incremented offset is still <= 0, which makes the chunk at offset
; 0 the last one processed.
2360*c0909341SAndroid Build Coastguard Worker.h_w16:
2361*c0909341SAndroid Build Coastguard Worker    add                srcq, wq
2362*c0909341SAndroid Build Coastguard Worker    add                dstq, wq
2363*c0909341SAndroid Build Coastguard Worker    neg                  wq
2364*c0909341SAndroid Build Coastguard Worker.h_w16_loop_v:
2365*c0909341SAndroid Build Coastguard Worker    mov                  r6, wq
2366*c0909341SAndroid Build Coastguard Worker.h_w16_loop_h:
    ; Two overlapping 16-byte loads, 8 bytes apart: each feeds PUT_8TAP_H and
    ; yields 8 output pixels, giving 16 pixels per iteration.
2367*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r6+8*0]
2368*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r6+8*1]
2369*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H           m0, m2, m3, m4
2370*c0909341SAndroid Build Coastguard Worker    PUT_8TAP_H           m1, m2, m3, m4
2371*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
    ; Aligned store: relies on dstq+r6 being 16-byte aligned.
2372*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r6], m0
2373*c0909341SAndroid Build Coastguard Worker    add                  r6, 16
2374*c0909341SAndroid Build Coastguard Worker    jle .h_w16_loop_h
    ; Next row: the destination stride is taken from its argument slot (dsmp)
    ; rather than a register.
2375*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2376*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
2377*c0909341SAndroid Build Coastguard Worker    dec                  hd
2378*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop_v
2379*c0909341SAndroid Build Coastguard Worker    RET
; .v -- setup for vertical-only filtering.
; The packed vertical selector (myd on x86-64, ssd on x86-32; see the function
; entry) is split into its low byte (4-tap selector) and the part above bit 16
; (8-tap selector). "cmp hd, 6 / cmovs" picks the 4-tap selector when h < 6,
; otherwise the 8-tap one. The chosen 8-byte filter entry is loaded and
; punpcklwd duplicates each 16-bit coefficient pair, so the four pshufd
; broadcasts below yield subpel0..subpel3 = tap pairs (0,1), (2,3), (4,5),
; (6,7). m7 = pw_512.
; NOTE(review): pw_512 is presumably the pmulhrsw rounding multiplier used by
; the row loops, most of which are outside this chunk -- confirm.
; srcq is moved up by three rows so the filter window starts at row -3.
2380*c0909341SAndroid Build Coastguard Worker.v:
2381*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2382*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb
2383*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16
2384*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2385*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd
2386*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3]
2387*c0909341SAndroid Build Coastguard Worker%else
2388*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
2389*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
2390*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
2391*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2392*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
2393*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3]
2394*c0909341SAndroid Build Coastguard Worker%endif
2395*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m0
2396*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_512]
2397*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: the four coefficient vectors live in a 64-byte stack area.
2398*c0909341SAndroid Build Coastguard Worker %define            subpel0  [rsp+mmsize*0]
2399*c0909341SAndroid Build Coastguard Worker %define            subpel1  [rsp+mmsize*1]
2400*c0909341SAndroid Build Coastguard Worker %define            subpel2  [rsp+mmsize*2]
2401*c0909341SAndroid Build Coastguard Worker %define            subpel3  [rsp+mmsize*3]
2402*c0909341SAndroid Build Coastguard Worker%assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed
2403*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK       -16*4
2404*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
2405*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q0000
2406*c0909341SAndroid Build Coastguard Worker    mova            subpel0, m1
2407*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1111
2408*c0909341SAndroid Build Coastguard Worker    mova            subpel1, m1
2409*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q2222
2410*c0909341SAndroid Build Coastguard Worker    mova            subpel2, m1
2411*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q3333
2412*c0909341SAndroid Build Coastguard Worker    mova            subpel3, m1
    ; ssd was used for the filter selector above, so the source stride is
    ; re-read from its stack argument slot: once to form 3*ss for the srcq
    ; adjustment and once more to leave the plain stride in ssq. dsq is
    ; loaded from its argument slot as well.
2413*c0909341SAndroid Build Coastguard Worker    mov                 ssq, [rstk+stack_offset+gprsize*4]
2414*c0909341SAndroid Build Coastguard Worker    lea                 ssq, [ssq*3]
2415*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
2416*c0909341SAndroid Build Coastguard Worker    mov                 ssq, [rstk+stack_offset+gprsize*4]
2417*c0909341SAndroid Build Coastguard Worker    mov                 dsq, [rstk+stack_offset+gprsize*2]
    ; x86-32 dispatch: w == 2 falls through to .v_w2, anything else -> .v_w4.
2418*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 2
2419*c0909341SAndroid Build Coastguard Worker    jne .v_w4
2420*c0909341SAndroid Build Coastguard Worker%else
    ; x86-64: the coefficient vectors stay in m8-m11 and ss3q = 3 * ss.
2421*c0909341SAndroid Build Coastguard Worker %define            subpel0  m8
2422*c0909341SAndroid Build Coastguard Worker %define            subpel1  m9
2423*c0909341SAndroid Build Coastguard Worker %define            subpel2  m10
2424*c0909341SAndroid Build Coastguard Worker %define            subpel3  m11
2425*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2426*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
2427*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
2428*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
2429*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
2430*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
    ; x86-64 dispatch: w == 4 -> .v_w4, w > 4 -> .v_w8, else fall through to
    ; .v_w2.
2431*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2432*c0909341SAndroid Build Coastguard Worker    je .v_w4
2433*c0909341SAndroid Build Coastguard Worker    jg .v_w8
2434*c0909341SAndroid Build Coastguard Worker%endif
2435*c0909341SAndroid Build Coastguard Worker.v_w2: ; vertical filter path that writes each output row with one 16-bit store
2436*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*0] ; row 0 (4 bytes are loaded per row)
2437*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*1] ; row 1
2438*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2439*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; this branch advances two rows at a time and does not use ss3q
2440*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0] ; row 2
2441*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+ssq*1] ; row 3
2442*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2443*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*0] ; row 4
2444*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1] ; row 5
2445*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; srcq now points at row 6
2446*c0909341SAndroid Build Coastguard Worker%else
2447*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*2] ; row 2
2448*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; advance three rows
2449*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+ssq*0] ; row 3
2450*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1] ; row 4
2451*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*2] ; row 5
2452*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; srcq now points at row 6
2453*c0909341SAndroid Build Coastguard Worker%endif
2454*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m0           ; 0 1
2455*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2           ; 1 2
2456*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0           ; 01 12
2457*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 6
2458*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5           ; 2 3
2459*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m3           ; 3 4
2460*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4           ; 4 5
2461*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0           ; 5 6
2462*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5           ; 23 34
2463*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4           ; 45 56
2464*c0909341SAndroid Build Coastguard Worker.v_w2_loop: ; each iteration produces two output rows (a and b)
2465*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1] ; row 7
2466*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance two rows
2467*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, subpel0     ; a0 b0
2468*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; slide window: 23 34 becomes the next iteration's 01 12
2469*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, subpel1         ; a1 b1
2470*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
2471*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3 ; slide window: 45 56 becomes the next iteration's 23 34
2472*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpel2         ; a2 b2
2473*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
2474*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m4          ; 6 7
2475*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 8; stays in m0 as the next iteration's row 6
2476*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0              ; 7 8
2477*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4              ; 67 78
2478*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, subpel3     ; a3 b3
2479*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4
2480*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7 ; m7 is set up before this block; presumably the rounding constant - TODO confirm
2481*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5 ; saturate words to unsigned bytes
2482*c0909341SAndroid Build Coastguard Worker    movd                r6d, m5 ; bytes 0-1 belong to row a, bytes 2-3 to row b
2483*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r6w ; store row a
2484*c0909341SAndroid Build Coastguard Worker    shr                 r6d, 16
2485*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r6w ; store row b
2486*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2487*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2488*c0909341SAndroid Build Coastguard Worker    jg .v_w2_loop
2489*c0909341SAndroid Build Coastguard Worker    RET
2490*c0909341SAndroid Build Coastguard Worker.v_w4: ; vertical filter on 4-byte-wide columns; on x86-32 an outer loop walks the columns
2491*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2492*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14 ; w*2^14 = (w/4) << 16
2493*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
2494*c0909341SAndroid Build Coastguard Worker %define               dstm [rsp+mmsize*4+gprsize]
2495*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq ; keep the column's destination base in memory
2496*c0909341SAndroid Build Coastguard Worker%endif
2497*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-(1<<16)] ; packed counter: low 16 bits = h, upper bits = w/4 - 1 (for w a multiple of 4)
2498*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq ; r4 remembers the top of the current source column
2499*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
2500*c0909341SAndroid Build Coastguard Worker%endif
2501*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*0] ; row 0
2502*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*1] ; row 1
2503*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2504*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2505*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0] ; row 2
2506*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+ssq*1] ; row 3
2507*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2508*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*0] ; row 4
2509*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1] ; row 5
2510*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; srcq now points at row 6
2511*c0909341SAndroid Build Coastguard Worker%else
2512*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*2] ; row 2
2513*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; advance three rows
2514*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+ssq*0] ; row 3
2515*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1] ; row 4
2516*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*2] ; row 5
2517*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q ; srcq now points at row 6
2518*c0909341SAndroid Build Coastguard Worker%endif
2519*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m0           ; 0 1
2520*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2           ; 1 2
2521*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0           ; 01 12
2522*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 6
2523*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m5           ; 2 3
2524*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m3           ; 3 4
2525*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m4           ; 4 5
2526*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0           ; 5 6
2527*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m5           ; 23 34
2528*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4           ; 45 56
2529*c0909341SAndroid Build Coastguard Worker.v_w4_loop: ; each iteration produces two output rows (a and b)
2530*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1] ; row 7
2531*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; advance two rows
2532*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m1, subpel0  ; a0 b0
2533*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2 ; slide window: 23 34 becomes the next iteration's 01 12
2534*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, subpel1      ; a1 b1
2535*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
2536*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3 ; slide window: 45 56 becomes the next iteration's 23 34
2537*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpel2      ; a2 b2
2538*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
2539*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0, m4       ; 6 7 _ _
2540*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0] ; row 8; stays in m0 as the next iteration's row 6
2541*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0           ; 7 8 _ _
2542*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4           ; 67 78
2543*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m3, subpel3  ; a3 b3
2544*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4
2545*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7 ; m7 is set up before this block; presumably the rounding constant - TODO confirm
2546*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5 ; saturate words to unsigned bytes: bytes 0-3 row a, bytes 4-7 row b
2547*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5 ; store row a
2548*c0909341SAndroid Build Coastguard Worker    psrlq                m5, 32 ; bring row b down into the low dword
2549*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5 ; store row b
2550*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2551*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2552*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
2553*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2554*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm ; reload the destination base of the column just finished
2555*c0909341SAndroid Build Coastguard Worker    add                  r4, 4 ; next source column (4 bytes to the right)
2556*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w ; restore h from the low 16 bits of the packed counter
2557*c0909341SAndroid Build Coastguard Worker    add                dstq, 4 ; next destination column
2558*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
2559*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq ; remember the new destination base
2560*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16 ; one column done; result goes non-positive after the last column
2561*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
2562*c0909341SAndroid Build Coastguard Worker%endif
2563*c0909341SAndroid Build Coastguard Worker    RET
2564*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
2565*c0909341SAndroid Build Coastguard Worker.v_w8: ; x86-64 only: vertical filter processed in 8-byte-wide columns
2566*c0909341SAndroid Build Coastguard Worker    shl                  wd, 5 ; w*32 = (w/8) << 8
2567*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-256] ; packed counter: low byte = h, upper bits = w/8 - 1 (for w a multiple of 8 and h < 256)
2568*c0909341SAndroid Build Coastguard Worker.v_w8_loop0: ; per-column setup: load rows 0..6 of the current column
2569*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0] ; row 0
2570*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*1] ; row 1
2571*c0909341SAndroid Build Coastguard Worker    lea                  r4, [srcq+ss3q] ; r4 walks down the column; srcq stays at the column top
2572*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*2] ; row 2
2573*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r4  +ssq*0] ; row 3
2574*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq ; r7 walks down the destination column; dstq stays at the column top
2575*c0909341SAndroid Build Coastguard Worker    movq                 m5, [r4  +ssq*1] ; row 4
2576*c0909341SAndroid Build Coastguard Worker    movq                 m6, [r4  +ssq*2] ; row 5
2577*c0909341SAndroid Build Coastguard Worker    add                  r4, ss3q ; r4 now points at row 6
2578*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r4  +ssq*0] ; row 6
2579*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2 ; 01
2580*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3 ; 12
2581*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4 ; 23
2582*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5 ; 34
2583*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6 ; 45
2584*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0 ; 56
2585*c0909341SAndroid Build Coastguard Worker.v_w8_loop: ; each iteration produces two output rows: a in m14, b in m15
2586*c0909341SAndroid Build Coastguard Worker    movq                m13, [r4+ssq*1] ; row 7
2587*c0909341SAndroid Build Coastguard Worker    lea                  r4, [r4+ssq*2] ; advance two rows
2588*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m1, subpel0 ; a0
2589*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3 ; slide window: 23 becomes the next iteration's 01
2590*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m2, subpel0 ; b0
2591*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4 ; slide window: 34 becomes the next iteration's 12
2592*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpel1 ; a1
2593*c0909341SAndroid Build Coastguard Worker    mova                m12, m0 ; copy row 6 before m0 is reloaded
2594*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, subpel1 ; b1
2595*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r4+ssq*0] ; row 8; stays in m0 as the next iteration's row 6
2596*c0909341SAndroid Build Coastguard Worker    paddw               m14, m3
2597*c0909341SAndroid Build Coastguard Worker    paddw               m15, m4
2598*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5 ; slide window: 45 becomes the next iteration's 23
2599*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, subpel2 ; a2
2600*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6 ; slide window: 56 becomes the next iteration's 34
2601*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, subpel2 ; b2
2602*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m13     ; 67
2603*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m0      ; 78
2604*c0909341SAndroid Build Coastguard Worker    paddw               m14, m5
2605*c0909341SAndroid Build Coastguard Worker    mova                 m5, m12 ; slide window: 67 becomes the next iteration's 45
2606*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, subpel3 ; a3
2607*c0909341SAndroid Build Coastguard Worker    paddw               m15, m6
2608*c0909341SAndroid Build Coastguard Worker    mova                 m6, m13 ; slide window: 78 becomes the next iteration's 56
2609*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, subpel3 ; b3
2610*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
2611*c0909341SAndroid Build Coastguard Worker    paddw               m15, m13
2612*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7 ; m7 is set up before this block; presumably the rounding constant - TODO confirm
2613*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m7
2614*c0909341SAndroid Build Coastguard Worker    packuswb            m14, m15 ; saturate to bytes: low qword = row a, high qword = row b
2615*c0909341SAndroid Build Coastguard Worker    movq         [r7+dsq*0], m14 ; store row a
2616*c0909341SAndroid Build Coastguard Worker    movhps       [r7+dsq*1], m14 ; store row b
2617*c0909341SAndroid Build Coastguard Worker    lea                  r7, [r7+dsq*2]
2618*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2619*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
2620*c0909341SAndroid Build Coastguard Worker    add                srcq, 8 ; next source column
2621*c0909341SAndroid Build Coastguard Worker    add                dstq, 8 ; next destination column
2622*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b ; restore h from the low byte of the packed counter
2623*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8 ; one column done; result goes non-positive after the last column
2624*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
2625*c0909341SAndroid Build Coastguard Worker    RET
2626*c0909341SAndroid Build Coastguard Worker%endif ;ARCH_X86_64
2627*c0909341SAndroid Build Coastguard Worker%undef subpel0
2628*c0909341SAndroid Build Coastguard Worker%undef subpel1
2629*c0909341SAndroid Build Coastguard Worker%undef subpel2
2630*c0909341SAndroid Build Coastguard Worker%undef subpel3
2631*c0909341SAndroid Build Coastguard Worker.hv: ; setup for the combined horizontal + vertical filter; w <= 4 is prepared here
2632*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE ; macro defined outside this view; NOTE(review): presumably resets the assembler's stack bookkeeping - confirm
2633*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2634*c0909341SAndroid Build Coastguard Worker    jg .hv_w8 ; widths above 4 are handled at .hv_w8
2635*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2636*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f ; keep only the low 7 bits of mx as the table index
2637*c0909341SAndroid Build Coastguard Worker%else
2638*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb ; keep only the low byte of mx as the table index
2639*c0909341SAndroid Build Coastguard Worker%endif
2640*c0909341SAndroid Build Coastguard Worker    dec                srcq ; start one byte to the left
2641*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] ; 4 bytes at offset 2 of the 8-byte table entry: the middle four taps
2642*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2643*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb ; low byte of the ss register = one candidate index; NOTE(review): presumably ss holds my here - confirm against the prologue
2644*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16 ; upper 16 bits = the other candidate index
2645*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2646*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd ; h < 6: use the low-byte index instead
2647*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+ssq*8+subpel_filters-put_ssse3] ; all 8 vertical taps (bytes)
2648*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssmp ; reload ssq from its memory operand
2649*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
2650*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6 ; back up three rows
2651*c0909341SAndroid Build Coastguard Worker %define           base_reg  r6
2652*c0909341SAndroid Build Coastguard Worker    mov                  r6, r1; use as new base
2653*c0909341SAndroid Build Coastguard Worker %assign regs_used 2
2654*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*14 ; macro call requesting 14 vector-sized stack slots (sign convention is defined by the macro)
2655*c0909341SAndroid Build Coastguard Worker %assign regs_used 7
2656*c0909341SAndroid Build Coastguard Worker    mov                 dsq, [rstk+stack_offset+gprsize*2] ; reload dsq from the stack; NOTE(review): presumably the ds argument slot - confirm
2657*c0909341SAndroid Build Coastguard Worker %define           subpelv0  [rsp+mmsize*0]
2658*c0909341SAndroid Build Coastguard Worker %define           subpelv1  [rsp+mmsize*1]
2659*c0909341SAndroid Build Coastguard Worker %define           subpelv2  [rsp+mmsize*2]
2660*c0909341SAndroid Build Coastguard Worker %define           subpelv3  [rsp+mmsize*3]
2661*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0 ; duplicate each tap byte into both halves of a word
2662*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2663*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q0000 ; broadcast tap pair 0/1
2664*c0909341SAndroid Build Coastguard Worker    mova           subpelv0, m6 ; x86-32 keeps the vertical tap pairs in stack slots
2665*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q1111 ; broadcast tap pair 2/3
2666*c0909341SAndroid Build Coastguard Worker    mova           subpelv1, m6
2667*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q2222 ; broadcast tap pair 4/5
2668*c0909341SAndroid Build Coastguard Worker    mova           subpelv2, m6
2669*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q3333 ; broadcast tap pair 6/7
2670*c0909341SAndroid Build Coastguard Worker    mova           subpelv3, m6
2671*c0909341SAndroid Build Coastguard Worker%else
2672*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; low byte of my = one candidate index
2673*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; upper 16 bits = the other candidate index
2674*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2675*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the low-byte index instead
2676*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+myq*8+subpel_filters-put_ssse3] ; all 8 vertical taps (bytes)
2677*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   mmsize*14, 14 ; macro call requesting 14 vector-sized stack slots; second argument is 14
2678*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
2679*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q ; back up three rows
2680*c0909341SAndroid Build Coastguard Worker %define           subpelv0  m10
2681*c0909341SAndroid Build Coastguard Worker %define           subpelv1  m11
2682*c0909341SAndroid Build Coastguard Worker %define           subpelv2  m12
2683*c0909341SAndroid Build Coastguard Worker %define           subpelv3  m13
2684*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0 ; duplicate each tap byte into both halves of a word
2685*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
2686*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_8192] ; x86-64 keeps both constants in registers (w8192reg / d512reg below)
2687*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_512]
2688*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000 ; broadcast tap pair 0/1
2689*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111 ; broadcast tap pair 2/3
2690*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222 ; broadcast tap pair 4/5
2691*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q3333 ; broadcast tap pair 6/7
2692*c0909341SAndroid Build Coastguard Worker%endif
2693*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m1, q0000 ; broadcast the four horizontal taps to every dword
2694*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
2695*c0909341SAndroid Build Coastguard Worker    je .hv_w4 ; w == 4; anything else falls through to .hv_w2
2696*c0909341SAndroid Build Coastguard Worker.hv_w2: ; combined horizontal + vertical filter that writes each output row with one 16-bit store
2697*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4] ; byte shuffle used before every horizontal pmaddubsw below
2698*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*0]     ; 0
2699*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ssq*1]     ; 0 _ 1
2700*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2701*c0909341SAndroid Build Coastguard Worker %define           w8192reg  [base+pw_8192]
2702*c0909341SAndroid Build Coastguard Worker %define            d512reg  [base+pd_512]
2703*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2704*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]     ; 2
2705*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*1]     ; 2 _ 3
2706*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2707*c0909341SAndroid Build Coastguard Worker%else
2708*c0909341SAndroid Build Coastguard Worker %define           w8192reg  m8
2709*c0909341SAndroid Build Coastguard Worker %define            d512reg  m9
2710*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*2]     ; 2
2711*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2712*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*0]     ; 2 _ 3
2713*c0909341SAndroid Build Coastguard Worker%endif
2714*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6 ; 0 ~ 1 ~
2715*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6 ; 2 ~ 3 ~
2716*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7 ; subpel_filters
2717*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ; subpel_filters
2718*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0 ; 0 1 2 3
2719*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, w8192reg ; with words of 8192 this is a rounded >>2 (assumes pw_8192 holds 8192 - confirm)
2720*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2721*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]     ; 4
2722*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+ssq*1]     ; 4 _ 5
2723*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2724*c0909341SAndroid Build Coastguard Worker%else
2725*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*1]     ; 4
2726*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+ssq*2]     ; 4 _ 5
2727*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2728*c0909341SAndroid Build Coastguard Worker%endif
2729*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]     ; 6
2730*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6 ; 4 ~ 5 ~
2731*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6 ; 6 ~
2732*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7 ; subpel_filters
2733*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ; subpel_filters
2734*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0 ; 4 5 6 _
2735*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, w8192reg ; same rounding step as for rows 0-3
2736*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4; V        1 2 3 4
2737*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4   ; V 01 12    0 1 1 2
2738*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4       ; V 23 34    2 3 3 4
2739*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121; V          5 6 5 6
2740*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0       ; V 45 56    4 5 5 6
2741*c0909341SAndroid Build Coastguard Worker.hv_w2_loop: ; each iteration filters two new source rows and writes two output rows
2742*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1] ; V 7
2743*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; V
2744*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ssq*0] ; V 7 8
2745*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6 ; arrange rows 7 and 8 for the horizontal taps
2746*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7 ; horizontal taps applied to rows 7 and 8
2747*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, subpelv0; V a0 b0
2748*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2       ; V
2749*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, subpelv1 ; V a1 b1
2750*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2       ; V
2751*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3       ; V
2752*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv2 ; a2 b2
2753*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4 ; finish the horizontal sums: 7 8 7 8
2754*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, w8192reg ; same rounding step as in the setup above the loop
2755*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3       ; V
2756*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12 ; top dword of m0 (row 6) followed by rows 7 8 7
2757*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4 ; keep 7 8 7 8; its top dword (row 8) is the next iteration's row 6
2758*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0           ; V 67 78
2759*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, subpelv3 ; V a3 b3
2760*c0909341SAndroid Build Coastguard Worker    paddd                m5, d512reg ; rounding bias for the >>10 below (assumes pd_512 holds 512 - confirm)
2761*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
2762*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
2763*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5 ; dwords -> signed words
2764*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5 ; words -> unsigned bytes with saturation
2765*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5 ; bytes 0-1 belong to row a, bytes 2-3 to row b
2766*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r4w ; store row a
2767*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 16
2768*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r4w ; store row b
2769*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
2770*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2 ; two rows written per iteration
2771*c0909341SAndroid Build Coastguard Worker    jg .hv_w2_loop
2772*c0909341SAndroid Build Coastguard Worker    RET
2773*c0909341SAndroid Build Coastguard Worker%undef w8192reg
2774*c0909341SAndroid Build Coastguard Worker%undef d512reg
2775*c0909341SAndroid Build Coastguard Worker.hv_w4:
2776*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_0 4
2777*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_1 5
2778*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_2 6
2779*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_3 7
2780*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_4 8
2781*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_5 9
2782*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_0 10
2783*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_1 11
2784*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_2 12
2785*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_3 13
2786*c0909341SAndroid Build Coastguard Worker%macro SAVELINE_W4 3
2787*c0909341SAndroid Build Coastguard Worker    mova     [rsp+mmsize*hv4_line_%3_%2], %1
2788*c0909341SAndroid Build Coastguard Worker%endmacro
2789*c0909341SAndroid Build Coastguard Worker%macro RESTORELINE_W4 3
2790*c0909341SAndroid Build Coastguard Worker    mova     %1, [rsp+mmsize*hv4_line_%3_%2]
2791*c0909341SAndroid Build Coastguard Worker%endmacro
2792*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2793*c0909341SAndroid Build Coastguard Worker %define           w8192reg  [base+pw_8192]
2794*c0909341SAndroid Build Coastguard Worker %define            d512reg  [base+pd_512]
2795*c0909341SAndroid Build Coastguard Worker%else
2796*c0909341SAndroid Build Coastguard Worker %define           w8192reg  m8
2797*c0909341SAndroid Build Coastguard Worker %define            d512reg  m9
2798*c0909341SAndroid Build Coastguard Worker%endif
2799*c0909341SAndroid Build Coastguard Worker    ; lower shuffle 0 1 2 3 4
2800*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
2801*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ssq*0]   ; 0 _ _ _
2802*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+ssq*1]   ; 0 _ 1 _
2803*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2804*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2805*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0]   ; 2 _ _ _
2806*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ssq*1]   ; 2 _ 3 _
2807*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2808*c0909341SAndroid Build Coastguard Worker%else
2809*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*2]   ; 2 _ _ _
2810*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ss3q ]   ; 2 _ 3 _
2811*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
2812*c0909341SAndroid Build Coastguard Worker%endif
2813*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
2814*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
2815*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7 ;H subpel_filters
2816*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ;H subpel_filters
2817*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0 ;H 0 1 2 3
2818*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, w8192reg ;H pw_8192
2819*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 0
2820*c0909341SAndroid Build Coastguard Worker    ; upper shuffle 2 3 4 5 6
2821*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
2822*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
2823*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
2824*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7 ;H subpel_filters
2825*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ;H subpel_filters
2826*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0 ;H 0 1 2 3
2827*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, w8192reg ;H pw_8192
2828*c0909341SAndroid Build Coastguard Worker    ;
2829*c0909341SAndroid Build Coastguard Worker    ; lower shuffle
2830*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
2831*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ssq*0]   ; 4 _ _ _
2832*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+ssq*1]   ; 4 _ 5 _
2833*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
2834*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2835*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0]   ; 6 _ _ _
2836*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
2837*c0909341SAndroid Build Coastguard Worker%else
2838*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*2]   ; 6 _ _ _
2839*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
2840*c0909341SAndroid Build Coastguard Worker%endif
2841*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
2842*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
2843*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7 ;H subpel_filters
2844*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ;H subpel_filters
2845*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0 ;H 4 5 6 7
2846*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, w8192reg ;H pw_8192
2847*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 0
2848*c0909341SAndroid Build Coastguard Worker    ; upper shuffle
2849*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
2850*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
2851*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
2852*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7 ;H subpel_filters
2853*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7 ;H subpel_filters
2854*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0 ;H 4 5 6 7
2855*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, w8192reg ;H pw_8192
2856*c0909341SAndroid Build Coastguard Worker    ;process high
2857*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4;V 1 2 3 4
2858*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4  ; V 01 12
2859*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4      ; V 23 34
2860*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121;V 5 6 5 6
2861*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0      ; V 45 56
2862*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 1
2863*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 1
2864*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 1
2865*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 1
2866*c0909341SAndroid Build Coastguard Worker    ;process low
2867*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 0
2868*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 0
2869*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4;V 1 2 3 4
2870*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4  ; V 01 12
2871*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4      ; V 23 34
2872*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121;V 5 6 5 6
2873*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0      ; V 45 56
; .hv_w4_loop: each pass stores two 4-byte output rows (dstq+dsq*0 and
; dstq+dsq*1) and consumes two new source rows; hd counts down by 2.
; The same filter sequence is executed twice per pass: once on the "low"
; state (horizontal shuffle mask subpel_h_shuf4, SAVELINE/RESTORELINE set 0)
; and once on the "high" state (mask subpel_h_shuf4+16, set 1). Both halves
; load the same two source rows; srcq is only advanced in the high half.
; SAVELINE_W4/RESTORELINE_W4 are defined earlier in the file (presumably
; plain mova spills to the stack -- confirm at their definition).
2874*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
2875*c0909341SAndroid Build Coastguard Worker    ;process low
2876*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, subpelv0 ; V a0 b0
2877*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2878*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, subpelv1; V a1 b1
2879*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2880*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2881*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv2; V a2 b2
2882*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
    ; m5 = subpelv0..2 partial sum; m1/m2 now hold the old m2/m3, i.e. the
    ; row-pair window has already been slid for the next pass.
2883*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
2884*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0] ; 7
2885*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
2886*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
2887*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7 ;H subpel_filters
2888*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4 ;H                7 8 7 8
2889*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, w8192reg ;H pw_8192
    ; palignr takes the top dword of m0 (newest row kept from the previous
    ; pass) followed by the low three dwords of the freshly filtered m4.
2890*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12         ; 6 7 8 7
2891*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
2892*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
2893*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, subpelv3; a3 b3
    ; finish the vertical sum: add the rounding term, then arithmetic >> 10
2894*c0909341SAndroid Build Coastguard Worker    paddd                m5, d512reg ; pd_512
2895*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
2896*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
    ; park the low-half state (m0-m3) and its result (m5) while the high
    ; half reuses the same registers
2897*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 0
2898*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 0
2899*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 0
2900*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 0
2901*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m5, 5, 0
2902*c0909341SAndroid Build Coastguard Worker    ;process high
2903*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m0, 0, 1
2904*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m1, 1, 1
2905*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 1
2906*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 1
2907*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, subpelv0; V a0 b0
2908*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
2909*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, subpelv1; V a1 b1
2910*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
2911*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
2912*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv2; V a2 b2
2913*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
    ; same two source rows as the low half, shuffled with the second mask
2914*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
2915*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*0] ; 7
2916*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ssq*1] ; 7 _ 8 _
    ; both halves have now read these rows, so step srcq down two rows
2917*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
2918*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
2919*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7 ;H subpel_filters
2920*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4 ;H                7 8 7 8
2921*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, w8192reg ;H pw_8192
2922*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12         ; 6 7 8 7
2923*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
2924*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
2925*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, subpelv3; a3 b3
2926*c0909341SAndroid Build Coastguard Worker    paddd                m5, d512reg ; pd_512
2927*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
    ; the high result goes to m4 so that m5 can take back the parked low result
2928*c0909341SAndroid Build Coastguard Worker    psrad                m4, m5, 10
2929*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m5, 5, 0
2930*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m4 ; d -> w
2931*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5 ; w -> b
    ; swap words 1 and 2 so that the low dword holds the row stored at
    ; dsq*0 and the next dword the row stored at dsq*1
2932*c0909341SAndroid Build Coastguard Worker    pshuflw              m5, m5, q3120
2933*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m5
2934*c0909341SAndroid Build Coastguard Worker    psrlq                m5, 32
2935*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m5
2936*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
    ; The jg at the bottom consumes the flags of this sub; that relies on the
    ; SAVELINE_W4/RESTORELINE_W4 expansions in between leaving EFLAGS
    ; untouched (presumably they are plain mova -- confirm).
2937*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
    ; swap state for the next pass: store the high set, reload the low set
2938*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 1
2939*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 1
2940*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 1
2941*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 1
2942*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m0, 0, 0
2943*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m1, 1, 0
2944*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 0
2945*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 0
2946*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
2947*c0909341SAndroid Build Coastguard Worker    RET
; Drop the subpelv0-3 definitions used above; the .hv_w8 path that follows
; %defines its own.
2948*c0909341SAndroid Build Coastguard Worker%undef subpelv0
2949*c0909341SAndroid Build Coastguard Worker%undef subpelv1
2950*c0909341SAndroid Build Coastguard Worker%undef subpelv2
2951*c0909341SAndroid Build Coastguard Worker%undef subpelv3
; .hv_w8 setup: builds the horizontal (subpelh0/1) and vertical (subpelv0-3)
; coefficient operands, reserves the spill slots used by SAVELINE_W8 /
; RESTORELINE_W8, moves srcq to the top-left tap position and initialises
; the strip counter in r6d. The block is then processed in 4-pixel-wide
; column strips by .hv_w8_loop0 / .hv_w8_loop / .hv_w8_outer.
2952*c0909341SAndroid Build Coastguard Worker.hv_w8:
    ; x86inc macro; presumably discards the stack allocation of the previous
    ; path so that ALLOC_STACK below starts from a clean state -- confirm.
2953*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
; Logical line name -> spill slot index. There is no hv8_line_5; line 6 is
; the fifth slot (index 4).
2954*c0909341SAndroid Build Coastguard Worker%define hv8_line_1 0
2955*c0909341SAndroid Build Coastguard Worker%define hv8_line_2 1
2956*c0909341SAndroid Build Coastguard Worker%define hv8_line_3 2
2957*c0909341SAndroid Build Coastguard Worker%define hv8_line_4 3
2958*c0909341SAndroid Build Coastguard Worker%define hv8_line_6 4
; SAVELINE_W8 line, reg: store reg to the slot of the given logical line.
2959*c0909341SAndroid Build Coastguard Worker%macro SAVELINE_W8 2
2960*c0909341SAndroid Build Coastguard Worker    mova     [rsp+hv8_line_%1*mmsize], %2
2961*c0909341SAndroid Build Coastguard Worker%endmacro
; RESTORELINE_W8 line, reg: load reg from the slot of the given logical line.
2962*c0909341SAndroid Build Coastguard Worker%macro RESTORELINE_W8 2
2963*c0909341SAndroid Build Coastguard Worker    mova     %2, [rsp+hv8_line_%1*mmsize]
2964*c0909341SAndroid Build Coastguard Worker%endmacro
    ; mxd >> 16 is used below as the row index into subpel_filters
    ; (8 bytes per row) for the horizontal coefficients.
2965*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
    ; step srcq back 3 bytes (presumably so the 8 horizontal taps start
    ; 3 pixels left of the output position at 1 byte per pixel).
2966*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
2967*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32: too few vector registers, so the coefficients and the two
    ; vertical accumulators live in stack slots 5..12, above the five
    ; SAVELINE_W8 slots 0..4 (13 slots in total, see ALLOC_STACK below).
2968*c0909341SAndroid Build Coastguard Worker %define           base_reg  r1
2969*c0909341SAndroid Build Coastguard Worker %define           subpelh0  [rsp+mmsize*5]
2970*c0909341SAndroid Build Coastguard Worker %define           subpelh1  [rsp+mmsize*6]
2971*c0909341SAndroid Build Coastguard Worker %define           subpelv0  [rsp+mmsize*7]
2972*c0909341SAndroid Build Coastguard Worker %define           subpelv1  [rsp+mmsize*8]
2973*c0909341SAndroid Build Coastguard Worker %define           subpelv2  [rsp+mmsize*9]
2974*c0909341SAndroid Build Coastguard Worker %define           subpelv3  [rsp+mmsize*10]
2975*c0909341SAndroid Build Coastguard Worker %define             accuv0  [rsp+mmsize*11]
2976*c0909341SAndroid Build Coastguard Worker %define             accuv1  [rsp+mmsize*12]
2977*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg+mxq*8+subpel_filters-put_ssse3]
    ; Here the register named ss is used as the packed vertical filter index
    ; (the real stride is reloaded from ssmp right after). mxd is free again
    ; and takes the index from the low byte.
2978*c0909341SAndroid Build Coastguard Worker    movzx               mxd, ssb
2979*c0909341SAndroid Build Coastguard Worker    shr                 ssd, 16
    ; if h < 6 (sign flag from the cmp) use the low-byte index instead of the
    ; >>16 one (presumably the 4-tap rather than the 8-tap table row)
2980*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
2981*c0909341SAndroid Build Coastguard Worker    cmovs               ssd, mxd
2982*c0909341SAndroid Build Coastguard Worker    movq                 m5, [base_reg+ssq*8+subpel_filters-put_ssse3]
2983*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssmp
2984*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*13
2985*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
    ; dstm/dsm are redirected to slots just above the 13 vectors and dsm is
    ; copied there from the incoming argument area (presumably because rsp
    ; gets realigned in this configuration -- confirm against x86inc).
2986*c0909341SAndroid Build Coastguard Worker %define               dstm  [rsp+mmsize*13+gprsize*1]
2987*c0909341SAndroid Build Coastguard Worker %define                dsm  [rsp+mmsize*13+gprsize*2]
2988*c0909341SAndroid Build Coastguard Worker    mov                  r6, [rstk+stack_offset+gprsize*2]
2989*c0909341SAndroid Build Coastguard Worker    mov                 dsm, r6
2990*c0909341SAndroid Build Coastguard Worker%endif
    ; horizontal filter: broadcast bytes 0-3 and bytes 4-7 of the 8-byte row
2991*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m1, q0000
2992*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1111
    ; vertical filter: widen the bytes to signed words, then broadcast each
    ; pair of adjacent coefficients
2993*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m5
2994*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8 ; sign-extend
2995*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q0000
2996*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m5, q1111
2997*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m5, q2222
2998*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3333
2999*c0909341SAndroid Build Coastguard Worker    mova           subpelh0, m0
3000*c0909341SAndroid Build Coastguard Worker    mova           subpelh1, m1
3001*c0909341SAndroid Build Coastguard Worker    mova           subpelv0, m2
3002*c0909341SAndroid Build Coastguard Worker    mova           subpelv1, m3
3003*c0909341SAndroid Build Coastguard Worker    mova           subpelv2, m4
3004*c0909341SAndroid Build Coastguard Worker    mova           subpelv3, m5
    ; move srcq up three rows; remember the strip's dst start in dstm
3005*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
3006*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
3007*c0909341SAndroid Build Coastguard Worker    sub                srcq, r6
3008*c0909341SAndroid Build Coastguard Worker%else
    ; x86-64: only the five SAVELINE_W8 slots go on the stack; coefficients
    ; stay in m10-m15 and the vertical accumulators in m8/m9.
3009*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*5, 16
3010*c0909341SAndroid Build Coastguard Worker %define           subpelh0  m10
3011*c0909341SAndroid Build Coastguard Worker %define           subpelh1  m11
3012*c0909341SAndroid Build Coastguard Worker %define           subpelv0  m12
3013*c0909341SAndroid Build Coastguard Worker %define           subpelv1  m13
3014*c0909341SAndroid Build Coastguard Worker %define           subpelv2  m14
3015*c0909341SAndroid Build Coastguard Worker %define           subpelv3  m15
3016*c0909341SAndroid Build Coastguard Worker %define             accuv0  m8
3017*c0909341SAndroid Build Coastguard Worker %define             accuv1  m9
3018*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+mxq*8+subpel_filters-put_ssse3]
    ; mxd is free after the load above and is reused for the low-byte index
    ; of my; if h < 6 that index replaces the >>16 one (see cmovs).
3019*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3020*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3021*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3022*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
3023*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg+myq*8+subpel_filters-put_ssse3]
3024*c0909341SAndroid Build Coastguard Worker    pshufd         subpelh0, m0, q0000
3025*c0909341SAndroid Build Coastguard Worker    pshufd         subpelh1, m0, q1111
3026*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
3027*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
3028*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv0, m1, q0000
3029*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv1, m1, q1111
3030*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv2, m1, q2222
3031*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv3, m1, q3333
    ; move srcq up three rows; remember the strip's dst start in r7
3032*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
3033*c0909341SAndroid Build Coastguard Worker    mov                  r7, dstq
3034*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q
3035*c0909341SAndroid Build Coastguard Worker%endif
    ; r6d = h + (w << 14) - (1 << 16): h sits in the low bits and
    ; (w/4 - 1) in bits 16 and up. .hv_w8_outer restores hd from the low
    ; bits and subtracts 1<<16 per finished 4-pixel strip.
3036*c0909341SAndroid Build Coastguard Worker    shl                  wd, 14
3037*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+wq-(1<<16)]
    ; r4 = source start of the current strip (advanced by 4 per strip)
3038*c0909341SAndroid Build Coastguard Worker    mov                  r4, srcq
; .hv_w8_loop0: per-strip prologue. Horizontally filters source rows 0-6 of
; the current strip and builds the interleaved row pairs consumed by the
; vertical filter in .hv_w8_loop. On exit: m1..m6 = pairs 01,12,23,34,45,56,
; line slot 6 = filtered row 6, srcq points at row 6 (both architectures).
3039*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
3040*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0] ; 0 = _ _
3041*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1] ; 1 = _ _
3042*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; x86-32 has no ss3q here, so it walks srcq two rows at a time
3043*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3044*c0909341SAndroid Build Coastguard Worker%endif
; HV_H_W8: horizontal filter of one row.
;   %1      in: unaligned source bytes, out: filtered words
;   %2-%4   scratch registers (clobbered)
;   %5-%7   registers holding subpel_h_shufA/B/C; only read on x86-64, the
;           x86-32 variant takes the three tables straight from memory.
; The row is shuffled three ways, multiplied against subpelh0 / subpelh1
; with pmaddubsw, summed pairwise and combined with phaddw into %1.
3045*c0909341SAndroid Build Coastguard Worker%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
3046*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
3047*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, [base+subpel_h_shufB]
3048*c0909341SAndroid Build Coastguard Worker    pshufb               %4, %1, [base+subpel_h_shufC]
3049*c0909341SAndroid Build Coastguard Worker    pshufb               %1,     [base+subpel_h_shufA]
3050*c0909341SAndroid Build Coastguard Worker %else
3051*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, %6  ; subpel_h_shufB
3052*c0909341SAndroid Build Coastguard Worker    pshufb               %4, %1, %7  ; subpel_h_shufC
3053*c0909341SAndroid Build Coastguard Worker    pshufb               %1, %5      ; subpel_h_shufA
3054*c0909341SAndroid Build Coastguard Worker %endif
3055*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, %3, subpelh0 ; subpel +0 C0
3056*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %4, subpelh1; subpel +4 B4
3057*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, subpelh1; C4
3058*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, subpelh0; A0
3059*c0909341SAndroid Build Coastguard Worker    paddw                %2, %4      ; C0+B4
3060*c0909341SAndroid Build Coastguard Worker    paddw                %1, %3      ; A0+C4
3061*c0909341SAndroid Build Coastguard Worker    phaddw               %1, %2
3062*c0909341SAndroid Build Coastguard Worker%endmacro
3063*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; (Re)load the shuffle tables for every strip: .hv_w8_loop reuses m8/m9
    ; as accumulators and m7 as a scratch/constant register.
3064*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufA]
3065*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+subpel_h_shufB]
3066*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufC]
3067*c0909341SAndroid Build Coastguard Worker%endif
3068*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
3069*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
3070*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3071*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0] ; 2 = _ _
3072*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1] ; 3 = _ _
3073*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3074*c0909341SAndroid Build Coastguard Worker%else
3075*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2] ; 2 = _ _
3076*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
3077*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; 3 = _ _
3078*c0909341SAndroid Build Coastguard Worker%endif
3079*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
3080*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
    ; m7 is overwritten with the rounding constant here, which is why
    ; subpel_h_shufA is loaded into it again further down.
3081*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
3082*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m7 ; H pw_8192
3083*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7 ; H pw_8192
3084*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m7 ; H pw_8192
3085*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7 ; H pw_8192
    ; punpcklwd interleaves only the low four words of each row, i.e. the
    ; vertical stage works on four pixels per row.
3086*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5  ; 0 1 ~
3087*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6  ; 1 2 ~
3088*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m0  ; 2 3 ~
    ; m1-m3 are about to be reused as HV_H_W8 scratch: spill pairs 01/12/23
    ; (restored at the end of this prologue)
3089*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           1, m1
3090*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           2, m2
3091*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           3, m3
    ; Needed by the x86-64 HV_H_W8 calls below (m8/m9 still hold shufB/C).
    ; On x86-32 the macro ignores its shuffle arguments, so there this load
    ; has no consumer before m7 is overwritten with pw_8192 again.
3092*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufA]
3093*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3094*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]       ; 4 = _ _
3095*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]       ; 5 = _ _
3096*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3097*c0909341SAndroid Build Coastguard Worker%else
3098*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]       ; 4 = _ _
3099*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*2]       ; 5 = _ _
3100*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
3101*c0909341SAndroid Build Coastguard Worker%endif
    ; on both architectures srcq now points at row 6
3102*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0]       ; 6 = _ _
3103*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
3104*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
3105*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
3106*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
3107*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
3108*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
3109*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
    ; m0 still holds filtered row 3 from the first half
3110*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m1  ; 3 4 ~
3111*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m1, m2  ; 4 5 ~
3112*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2, m3  ; 5 6 ~
    ; keep filtered row 6 for the first "6 7" pair built in .hv_w8_loop,
    ; then bring pairs 01/12/23 back into m1-m3
3113*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           6, m3
3114*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        1, m1
3115*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        2, m2
3116*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        3, m3
; .hv_w8_loop: each pass emits two 4-pixel output rows ("a" and "b") of the
; current strip. On entry m1..m6 hold six consecutive interleaved row pairs
; (01,12,23,34,45,56 on the first pass) and line slot 6 holds the newest
; horizontally filtered row.
3117*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3118*c0909341SAndroid Build Coastguard Worker    ; accuv0 accumulates V a, accuv1 accumulates V b
3119*c0909341SAndroid Build Coastguard Worker    ; (m8/m9 on x86-64, stack slots on x86-32)
    ; m3..m6 become m1..m4 of the next pass (reloaded at the loop bottom),
    ; which slides the pair window down by two rows
3120*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           1, m3
3121*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           2, m4
3122*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           3, m5
3123*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           4, m6
3124*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; partial sums built in m0/m7, then parked in the accuv0/accuv1 slots
3125*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, subpelv0 ; a0
3126*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, subpelv0 ; b0
3127*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv1     ; a1
3128*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, subpelv1     ; b1
3129*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3
3130*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4
3131*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, subpelv2     ; a2
3132*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, subpelv2     ; b2
3133*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
3134*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6
3135*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pd_512]
3136*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5 ;   pd_512
3137*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5 ;   pd_512
3138*c0909341SAndroid Build Coastguard Worker    mova             accuv0, m0
3139*c0909341SAndroid Build Coastguard Worker    mova             accuv1, m7
3140*c0909341SAndroid Build Coastguard Worker%else
3141*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m1, subpelv0 ; a0
3142*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m2, subpelv0 ; b0
3143*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv1     ; a1
3144*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, subpelv1     ; b1
3145*c0909341SAndroid Build Coastguard Worker    paddd                m8, m3
3146*c0909341SAndroid Build Coastguard Worker    paddd                m9, m4
3147*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, subpelv2     ; a2
3148*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, subpelv2     ; b2
3149*c0909341SAndroid Build Coastguard Worker    paddd                m8, m5
3150*c0909341SAndroid Build Coastguard Worker    paddd                m9, m6
3151*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pd_512]
3152*c0909341SAndroid Build Coastguard Worker    paddd                m8, m7 ;   pd_512
3153*c0909341SAndroid Build Coastguard Worker    paddd                m9, m7 ;   pd_512
    ; m8/m9 are busy as accumulators, so the shuffle tables for the
    ; HV_H_W8 calls below go to m5 (A), m7 (B) and m6 (C) instead
3154*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufB]
3155*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shufC]
3156*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+subpel_h_shufA]
3157*c0909341SAndroid Build Coastguard Worker%endif
    ; srcq points at the newest row already filtered; fetch the next two
    ; rows and advance by two
3158*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1] ; 7
3159*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*2] ; 8
3160*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3161*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
3162*c0909341SAndroid Build Coastguard Worker    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
3163*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_8192]
3164*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5 ; H pw_8192
3165*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5 ; H pw_8192
    ; m6 = newest row of the previous pass (row 6 on the first pass)
3166*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        6, m6
3167*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0  ; 6 7  ~
3168*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m4  ; 7 8 ~
    ; last tap pair added onto the parked partial sums: m2 = row a, m1 = row b
3169*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, subpelv3 ; a3
3170*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, accuv0
3171*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6, subpelv3 ; b3
3172*c0909341SAndroid Build Coastguard Worker    paddd                m1, m1, accuv1 ; H + V
3173*c0909341SAndroid Build Coastguard Worker    psrad                m2, 10
3174*c0909341SAndroid Build Coastguard Worker    psrad                m1, 10
    ; after the two packs the low 8 bytes of m2 are row a (4 bytes) followed
    ; by row b (4 bytes); the upper 8 bytes are never stored
3175*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1  ; d -> w
3176*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m1 ; w -> b
3177*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m2
3178*c0909341SAndroid Build Coastguard Worker    psrlq                m2, 32
3179*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; the stride is taken from memory (dsm); the dsq*0 term means no stride
    ; register is actually needed for the address
3180*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
3181*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m2
3182*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
3183*c0909341SAndroid Build Coastguard Worker%else
3184*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m2
3185*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
3186*c0909341SAndroid Build Coastguard Worker%endif
3187*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3188*c0909341SAndroid Build Coastguard Worker    jle .hv_w8_outer
    ; more rows in this strip: m4 (row 8) becomes the newest row in slot 6,
    ; m1..m4 take the pairs spilled at the loop top; m5/m6 already hold the
    ; two newest pairs built above
3189*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           6, m4
3190*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        1, m1
3191*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        2, m2
3192*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        3, m3
3193*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        4, m4
3194*c0909341SAndroid Build Coastguard Worker    jmp .hv_w8_loop
; .hv_w8_outer: the current 4-pixel strip is finished. Step the saved strip
; origins (source in r4, destination in dstm on x86-32 / r7 on x86-64) four
; bytes to the right, reload hd from the low bits of r6 and run another
; strip for as long as the counter kept in the upper bits of r6d allows.
3195*c0909341SAndroid Build Coastguard Worker.hv_w8_outer:
3196*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3197*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
3198*c0909341SAndroid Build Coastguard Worker    add                  r4, 4
    ; h is restored from the low 16 bits of r6 here
3199*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6w
3200*c0909341SAndroid Build Coastguard Worker    add                dstq, 4
3201*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
3202*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstq
3203*c0909341SAndroid Build Coastguard Worker%else
3204*c0909341SAndroid Build Coastguard Worker    add                  r4, 4
3205*c0909341SAndroid Build Coastguard Worker    add                  r7, 4
    ; h is restored from the low byte only, so this path assumes h <= 255
    ; NOTE(review): confirm against the callers' maximum block height
3206*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3207*c0909341SAndroid Build Coastguard Worker    mov                srcq, r4
3208*c0909341SAndroid Build Coastguard Worker    mov                dstq, r7
3209*c0909341SAndroid Build Coastguard Worker%endif
    ; one strip done; r6d was seeded with (w/4 - 1) << 16 plus h, so the
    ; result stays positive while strips remain
3210*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<16
3211*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3212*c0909341SAndroid Build Coastguard Worker    RET
3213*c0909341SAndroid Build Coastguard Worker
; Per-ABI choice of the two temporaries t0/t1 (x86inc DECLARE_REG_TMP).
; prep_6tap_8bpc adds t0d to mxd and t1d to myd at its entry, so they are
; presumably loaded with the filter-type offsets by the stubs generated
; below -- confirm at the FN macro definition.
3214*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3215*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 1, 2
3216*c0909341SAndroid Build Coastguard Worker%elif WIN64
3217*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 6, 4
3218*c0909341SAndroid Build Coastguard Worker%else
3219*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 6, 7
3220*c0909341SAndroid Build Coastguard Worker%endif
3221*c0909341SAndroid Build Coastguard Worker
; Addressing of constant tables in the prep functions: on x86-32 every
; [base+sym] operand is relative to prep_ssse3 through base_reg (r2); on
; x86-64 base is 0, so [base+sym] is a plain symbol reference and base_reg
; (r7) is only needed where the code adds it explicitly.
3222*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3223*c0909341SAndroid Build Coastguard Worker %define base_reg r2
3224*c0909341SAndroid Build Coastguard Worker %define base base_reg-prep_ssse3
3225*c0909341SAndroid Build Coastguard Worker%else
3226*c0909341SAndroid Build Coastguard Worker %define base_reg r7
3227*c0909341SAndroid Build Coastguard Worker %define base 0
3228*c0909341SAndroid Build Coastguard Worker%endif
3229*c0909341SAndroid Build Coastguard Worker
; Entry stubs for the prep_8tap filter-type combinations, generated by the FN
; macro (defined earlier in the file). Three of them name prep_6tap_8bpc as
; their target. The last one passes no target and sits directly in front of
; the prep_6tap_8bpc body, so it presumably falls through into it.
3230*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_FN FN prep_8tap,
3231*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_8bpc
3232*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_8bpc
3233*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_8bpc
3234*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular,        REGULAR, REGULAR
3235*c0909341SAndroid Build Coastguard Worker
; prep_6tap_8bpc entry and dispatch.
; Arguments (names from the cglobal line): tmp, src, ss, w, h, mx, my; ns is
; an extra named register. mx and my are each replicated into three bytes
; and combined with t0d / t1d. Bits 8-11 of the results decide the path:
;   mx bits set           -> .h  (which branches on to .hv when my bits are set)
;   only my bits set      -> .v
;   neither               -> .prep, an indirect jump through prep_ssse3_table
;                            indexed by tzcnt(w)
3236*c0909341SAndroid Build Coastguard Workercglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, mx, my, ns
3237*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101
3238*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3239*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101
3240*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
3241*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
3242*c0909341SAndroid Build Coastguard Worker    movifnidn          srcd, srcm
3243*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3244*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, prep_ssse3
    ; per the byte layout noted above, bits 8-11 belong to the plain mx / my
    ; byte, so these tests presumably mean "fraction is non-zero"
3245*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00
3246*c0909341SAndroid Build Coastguard Worker    jnz .h
3247*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3248*c0909341SAndroid Build Coastguard Worker    jnz .v
3249*c0909341SAndroid Build Coastguard Worker.prep:
    ; table entries are 16-bit offsets relative to prep_ssse3; adding
    ; base_reg turns the selected entry into the jump target
3250*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
3251*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
    ; m4 = 0, ssq = source stride and r6 = 3*stride are set up before the
    ; jump (presumably expected by the target code -- confirm there)
3252*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4
3253*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
3254*c0909341SAndroid Build Coastguard Worker    movifnidn           ssq, ssmp
3255*c0909341SAndroid Build Coastguard Worker    lea                  r6, [ssq*3]
3256*c0909341SAndroid Build Coastguard Worker%if WIN64
    ; NOTE(review): presumably undoes the r7/r8 pushes made by the cglobal
    ; prologue on WIN64, since the jump target returns on its own -- confirm
    ; against x86inc and the target's register declaration.
3257*c0909341SAndroid Build Coastguard Worker    pop                  r8
3258*c0909341SAndroid Build Coastguard Worker    pop                  r7
3259*c0909341SAndroid Build Coastguard Worker%endif
3260*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .h - horizontal 6-tap filtering (reached when mx has a fractional part).
; If my has a fractional part as well, control moves on to .hv.
; Widths <= 4 are handled by prep_8tap_8bpc's .h_w4; w >= 16 goes to .h_w16;
; the remaining case falls through to .h_w8.
3261*c0909341SAndroid Build Coastguard Worker.h:
3262*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
3263*c0909341SAndroid Build Coastguard Worker    jnz .hv
; On x86-32 the stride lives in r6 for this path and is fetched from its
; stack slot.
3266*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3267*c0909341SAndroid Build Coastguard Worker %define ssq r6
3268*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssmp
3269*c0909341SAndroid Build Coastguard Worker%endif
3270*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3271*c0909341SAndroid Build Coastguard Worker    jle mangle(private_prefix %+ _prep_8tap_8bpc %+ SUFFIX).h_w4
3272*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      11
; m5 = multiplier for the final pmulhrsw in PREP_6TAP_H (assuming pw_8192
; holds words of 8192, that instruction computes (x + 2) >> 2).
3273*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_8192]
; x86-64 has spare registers, so the three shuffle masks are kept in m8-m10;
; x86-32 uses them as memory operands inside the macro instead.
3274*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3275*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+subpel_h_shufD]
3276*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufE]
3277*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+subpel_h_shufF]
3278*c0909341SAndroid Build Coastguard Worker%endif
; The top field of mxd (bits 16+) selects the filter row. Reading from byte 1
; of the 8-byte row and broadcasting word pairs gives
;   m4 = row bytes 1-2, m6 = row bytes 3-4, m7 = row bytes 5-6.
3279*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3280*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3281*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
3282*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m7
3283*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000
3284*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
3285*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q2222
; wd becomes w - 16; .h_w16 relies on that biased value.
3286*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
3287*c0909341SAndroid Build Coastguard Worker    jge .h_w16
; PREP_6TAP_H: horizontal 6-tap filter of one register of source bytes.
;   %1     in: source bytes, out: 8 filtered words
;   %2, %3 scratch registers (clobbered)
; Expects m4/m6/m7 = coefficient pairs and m5 = rounding multiplier; on
; x86-64 also m8/m9/m10 = subpel_h_shufD/E/F.
3288*c0909341SAndroid Build Coastguard Worker%macro PREP_6TAP_H 3 ; dst/src, tmp[1-2]
3289*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3290*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, [base+subpel_h_shufD]
3291*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, [base+subpel_h_shufE]
3292*c0909341SAndroid Build Coastguard Worker    pshufb               %1, [base+subpel_h_shufF]
3293*c0909341SAndroid Build Coastguard Worker%else
3294*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, m8
3295*c0909341SAndroid Build Coastguard Worker    pshufb               %3, %1, m9
3296*c0909341SAndroid Build Coastguard Worker    pshufb               %1, m10
3297*c0909341SAndroid Build Coastguard Worker%endif
3298*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, m4
3299*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, m6
3300*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, m7
3301*c0909341SAndroid Build Coastguard Worker    paddw                %2, %3
3302*c0909341SAndroid Build Coastguard Worker    paddw                %1, %2
3303*c0909341SAndroid Build Coastguard Worker    pmulhrsw             %1, m5
3304*c0909341SAndroid Build Coastguard Worker%endmacro
; Two source rows per iteration; each row yields 16 bytes (8 words) of output.
3305*c0909341SAndroid Build Coastguard Worker.h_w8:
3306*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
3307*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
3308*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3309*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H          m0, m2, m3
3310*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H          m1, m2, m3
3311*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
3312*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
3313*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
3314*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3315*c0909341SAndroid Build Coastguard Worker    jg .h_w8
3316*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16 - horizontal filtering for w >= 16, one row at a time.
; On entry wd holds w - 16 (set just before the jump here). srcq is advanced
; by that amount and wq negated, so r5 walks from -(w - 16) up to 0 in steps
; of 16 and [srcq+r5] addresses successive 16-pixel groups of the row.
3317*c0909341SAndroid Build Coastguard Worker.h_w16:
3318*c0909341SAndroid Build Coastguard Worker    add                srcq, wq
3319*c0909341SAndroid Build Coastguard Worker    neg                  wq
3320*c0909341SAndroid Build Coastguard Worker.h_w16_loop_v:
3321*c0909341SAndroid Build Coastguard Worker    mov                  r5, wq
3322*c0909341SAndroid Build Coastguard Worker.h_w16_loop_h:
; Two overlapping loads 8 bytes apart; each produces 8 output words.
3323*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r5+8*0]
3324*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+r5+8*1]
3325*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H          m0, m2, m3
3326*c0909341SAndroid Build Coastguard Worker    PREP_6TAP_H          m1, m2, m3
3327*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
3328*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
3329*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
; Continue while r5 <= 0 after the increment, i.e. until the group at
; offset 0 has been processed.
3330*c0909341SAndroid Build Coastguard Worker    add                  r5, 16
3331*c0909341SAndroid Build Coastguard Worker    jle .h_w16_loop_h
3332*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
3333*c0909341SAndroid Build Coastguard Worker    dec                  hd
3334*c0909341SAndroid Build Coastguard Worker    jg .h_w16_loop_v
3335*c0909341SAndroid Build Coastguard Worker    RET
; .v - vertical 6-tap filtering (mx has no fractional part, my does).
; x86-64: w <= 4 runs .v_w4 once, wider blocks go to .v_w8.
; x86-32: .v_w4 handles every width as a sequence of 4-pixel-wide columns.
3336*c0909341SAndroid Build Coastguard Worker.v:
; mxd = low field of myd (the "4tap_v" field per the layout comment at entry).
3337*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3338*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
3339*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
3340*c0909341SAndroid Build Coastguard Worker%else
3341*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       9, 12
3342*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3343*c0909341SAndroid Build Coastguard Worker%endif
; myd = top field; for h < 6 the low-field index is used instead.
3344*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3345*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3346*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
; Coefficient pairs from bytes 1-6 of the selected 8-byte filter row:
;   m5 = bytes 1-2, m6 = bytes 3-4, m7 = bytes 5-6.
3347*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
3348*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m7
3349*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q0000
3350*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
3351*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q2222
; x86-32: m8 is a memory alias for pw_8192 and srcq is moved up two rows.
; x86-64: m8 is a real register and nsq = -ssq is used to reach rows above.
3352*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3353*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_8192]
3354*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssm
3355*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3356*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3357*c0909341SAndroid Build Coastguard Worker%else
3358*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_8192]
3359*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
3360*c0909341SAndroid Build Coastguard Worker    neg                 nsq
3361*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3362*c0909341SAndroid Build Coastguard Worker    jg .v_w8
3363*c0909341SAndroid Build Coastguard Worker%endif
3364*c0909341SAndroid Build Coastguard Worker.v_w4:
; x86-32: r5d = ((w - 4) << 14) + h, i.e. (w/4 - 1) in the upper 16 bits and
; h in the lower 16; srcm remembers the top of the current column.
3365*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3366*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq-4]
3367*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 14
3368*c0909341SAndroid Build Coastguard Worker    add                 r5d, hd
3369*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3370*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
3371*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+ssq*0]
3372*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1]
3373*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3374*c0909341SAndroid Build Coastguard Worker%else
3375*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+nsq*2]
3376*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+nsq*1]
3377*c0909341SAndroid Build Coastguard Worker%endif
; Prime the pipeline with the first five rows (numbered 0-4 in the comments).
3378*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+ssq*0]
3379*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+ssq*1]
3380*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3381*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
3382*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m3     ; 0 1
3383*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m2     ; 1 2
3384*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4     ; 2 3
3385*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0     ; 3 4
3386*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m3     ; 01 12
3387*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4     ; 23 34
; Each iteration reads two new rows and emits two output rows (a and b).
3388*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
3389*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+ssq*1]
3390*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3391*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m1, m5 ; a0 b0
3392*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
3393*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6     ; a1 b1
3394*c0909341SAndroid Build Coastguard Worker    paddw                m4, m2
3395*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m0, m3 ; 4 5
3396*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+ssq*0]
3397*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0     ; 5 6
3398*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 45 56
3399*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m2, m7 ; a2 b2
3400*c0909341SAndroid Build Coastguard Worker    paddw                m4, m3
3401*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m8
3402*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; The two 4-word rows are stored w*2 bytes apart (one output row of w words).
3403*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+wq*0], m4
3404*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+wq*2], m4
3405*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
3406*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3407*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
; Column finished: reload the saved pointers, restore h from the low word of
; r5, step 4 pixels right (8 bytes in tmp) and decrement the column counter.
3408*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
3409*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpm
3410*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5w
3411*c0909341SAndroid Build Coastguard Worker    add                srcq, 4
3412*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3413*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3414*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
3415*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<16
3416*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
3417*c0909341SAndroid Build Coastguard Worker%else
; Both 4-word rows are stored contiguously (16 bytes per iteration).
3418*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
3419*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3420*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3421*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3422*c0909341SAndroid Build Coastguard Worker%endif
3423*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8 (x86-64 only) - vertical 6-tap filtering for w > 4, processed as
; 8-pixel-wide columns. Expects m5/m6/m7 = coefficient pairs, m8 = pw_8192
; and nsq = -ssq, all set up in .v.
3424*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3425*c0909341SAndroid Build Coastguard Worker.v_w8:
3426*c0909341SAndroid Build Coastguard Worker    WIN64_PUSH_XMM       12
; r6d = (w*4 - 32)*8 + h = ((w/8 - 1) << 8) + h: the low byte keeps h, the
; bits above it count the remaining columns.
3427*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*4-32]
3428*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [r6*8+hq]
; Per column: r5 walks the source rows, r8 walks the output rows.
3429*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
3430*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+nsq*2]
3431*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+nsq*1]
3432*c0909341SAndroid Build Coastguard Worker    lea                  r5, [srcq+ssq*2]
3433*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
3434*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
3435*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
3436*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r5  +ssq*0]
3437*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2     ; 01
3438*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3     ; 12
3439*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4     ; 23
3440*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0     ; 34
; Two output rows per iteration: m10 accumulates row a, m11 row b, m9 is
; scratch for the third tap pair.
3441*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
3442*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m1, m5 ; a0
3443*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3444*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m2, m5 ; b0
3445*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3446*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6     ; a1
3447*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6     ; b1
3448*c0909341SAndroid Build Coastguard Worker    paddw               m10, m3
3449*c0909341SAndroid Build Coastguard Worker    paddw               m11, m4
3450*c0909341SAndroid Build Coastguard Worker    movq                 m4, [r5+ssq*1]
3451*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+ssq*2]
3452*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m0, m4 ; 45
3453*c0909341SAndroid Build Coastguard Worker    movq                 m0, [r5+ssq*0]
3454*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m0     ; 56
3455*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m3, m7 ; a2
3456*c0909341SAndroid Build Coastguard Worker    paddw               m10, m9
3457*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m4, m7 ; b2
3458*c0909341SAndroid Build Coastguard Worker    paddw               m11, m9
3459*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m10, m8
3460*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m8
; Output rows are w*2 bytes apart; advance r8 by two rows.
3461*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], m10
3462*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*2], m11
3463*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*4]
3464*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3465*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; Next column: 8 pixels right in src, 16 bytes right in tmp, h restored from
; the low byte of r6, column counter decremented.
3466*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3467*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3468*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
3469*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
3470*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
3471*c0909341SAndroid Build Coastguard Worker    RET
3472*c0909341SAndroid Build Coastguard Worker%endif ;ARCH_X86_64
; .hv - combined horizontal + vertical filtering (both mx and my have a
; fractional part). This block handles w <= 4; wider blocks go to .hv_w8.
; Horizontal pass: 4 coefficient bytes, results scaled with pw_8192.
; Vertical pass: 6 sign-extended word coefficients, pd_32 added, then >> 6.
3473*c0909341SAndroid Build Coastguard Worker.hv:
3474*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3475*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
3476*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
; Horizontal filter index = low field of mxd; load the 4 bytes at offsets
; 2-5 of the selected 8-byte filter row into m1.
3477*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3478*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
3479*c0909341SAndroid Build Coastguard Worker%else
3480*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
3481*c0909341SAndroid Build Coastguard Worker%endif
3482*c0909341SAndroid Build Coastguard Worker    dec                srcq
3483*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base_reg-prep_ssse3+subpel_filters+2+mxq*8]
; Vertical filter index: top field of myd, or its low field when h < 6.
; mxd is reused as scratch for the low field.
3484*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3485*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
3486*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
3487*c0909341SAndroid Build Coastguard Worker%else
3488*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3489*c0909341SAndroid Build Coastguard Worker%endif
3490*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3491*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3492*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
3493*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
; x86-32 has only m0-m7, so m8-m10 and m13 become stack slots and m11, m12,
; m14 become aliases for constants in memory.
; NOTE(review): regs_used is overridden around ALLOC_STACK; the reason lies
; in x86inc and is not visible here.
3494*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3495*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssmp
3496*c0909341SAndroid Build Coastguard Worker%define regs_used 6
3497*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   -mmsize*4
3498*c0909341SAndroid Build Coastguard Worker%define regs_used 7
3499*c0909341SAndroid Build Coastguard Worker    %define              m8  [rsp+mmsize*0]
3500*c0909341SAndroid Build Coastguard Worker    %define              m9  [rsp+mmsize*1]
3501*c0909341SAndroid Build Coastguard Worker    %define             m10  [rsp+mmsize*2]
; Sign-extend the vertical coefficients to words, then broadcast the word
; pairs: m8 = coefs 0-1, m9 = coefs 2-3, m10 = coefs 4-5 (of those loaded).
; srcq is moved up two rows along the way.
3502*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3503*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3504*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3505*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3506*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
3507*c0909341SAndroid Build Coastguard Worker    mova                 m8, m2
3508*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
3509*c0909341SAndroid Build Coastguard Worker    mova                 m9, m2
3510*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q2222
3511*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
3512*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]
3513*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ssq*1]
3514*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3515*c0909341SAndroid Build Coastguard Worker    %define             m11  [base+pw_8192]
3516*c0909341SAndroid Build Coastguard Worker    %define             m12  [base+subpel_h_shufA]
3517*c0909341SAndroid Build Coastguard Worker    %define             m13  [rsp+mmsize*3]
3518*c0909341SAndroid Build Coastguard Worker    %define             m14  [base+pd_32]
; m13 = the 4 horizontal coefficient bytes replicated in every dword.
3519*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q0000
3520*c0909341SAndroid Build Coastguard Worker    mova                m13, m1
3521*c0909341SAndroid Build Coastguard Worker%else
; x86-64: same setup using real registers m8-m14; the two rows above srcq
; are reached through nsq = -ssq.
3522*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      15
3523*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
3524*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
3525*c0909341SAndroid Build Coastguard Worker    neg                 nsq
3526*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8 ; sign-extend
3527*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
3528*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
3529*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
3530*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+nsq*2]
3531*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+nsq*1]
3532*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m1, q0000
3533*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+subpel_h_shufA]
3534*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pw_8192]
3535*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+pd_32]
3536*c0909341SAndroid Build Coastguard Worker%endif
; Rows 0 and 1 are in m3/m4; load rows 2-4 into m0/m1/m2.
3537*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
3538*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1]
3539*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3540*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*0]
; Horizontal pass over the five priming rows. On x86-32 the memory-aliased
; constants are first copied into m5/m6 so they can be used as operands.
3541*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3542*c0909341SAndroid Build Coastguard Worker    mova                 m5, m12
3543*c0909341SAndroid Build Coastguard Worker    mova                 m6, m13
3544*c0909341SAndroid Build Coastguard Worker    REPX {pshufb    x, m5 }, m3, m4, m0, m1, m2
3545*c0909341SAndroid Build Coastguard Worker    mova                 m5, m11
3546*c0909341SAndroid Build Coastguard Worker    REPX {pmaddubsw x, m6 }, m3, m4, m0, m1, m2
3547*c0909341SAndroid Build Coastguard Worker%else
3548*c0909341SAndroid Build Coastguard Worker    REPX {pshufb    x, m12}, m3, m4, m0, m1, m2
3549*c0909341SAndroid Build Coastguard Worker    REPX {pmaddubsw x, m13}, m3, m4, m0, m1, m2
3550*c0909341SAndroid Build Coastguard Worker%endif
; phaddw finishes the 4-tap sums and packs two rows into one register.
3551*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0      ; 0 2
3552*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m1      ; 1 3
3553*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2      ; 2 4
3554*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3555*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m5 }, m3, m4, m0
3556*c0909341SAndroid Build Coastguard Worker%else
3557*c0909341SAndroid Build Coastguard Worker    REPX  {pmulhrsw x, m11}, m3, m4, m0
3558*c0909341SAndroid Build Coastguard Worker%endif
; Interleave adjacent rows into word pairs for pmaddwd.
3559*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4  ; 01
3560*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4      ; 23
3561*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m0  ; 12
3562*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m0      ; 34
; Each iteration filters two new rows horizontally (m7, m6) and produces two
; output rows: a in m5, b in m6. m0 carries the newest rows to the next pass.
3563*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
3564*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+ssq*1]
3565*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3566*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*0]
3567*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m12
3568*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m12
3569*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m13
3570*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m13
3571*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m8, m1  ; a0
3572*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
3573*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m6      ; 5 6
3574*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m8, m2  ; b0
3575*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
3576*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m9      ; a1
3577*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m9      ; b1
3578*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m11
3579*c0909341SAndroid Build Coastguard Worker    paddd                m5, m14
3580*c0909341SAndroid Build Coastguard Worker    paddd                m6, m14
3581*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
3582*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4
3583*c0909341SAndroid Build Coastguard Worker    shufpd               m4, m0, m7, 0x01 ; 4 5
3584*c0909341SAndroid Build Coastguard Worker    mova                 m0, m7
3585*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m7  ; 45
3586*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m7      ; 56
3587*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m3 ; a2
3588*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
3589*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m10, m4 ; b2
3590*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
; Scale the dword sums down by 6 bits and pack both rows into 8 words.
3591*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
3592*c0909341SAndroid Build Coastguard Worker    psrad                m6, 6
3593*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m6
3594*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m5
3595*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3596*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3597*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
3598*c0909341SAndroid Build Coastguard Worker    RET
3599*c0909341SAndroid Build Coastguard Worker.hv_w8:
3600*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
3601*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
3602*c0909341SAndroid Build Coastguard Worker    sub                srcq, 2
3603*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg-prep_ssse3+subpel_filters+1+mxq*8]
3604*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3605*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
3606*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
3607*c0909341SAndroid Build Coastguard Worker%else
3608*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
3609*c0909341SAndroid Build Coastguard Worker%endif
3610*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
3611*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3612*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
3613*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg-prep_ssse3+subpel_filters+1+myq*8]
3614*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3615*c0909341SAndroid Build Coastguard Worker    mov                 ssq, ssm
3616*c0909341SAndroid Build Coastguard Worker%assign regs_used 6
3617*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*16
3618*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
3619*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3620*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
3621*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT < 16
3622*c0909341SAndroid Build Coastguard Worker    %define            srcm  [esp+mmsize*15+gprsize*0]
3623*c0909341SAndroid Build Coastguard Worker    %define            tmpm  [esp+mmsize*15+gprsize*1]
3624*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
3625*c0909341SAndroid Build Coastguard Worker%endif
3626*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3627*c0909341SAndroid Build Coastguard Worker%else
3628*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK        16*6, 16
3629*c0909341SAndroid Build Coastguard Worker    mov                 nsq, ssq
3630*c0909341SAndroid Build Coastguard Worker    neg                 nsq
3631*c0909341SAndroid Build Coastguard Worker%endif
3632*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
3633*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq-8]
3634*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m0
3635*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 13
3636*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
3637*c0909341SAndroid Build Coastguard Worker    add                 r5d, hd
3638*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8 ; sign-extend
3639*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q0000
3640*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*0], m2
3641*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q1111
3642*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*1], m2
3643*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m0, q2222
3644*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*2], m0
3645*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q0000
3646*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*3], m2
3647*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m1, q1111
3648*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*4], m2
3649*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q2222
3650*c0909341SAndroid Build Coastguard Worker    mova         [rsp+16*5], m1
; PREP_HV_H_6TAP: horizontal 6-tap pass used by the .hv_w8 path.
;   %1      in: source bytes, out: filtered words scaled by m7
;   %2, %3  scratch registers (clobbered)
;   %4, %5  shuffle masks (default: subpel_h_shufD / subpel_h_shufF)
;   %6-%8   coefficient-pair multipliers (default: [rsp+16*0..2])
; Only two pshufb are issued; the operand for the middle coefficient pair is
; assembled from the two shuffled registers with shufps.
; Expects m7 to hold the rounding multiplier (pw_8192 is loaded into m7
; before the first use of this macro).
3651*c0909341SAndroid Build Coastguard Worker%macro PREP_HV_H_6TAP 3-8 [base+subpel_h_shufD], [base+subpel_h_shufF], \
3652*c0909341SAndroid Build Coastguard Worker                          [rsp+16*0], [rsp+16*1], [rsp+16*2] ; src/dst, tmp[1-2], shuf[1-2], mul[1-3]
3653*c0909341SAndroid Build Coastguard Worker    pshufb               %2, %1, %4
3654*c0909341SAndroid Build Coastguard Worker    pshufb               %1, %5
3655*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %3, %2, %6
3656*c0909341SAndroid Build Coastguard Worker    shufps               %2, %1, q2121
3657*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, %8
3658*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %2, %7
3659*c0909341SAndroid Build Coastguard Worker    paddw                %1, %3
3660*c0909341SAndroid Build Coastguard Worker    paddw                %1, %2
3661*c0909341SAndroid Build Coastguard Worker    pmulhrsw             %1, m7
3662*c0909341SAndroid Build Coastguard Worker%endmacro
; .hv_w8_loop0: outer loop of the 8-pixel-wide horizontal+vertical path. Each
; pass handles one 8-column strip. r5d is the combined counter built by the
; setup code just above as ((w - 8) << 13) + h: the number of additional
; strips sits in bits 16 and up, the row count h in the low bits.
; Per strip, five source rows are filtered horizontally up front and kept as
; word-interleaved pairs of adjacent rows (01, 12, 23, 34; low and high
; halves). The inner loop filters two more rows per iteration and produces two
; output rows ("a" and "b") using the three vertical coefficient pairs stored
; at [rsp+16*3], [rsp+16*4] and [rsp+16*5]. Each 32-bit sum gets pd_32 added
; and is shifted right by 6 before being packed to 16 bits.
3663*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
3664*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+subpel_h_shufD] ; shuf1 for PREP_HV_H_6TAP
3665*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+subpel_h_shufF] ; shuf2 for PREP_HV_H_6TAP
3666*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*0] ; first horizontal coefficient pair
3667*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: only m0-m7 are used, so the row pairs live on the stack:
;   [rsp+16* 6/ 7] = 01 lo/hi    [rsp+16* 8/ 9] = 12 lo/hi
;   [rsp+16*10/11] = 23 lo/hi    [rsp+16*12/13] = 34 lo/hi
;   [rsp+16*14]    = most recent horizontally filtered row
3668*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0] ; row 0
3669*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1] ; row 1
3670*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3671*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m0, m5, m6, m2, m3, m4
3672*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
3673*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0] ; row 2
3674*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m1   ; 01
3675*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
3676*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 6], m6
3677*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 7], m0
3678*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m5, m0, m6, m2, m3, m4
3679*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*1] ; row 3
3680*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3681*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m1, m5   ; 12
3682*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m5
3683*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m6
3684*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m1
3685*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m0, m1, m6, m2, m3, m4
3686*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0] ; row 4
3687*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m5, m0   ; 23
3688*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m0
3689*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m6
3690*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m5
3691*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m1, m5, m6, m2, m3, m4
3692*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m1 ; keep filtered row 4 for the next pairing
3693*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m1   ; 34
3694*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1
3695*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m6
3696*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m0
; Inner loop: two output rows per iteration. While the second and third tap
; pairs are applied, the stack window is slid down by two rows (23 -> 01 slot,
; 34 -> 12 slot, new 45 -> 23 slot, new 56 -> 34 slot).
3697*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3698*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16* 3] ; first vertical coefficient pair
3699*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m3, [rsp+16* 6] ; a0
3700*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3, [rsp+16* 7] ; a0'
3701*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m3, [rsp+16* 8] ; b0
3702*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [rsp+16* 9]     ; b0'
3703*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+16* 4] ; second vertical coefficient pair
3704*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*10]
3705*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*11]
3706*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 6], m4
3707*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6       ; a1
3708*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 7], m5
3709*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6       ; a1'
3710*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3711*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*12]
3712*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
3713*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*13]
3714*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 8], m4
3715*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m6       ; b1
3716*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16* 9], m5
3717*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m6       ; b1'
3718*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*1] ; next source row
3719*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
3720*c0909341SAndroid Build Coastguard Worker    paddd                m1, m4
3721*c0909341SAndroid Build Coastguard Worker    paddd                m3, m5
; Macro defaults are used here: shuffles and multipliers are read from memory
; because m2-m4 are busy holding partial sums.
3722*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m6, m4, m5
3723*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+pd_32] ; rounding term for the >> 6 below
3724*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*14] ; previously filtered row
3725*c0909341SAndroid Build Coastguard Worker    REPX      {paddd x, m4}, m0, m2, m1, m3
3726*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6   ; 45
3727*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6
3728*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*10], m4
3729*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*11], m5
3730*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [rsp+16*5] ; a2
3731*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [rsp+16*5] ; a2'
3732*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
3733*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0] ; second new source row
3734*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
3735*c0909341SAndroid Build Coastguard Worker    psrad                m0, 6
3736*c0909341SAndroid Build Coastguard Worker    psrad                m2, 6
3737*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; output row a, 8 words
3738*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m4, m2, m5
3739*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+16*5] ; third vertical coefficient pair
3740*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m4   ; 56
3741*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*14], m4 ; becomes the "previous row" of the next iteration
3742*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m4
3743*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*12], m5
3744*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2       ; b2
3745*c0909341SAndroid Build Coastguard Worker    mova        [rsp+16*13], m6
3746*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2       ; b2'
3747*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
3748*c0909341SAndroid Build Coastguard Worker    paddd                m3, m6
3749*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
3750*c0909341SAndroid Build Coastguard Worker    psrad                m3, 6
3751*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3 ; output row b, 8 words
3752*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*0], m0
3753*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+wq*2], m1 ; output rows are wq*2 bytes apart
3754*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
3755*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3756*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
; Strip done: reload the strip base pointers from srcm/tmpm (presumably saved
; by code outside this chunk - confirm), step 8 source bytes / 16 output bytes
; to the right, store them back and restore h from the low word of r5d.
3757*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
3758*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpm
3759*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5w
3760*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3761*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3762*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3763*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
3764*c0909341SAndroid Build Coastguard Worker%else
; x86-64: the row pairs stay in registers:
;   m8/m9 = 01 lo/hi, m10/m11 = 12, m12/m13 = 23, m14/m15 = 34,
;   m6 = most recent horizontally filtered row.
; r6 walks the source rows and r8 walks the output; srcq/tmpq keep the strip
; base. NOTE(review): nsq is defined outside this chunk - presumably the
; negated source stride, so nsq*2 / nsq*1 address the two rows above srcq.
3765*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+nsq*2]
3766*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+nsq*1]
3767*c0909341SAndroid Build Coastguard Worker    lea                  r6, [srcq+ssq*2]
3768*c0909341SAndroid Build Coastguard Worker    movu                m13, [srcq+ssq*0]
3769*c0909341SAndroid Build Coastguard Worker    movu                m15, [srcq+ssq*1]
3770*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
3771*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r6  +ssq*0]
3772*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+16*1] ; second horizontal coefficient pair
3773*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+16*2] ; third horizontal coefficient pair
3774*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m9, m0, m1, m2, m3, m4, m5, m8
3775*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP      m11, m0, m1, m2, m3, m4, m5, m8
3776*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP      m13, m0, m1, m2, m3, m4, m5, m8
3777*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP      m15, m0, m1, m2, m3, m4, m5, m8
3778*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m6, m0, m1, m2, m3, m4, m5, m8
3779*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m9, m11  ; 01
3780*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m11
3781*c0909341SAndroid Build Coastguard Worker    punpcklwd           m10, m11, m13 ; 12
3782*c0909341SAndroid Build Coastguard Worker    punpckhwd           m11, m13
3783*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m13, m15 ; 23
3784*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m15
3785*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m6  ; 34
3786*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m6
; Inner loop: two output rows per iteration; m8-m11 take over the old 23/34
; pairs while m12-m15 are consumed and then rebuilt as the new 45/56 pairs.
3787*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
3788*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+16*3] ; first vertical coefficient pair
3789*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*4] ; second vertical coefficient pair
3790*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pd_32] ; rounding term for the >> 6 below
3791*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8, m3  ; a0
3792*c0909341SAndroid Build Coastguard Worker    mova                 m8, m12
3793*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m3  ; a0'
3794*c0909341SAndroid Build Coastguard Worker    mova                 m9, m13
3795*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m10, m3 ; b0
3796*c0909341SAndroid Build Coastguard Worker    mova                m10, m14
3797*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m11     ; b0'
3798*c0909341SAndroid Build Coastguard Worker    mova                m11, m15
3799*c0909341SAndroid Build Coastguard Worker    REPX    {pmaddwd x, m4}, m12, m13, m14, m15
3800*c0909341SAndroid Build Coastguard Worker    REPX    {paddd   x, m5}, m0, m2, m1, m3
3801*c0909341SAndroid Build Coastguard Worker    paddd                m0, m12
3802*c0909341SAndroid Build Coastguard Worker    paddd                m2, m13
3803*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
3804*c0909341SAndroid Build Coastguard Worker    paddd                m3, m15
3805*c0909341SAndroid Build Coastguard Worker    movu                m15, [r6+ssq*1] ; next source row
3806*c0909341SAndroid Build Coastguard Worker    lea                  r6, [r6+ssq*2]
3807*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP      m15, m4, m5
3808*c0909341SAndroid Build Coastguard Worker    punpcklwd           m12, m6, m15 ; new pair: previous row + new row, low half
3809*c0909341SAndroid Build Coastguard Worker    punpckhwd           m13, m6, m15 ; same pair, high half
3810*c0909341SAndroid Build Coastguard Worker    movu                 m6, [r6+ssq*0] ; second new source row
3811*c0909341SAndroid Build Coastguard Worker    PREP_HV_H_6TAP       m6, m4, m5
3812*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+16*5] ; third vertical coefficient pair
3813*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m15, m6
3814*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m6
3815*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m12, m4  ; a2
3816*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
3817*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m13, m4  ; a2'
3818*c0909341SAndroid Build Coastguard Worker    paddd                m2, m5
3819*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m14, m4  ; b2
3820*c0909341SAndroid Build Coastguard Worker    paddd                m1, m5
3821*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m15      ; b2'
3822*c0909341SAndroid Build Coastguard Worker    paddd                m3, m4
3823*c0909341SAndroid Build Coastguard Worker    REPX       {psrad x, 6}, m0, m2, m1, m3
3824*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m2 ; output row a, 8 words
3825*c0909341SAndroid Build Coastguard Worker    packssdw             m1, m3 ; output row b, 8 words
3826*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*0], m0
3827*c0909341SAndroid Build Coastguard Worker    mova          [r8+wq*2], m1 ; output rows are wq*2 bytes apart
3828*c0909341SAndroid Build Coastguard Worker    lea                  r8, [r8+wq*4]
3829*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3830*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop
; Strip done: step the base pointers 8 source bytes / 16 output bytes to the
; right and restore h from the low byte of r5d.
3831*c0909341SAndroid Build Coastguard Worker    add                srcq, 8
3832*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
3833*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b
3834*c0909341SAndroid Build Coastguard Worker%endif
3835*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<16 ; one strip consumed
3836*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
3837*c0909341SAndroid Build Coastguard Worker    RET
3838*c0909341SAndroid Build Coastguard Worker
; Entry points for the filter-type combinations that involve a SHARP filter.
; NOTE(review): PREP_8TAP_FN is defined outside this chunk. Presumably each
; instantiation sets up the per-direction filter-type offsets (t0d/t1d, which
; prep_8tap_8bpc adds to mx/my) and jumps to the function named by the last
; argument. The final instantiation omits that argument and is immediately
; followed by prep_8tap_8bpc, so it presumably falls through - confirm against
; the macro definition.
3839*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_8bpc
3840*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_8bpc
3841*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_8bpc
3842*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_8bpc
3843*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_FN sharp,          SHARP,   SHARP
3844*c0909341SAndroid Build Coastguard Worker
; prep_8tap_8bpc: common entry of the 8-tap prep variants (arguments: tmp,
; src, stride, w, h, mx, my). Packs each subpel fraction together with the
; filter-type offsets into one register, then dispatches:
;   mx fraction bits set           -> .h
;   only my fraction bits set      -> .v (falls through below)
;   neither fraction bit field set -> .prep inside prep_6tap_8bpc
; t0d/t1d are expected to be set before this point (presumably by the
; PREP_8TAP_FN entry stubs - confirm).
3845*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
3846*c0909341SAndroid Build Coastguard Worker    imul                mxd, mxm, 0x010101 ; replicate mx into the three low bytes
3847*c0909341SAndroid Build Coastguard Worker    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
3848*c0909341SAndroid Build Coastguard Worker    imul                myd, mym, 0x010101 ; replicate my into the three low bytes
3849*c0909341SAndroid Build Coastguard Worker    add                 myd, t1d ; 8tap_v, my, 4tap_v
3850*c0909341SAndroid Build Coastguard Worker    mov                  wd, wm
3851*c0909341SAndroid Build Coastguard Worker    movifnidn          srcd, srcm
3852*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
3853*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, prep_ssse3 ; base address for the constant/table accesses below
3854*c0909341SAndroid Build Coastguard Worker    test                mxd, 0xf00 ; bits 8-11: the plain mx copy per the layout above
3855*c0909341SAndroid Build Coastguard Worker    jnz .h
3856*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00 ; bits 8-11: the plain my copy per the layout above
3857*c0909341SAndroid Build Coastguard Worker    jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep
; .v: setup for vertical-only filtering. Selects the vertical filter (the
; 4-tap variant when h < 6, otherwise the 8-tap one), loads its 8 coefficient
; bytes and broadcasts them as four byte pairs subpel0..subpel3, then moves
; srcq three rows up so the filter window starts above the block.
3858*c0909341SAndroid Build Coastguard Worker.v:
3859*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3860*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
3861*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f ; low 7 bits of myd: the 4tap_v filter index
3862*c0909341SAndroid Build Coastguard Worker%else
3863*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      16
3864*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb ; low byte of myd: the 4tap_v filter index
3865*c0909341SAndroid Build Coastguard Worker%endif
3866*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16 ; remaining upper part: the 8tap_v filter index
3867*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
3868*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd ; h < 6: use the 4-tap index instead
3869*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3] ; 8 coefficient bytes
; NOTE(review): m2 is overwritten by the row loads in .v_w4 and .v_w8 before
; it is read on any path visible here, so this pw_512 load looks unused -
; confirm before removing.
3870*c0909341SAndroid Build Coastguard Worker    mova                 m2, [base+pw_512]
3871*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192] ; pmulhrsw multiplier: rounded shift right by 2
3872*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m0 ; duplicate each coefficient pair so pshufd can broadcast it
3873*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: the four broadcast coefficient pairs live in a 4*mmsize stack area.
3874*c0909341SAndroid Build Coastguard Worker %define            subpel0  [rsp+mmsize*0]
3875*c0909341SAndroid Build Coastguard Worker %define            subpel1  [rsp+mmsize*1]
3876*c0909341SAndroid Build Coastguard Worker %define            subpel2  [rsp+mmsize*2]
3877*c0909341SAndroid Build Coastguard Worker %define            subpel3  [rsp+mmsize*3]
3878*c0909341SAndroid Build Coastguard Worker%assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed
3879*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   -mmsize*4
3880*c0909341SAndroid Build Coastguard Worker%assign regs_used 7
; stride is the third function argument; fetch it from its stack slot.
3881*c0909341SAndroid Build Coastguard Worker    mov             strideq, [rstk+stack_offset+gprsize*3]
3882*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q0000
3883*c0909341SAndroid Build Coastguard Worker    mova            subpel0, m1
3884*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q1111
3885*c0909341SAndroid Build Coastguard Worker    mova            subpel1, m1
3886*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3]
3887*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q2222
3888*c0909341SAndroid Build Coastguard Worker    mova            subpel2, m1
3889*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m0, q3333
3890*c0909341SAndroid Build Coastguard Worker    mova            subpel3, m1
3891*c0909341SAndroid Build Coastguard Worker    sub                srcq, r5 ; start three rows above the block
3892*c0909341SAndroid Build Coastguard Worker%else
; x86-64: the four broadcast coefficient pairs stay in m8-m11.
3893*c0909341SAndroid Build Coastguard Worker %define            subpel0  m8
3894*c0909341SAndroid Build Coastguard Worker %define            subpel1  m9
3895*c0909341SAndroid Build Coastguard Worker %define            subpel2  m10
3896*c0909341SAndroid Build Coastguard Worker %define            subpel3  m11
3897*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m0, q0000
3898*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m0, q1111
3899*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
3900*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q2222
3901*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q3333
3902*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q ; start three rows above the block
3903*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 8
3904*c0909341SAndroid Build Coastguard Worker    jns .v_w8 ; w >= 8 uses the 8-column path; narrower blocks fall through to .v_w4
3905*c0909341SAndroid Build Coastguard Worker%endif
; .v_w4: vertical 8-tap filter over 4-pixel-wide columns, two output rows per
; inner-loop iteration. Seven source rows are preloaded and byte-interleaved
; as adjacent-row pairs, two pairs per register (01 12 / 23 34 / 45 56); each
; iteration adds rows 7 and 8 (67 78) and slides the window down by two rows.
; On x86-32 this path handles every width by looping over 4-column strips
; (.v_w4_loop0); on x86-64 blocks with w >= 8 branch to .v_w8 before reaching
; this label, so a single pass suffices.
3906*c0909341SAndroid Build Coastguard Worker.v_w4:
3907*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3908*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < mmsize
3909*c0909341SAndroid Build Coastguard Worker  %define srcm [esp+stack_size+gprsize*1]
3910*c0909341SAndroid Build Coastguard Worker  %define tmpm [esp+stack_size+gprsize*2]
3911*c0909341SAndroid Build Coastguard Worker %endif
3912*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq ; remember the strip base pointers
3913*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3914*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq - 4] ; horizontal loop
3915*c0909341SAndroid Build Coastguard Worker    shl                 r5d, (16 - 2)  ; ((w - 4) / 4) << 16: additional 4-column strips in the high word
3916*c0909341SAndroid Build Coastguard Worker    mov                 r5w, hw ; low word keeps h so it can be restored per strip
3917*c0909341SAndroid Build Coastguard Worker.v_w4_loop0:
3918*c0909341SAndroid Build Coastguard Worker%endif
3919*c0909341SAndroid Build Coastguard Worker    movd                 m1, [srcq+strideq*0] ; row 0
3920*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*1] ; row 1
3921*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
3922*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3923*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+strideq*0] ; row 2
3924*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+strideq*1] ; row 3
3925*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3926*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+strideq*0] ; row 4
3927*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+strideq*1] ; row 5
3928*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3929*c0909341SAndroid Build Coastguard Worker%else
3930*c0909341SAndroid Build Coastguard Worker    movd                 m2, [srcq+strideq*2] ; row 2
3931*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
3932*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+strideq*0] ; row 3
3933*c0909341SAndroid Build Coastguard Worker    movd                 m3, [srcq+strideq*1] ; row 4
3934*c0909341SAndroid Build Coastguard Worker    movd                 m5, [srcq+strideq*2] ; row 5
3935*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
3936*c0909341SAndroid Build Coastguard Worker%endif
3937*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m0 ; 0 1
3938*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2 ; 1 2
3939*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m0 ; 01 12
3940*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*0] ; row 6 (srcq now points at row 6 on both arches)
3941*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4 ; 2 3
3942*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m3 ; 3 4
3943*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m5 ; 4 5
3944*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m0 ; 5 6
3945*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4 ; 23 34
3946*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m5 ; 45 56
; Inner loop. On entry m1/m2/m3 hold the three oldest pair registers and m0
; the newest row; on exit each has moved down by two rows.
3947*c0909341SAndroid Build Coastguard Worker.v_w4_loop:
3948*c0909341SAndroid Build Coastguard Worker    mova                 m5, m1
3949*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, subpel0      ; a0 b0
3950*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
3951*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, subpel1      ; a1 b1
3952*c0909341SAndroid Build Coastguard Worker    paddw                m5, m2
3953*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
3954*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpel2      ; a2 b2
3955*c0909341SAndroid Build Coastguard Worker    movd                 m4, [srcq+strideq*1] ; row 7
3956*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
3957*c0909341SAndroid Build Coastguard Worker    paddw                m5, m3
3958*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m0, m4       ; 6 7 _ _
3959*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+strideq*0] ; row 8, the next iteration's newest row
3960*c0909341SAndroid Build Coastguard Worker    punpckldq            m4, m0           ; 7 8 _ _
3961*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4           ; 67 78
3962*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
3963*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, subpel3      ; a3 b3
3964*c0909341SAndroid Build Coastguard Worker    paddw                m5, m4
3965*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m7 ; m7 = pw_8192 (loaded in .v): rounded shift right by 2
3966*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+wq*0], m5 ; low half: output row a
3967*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+wq*2], m5 ; high half: output row b, wq*2 bytes further on
3968*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
3969*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
3970*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop
3971*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Strip done: reload the strip base pointers, step 4 source bytes / 8 output
; bytes to the right, store them back and restore h from the low word of r5d.
3972*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
3973*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpm
3974*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5w
3975*c0909341SAndroid Build Coastguard Worker    add                srcq, 4
3976*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
3977*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
3978*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
3979*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<16 ; horizontal--
3980*c0909341SAndroid Build Coastguard Worker    jg .v_w4_loop0
3981*c0909341SAndroid Build Coastguard Worker%endif
3982*c0909341SAndroid Build Coastguard Worker    RET
; .v_w8 (x86-64 only): vertical 8-tap filter over 8-pixel-wide column strips,
; two output rows per inner-loop iteration.
;   r6d = h + ((w - 8) << 5): h in the low byte, additional strips from bit 8 up
;   r5 / r8 = source / output base pointer of the current strip
;   m1-m6   = byte-interleaved adjacent-row pairs 01 12 23 34 45 56
;   m0      = newest source row
3983*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
3984*c0909341SAndroid Build Coastguard Worker.v_w8:
3985*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [wq*8-64] ; (w - 8) * 8
3986*c0909341SAndroid Build Coastguard Worker    mov                  r5, srcq
3987*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
3988*c0909341SAndroid Build Coastguard Worker    lea                 r6d, [hq+r6*4] ; h + (w - 8) * 32
3989*c0909341SAndroid Build Coastguard Worker.v_w8_loop0:
3990*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+strideq*0] ; row 0
3991*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+strideq*1] ; row 1
3992*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+strideq*2] ; row 2
3993*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
3994*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*0] ; row 3
3995*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+strideq*1] ; row 4
3996*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+strideq*2] ; row 5
3997*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
3998*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0] ; row 6
3999*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2 ; 01
4000*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3 ; 12
4001*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m4 ; 23
4002*c0909341SAndroid Build Coastguard Worker    punpcklbw            m4, m5 ; 34
4003*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m6 ; 45
4004*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m0 ; 56
; Inner loop: m14 accumulates output row a, m15 output row b. The pair
; registers are copied down by two rows as each one is consumed.
4005*c0909341SAndroid Build Coastguard Worker.v_w8_loop:
4006*c0909341SAndroid Build Coastguard Worker    movq                m13, [srcq+strideq*1] ; row 7
4007*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4008*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m14, m1, subpel0 ; a0
4009*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m2, subpel0 ; b0
4010*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
4011*c0909341SAndroid Build Coastguard Worker    mova                 m2, m4
4012*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpel1 ; a1
4013*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, subpel1 ; b1
4014*c0909341SAndroid Build Coastguard Worker    paddw               m14, m3
4015*c0909341SAndroid Build Coastguard Worker    paddw               m15, m4
4016*c0909341SAndroid Build Coastguard Worker    mova                 m3, m5
4017*c0909341SAndroid Build Coastguard Worker    mova                 m4, m6
4018*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, subpel2 ; a2
4019*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, subpel2 ; b2
4020*c0909341SAndroid Build Coastguard Worker    punpcklbw           m12, m0, m13 ; 67
4021*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+strideq*0] ; row 8, the next iteration's newest row
4022*c0909341SAndroid Build Coastguard Worker    punpcklbw           m13, m0      ; 78
4023*c0909341SAndroid Build Coastguard Worker    paddw               m14, m5
4024*c0909341SAndroid Build Coastguard Worker    mova                 m5, m12
4025*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m12, subpel3 ; a3
4026*c0909341SAndroid Build Coastguard Worker    paddw               m15, m6
4027*c0909341SAndroid Build Coastguard Worker    mova                 m6, m13
4028*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m13, subpel3 ; b3
4029*c0909341SAndroid Build Coastguard Worker    paddw               m14, m12
4030*c0909341SAndroid Build Coastguard Worker    paddw               m15, m13
4031*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m7 ; m7 = pw_8192 (loaded in .v): rounded shift right by 2
4032*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m15, m7
4033*c0909341SAndroid Build Coastguard Worker    movu        [tmpq+wq*0], m14 ; output row a
4034*c0909341SAndroid Build Coastguard Worker    movu        [tmpq+wq*2], m15 ; output row b, wq*2 bytes further on
4035*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
4036*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4037*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop
; Strip done: step the base pointers 8 source bytes / 16 output bytes to the
; right, restore h from the low byte of r6d and count the strip off.
4038*c0909341SAndroid Build Coastguard Worker    add                  r5, 8
4039*c0909341SAndroid Build Coastguard Worker    add                  r8, 16
4040*c0909341SAndroid Build Coastguard Worker    movzx                hd, r6b
4041*c0909341SAndroid Build Coastguard Worker    mov                srcq, r5
4042*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r8
4043*c0909341SAndroid Build Coastguard Worker    sub                 r6d, 1<<8
4044*c0909341SAndroid Build Coastguard Worker    jg .v_w8_loop0
4045*c0909341SAndroid Build Coastguard Worker    RET
4046*c0909341SAndroid Build Coastguard Worker%endif ;ARCH_X86_64
4047*c0909341SAndroid Build Coastguard Worker%undef subpel0
4048*c0909341SAndroid Build Coastguard Worker%undef subpel1
4049*c0909341SAndroid Build Coastguard Worker%undef subpel2
4050*c0909341SAndroid Build Coastguard Worker%undef subpel3
; .h_w4: horizontal-only filtering for w == 4 (reached from .h via `cmp wd, 4 / je`).
; Each loop iteration reads four source rows (srcq + {0,1,2,3}*stride) and
; writes 32 bytes of 16-bit intermediates to tmpq (two aligned 16-byte stores).
; Register roles inside this block:
;   m4 = 4 filter bytes loaded from offset +2 of the 8-byte filter entry,
;        broadcast to all dwords
;   m5 = subpel_h_shufA byte-shuffle mask
;   m6 = pw_8192 rounding multiplier
;   r2 = stride, r3 = stride*3
4051*c0909341SAndroid Build Coastguard Worker.h_w4:
4052*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM       7
    ; Reduce mx to the table index: low 7 bits on x86-32, low byte on x86-64.
4053*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4054*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
4055*c0909341SAndroid Build Coastguard Worker%else
4056*c0909341SAndroid Build Coastguard Worker    movzx               mxd, mxb
4057*c0909341SAndroid Build Coastguard Worker%endif
    ; Step the source pointer back one byte.
    ; NOTE(review): presumably this centres the 4-tap window on the output pixel - confirm.
4058*c0909341SAndroid Build Coastguard Worker    dec                srcq
    ; movd reads only 4 bytes, starting 2 bytes into the 8-byte filter entry.
4059*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
4060*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+subpel_h_shufA]
4061*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pw_8192]
4062*c0909341SAndroid Build Coastguard Worker    movifnidn            r2, stridemp
4063*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q0000
4064*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r2*3]
4065*c0909341SAndroid Build Coastguard Worker.h_w4_loop:
    ; Load 8 source bytes from each of four consecutive rows.
4066*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+r2*0]
4067*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+r2*1]
4068*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+r2*2]
4069*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+r3  ]
4070*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+r2*4]
    ; Rearrange the bytes with the shuffle mask, then multiply-add adjacent
    ; byte pairs against the filter coefficients.
4071*c0909341SAndroid Build Coastguard Worker    REPX  {pshufb    x, m5}, m0, m1, m2, m3
4072*c0909341SAndroid Build Coastguard Worker    REPX  {pmaddubsw x, m4}, m0, m1, m2, m3
    ; Horizontal adds finish each tap sum and pack two rows per register.
4073*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
4074*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
    ; Rounding multiply-high by pw_8192.
    ; NOTE(review): if the constant is 8192 per word this equals (x + 2) >> 2 - confirm.
4075*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6
4076*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m6
4077*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
4078*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m2
4079*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
    ; Four rows consumed per iteration.
4080*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
4081*c0909341SAndroid Build Coastguard Worker    jg .h_w4_loop
4082*c0909341SAndroid Build Coastguard Worker    RET
; .h: entry point of the horizontal filtering path.
;   - If any of bits 8..11 of my are set, control goes to .hv instead.
;   - w == 4 is handled by .h_w4.
;   - Otherwise the shared state below is set up and control jumps through a
;     table indexed by tzcnt(w) to one of the .h_w* labels.
; State left for the jump targets:
;   m5  = first 4 filter bytes, broadcast to every dword
;   m6  = last 4 filter bytes, broadcast to every dword
;   m7  = pw_8192 rounding multiplier
;   m9/m10/m11 = subpel_h_shufC/A/B (registers on x86-64; on x86-32 the names
;                are redefined as memory operands)
;   srcq has been moved back by 3 bytes.
4083*c0909341SAndroid Build Coastguard Worker.h:
4084*c0909341SAndroid Build Coastguard Worker    test                myd, 0xf00
4085*c0909341SAndroid Build Coastguard Worker    jnz .hv
4086*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
4087*c0909341SAndroid Build Coastguard Worker    je .h_w4
4088*c0909341SAndroid Build Coastguard Worker    WIN64_SPILL_XMM      12
    ; On x86-32 the stride is reloaded from its memory slot into r6.
4089*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4090*c0909341SAndroid Build Coastguard Worker %define strideq r6
4091*c0909341SAndroid Build Coastguard Worker    mov             strideq, stridem
4092*c0909341SAndroid Build Coastguard Worker%endif
    ; wd becomes the index of the lowest set bit of w; used below to index the jump table.
4093*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wd
4094*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4095*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+subpel_h_shufA]
4096*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+subpel_h_shufB]
4097*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufC]
4098*c0909341SAndroid Build Coastguard Worker%else
    ; No register is loaded here on x86-32: the names alias the constants in memory.
4099*c0909341SAndroid Build Coastguard Worker    %define             m10  [base+subpel_h_shufA]
4100*c0909341SAndroid Build Coastguard Worker    %define             m11  [base+subpel_h_shufB]
4101*c0909341SAndroid Build Coastguard Worker    %define              m9  [base+subpel_h_shufC]
4102*c0909341SAndroid Build Coastguard Worker%endif
    ; The filter index for this path is taken from the upper 16 bits of mx.
4103*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
4104*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
    ; Fetch the 16-bit table entry for this width (added to base_reg below).
4105*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
    ; Load all 8 filter bytes, then split them into two broadcast halves.
4106*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3]
4107*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
4108*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m6, q0000
4109*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1111
    ; Table entry + base_reg = absolute address of the width-specific label.
4110*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
4111*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; PREP_8TAP_H dst, src_memloc
; Loads 16 unaligned bytes from [src_memloc] and leaves eight filtered,
; rounded 16-bit values in m<dst>.
;   %1 = number of the destination xmm register
;   %2 = address expression of the source bytes (used inside [])
; Inputs expected in registers (set up by the caller):
;   m5 / m6  = first / last four filter bytes, broadcast per dword
;   m7       = rounding multiplier for pmulhrsw
;   m9, m10, m11 = shuffle masks subpel_h_shufC, _shufA, _shufB
; Clobbers: m2, m3, m4.
4112*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_H 2 ; dst, src_memloc
4113*c0909341SAndroid Build Coastguard Worker    movu                m%1, [%2]
    ; Three shuffled views (B, C, A) of the same 16 source bytes.
4114*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m%1, m11 ; subpel_h_shufB
4115*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m%1, m9  ; subpel_h_shufC
4116*c0909341SAndroid Build Coastguard Worker    pshufb              m%1, m10      ; subpel_h_shufA
    ; View B is needed with both coefficient halves, so keep a copy in m4.
4117*c0909341SAndroid Build Coastguard Worker    mova                 m4, m2
4118*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m5       ; subpel +0 B0
4119*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m6       ; subpel +4 B4
4120*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m6       ; subpel +4 C4
4121*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m5       ; subpel +0 A0
    ; m3 = C4 + B0, m%1 = A0 + B4; phaddw then folds adjacent words of both.
4122*c0909341SAndroid Build Coastguard Worker    paddw                m3, m4
4123*c0909341SAndroid Build Coastguard Worker    paddw               m%1, m2
4124*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m3
    ; Rounding multiply-high by m7 scales the sums down.
4125*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m7
4126*c0909341SAndroid Build Coastguard Worker%endmacro
; .h_w8: horizontal-only loop that filters two source rows per iteration
; with PREP_8TAP_H. Each iteration writes one 16-byte result per row
; (32 bytes total) to tmpq and advances srcq by two strides.
4127*c0909341SAndroid Build Coastguard Worker.h_w8:
4128*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           0, srcq+strideq*0
4129*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           1, srcq+strideq*1
4130*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
4131*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
4132*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4133*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
    ; Two rows consumed per iteration.
4134*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4135*c0909341SAndroid Build Coastguard Worker    jg .h_w8
4136*c0909341SAndroid Build Coastguard Worker    RET
; .h_w16 / .h_w32 / .h_w64 / .h_w128: horizontal-only entry points that differ
; only in the negative byte count placed in r3 (-16, -32, -64, -128). All of
; them share the loop at .h_start / .h_loop.
;   r3 = negative column offset, counts up towards 0 in steps of 16
;   r5 = saved copy of the initial (negative) r3, reloaded for each new row
4137*c0909341SAndroid Build Coastguard Worker.h_w16:
4138*c0909341SAndroid Build Coastguard Worker    mov                  r3, -16*1
4139*c0909341SAndroid Build Coastguard Worker    jmp .h_start
4140*c0909341SAndroid Build Coastguard Worker.h_w32:
4141*c0909341SAndroid Build Coastguard Worker    mov                  r3, -16*2
4142*c0909341SAndroid Build Coastguard Worker    jmp .h_start
4143*c0909341SAndroid Build Coastguard Worker.h_w64:
4144*c0909341SAndroid Build Coastguard Worker    mov                  r3, -16*4
4145*c0909341SAndroid Build Coastguard Worker    jmp .h_start
4146*c0909341SAndroid Build Coastguard Worker.h_w128:
4147*c0909341SAndroid Build Coastguard Worker    mov                  r3, -16*8
4148*c0909341SAndroid Build Coastguard Worker.h_start:
    ; r3 is negative, so this moves srcq forward; srcq+r3 then addresses the
    ; first column of the current row.
4149*c0909341SAndroid Build Coastguard Worker    sub                srcq, r3
4150*c0909341SAndroid Build Coastguard Worker    mov                  r5, r3
4151*c0909341SAndroid Build Coastguard Worker.h_loop:
    ; Two macro invocations at byte offsets 0 and 8 produce 16 outputs
    ; (two 16-byte stores) per iteration.
4152*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           0, srcq+r3+8*0
4153*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_H           1, srcq+r3+8*1
4154*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*0], m0
4155*c0909341SAndroid Build Coastguard Worker    mova        [tmpq+16*1], m1
4156*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
    ; Column loop: continue while the offset is still negative.
4157*c0909341SAndroid Build Coastguard Worker    add                  r3, 16
4158*c0909341SAndroid Build Coastguard Worker    jl .h_loop
    ; Row finished: step to the next source row and restart the column offset.
4159*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
4160*c0909341SAndroid Build Coastguard Worker    mov                  r3, r5
4161*c0909341SAndroid Build Coastguard Worker    dec                  hd
4162*c0909341SAndroid Build Coastguard Worker    jg .h_loop
4163*c0909341SAndroid Build Coastguard Worker    RET
; .hv: combined horizontal + vertical filtering. Widths above 4 branch to
; .hv_w8; the rest of this block is the setup for w <= 4, ending just before
; .hv_w4_loop.
; The setup
;   1. loads the horizontal coefficients (4 bytes at offset +2 of the filter
;      entry) and broadcasts them into m7,
;   2. picks the vertical filter, sign-extends its 8 bytes to words and
;      broadcasts the four coefficient pairs into subpelv0..subpelv3
;      (stack slots on x86-32, m10..m13 on x86-64),
;   3. moves srcq back by 3 rows and 1 byte,
;   4. horizontally filters source rows 0..6 twice - once with the shuffle
;      mask at subpel_h_shuf4 ("lower") and once with the mask at
;      subpel_h_shuf4+16 ("upper") - and arranges both results into the
;      interleaved row pairs the vertical pass consumes.
; The "upper" state is parked with SAVELINE_W4 (set 1); the "lower" state is
; left live in m0..m3 on entry to .hv_w4_loop.
; NOTE(review): SAVELINE_W4 / RESTORELINE_W4 are defined outside this view;
; they presumably store/load a register to the stack slot hv4_line_<set>_<n> - confirm.
4164*c0909341SAndroid Build Coastguard Worker.hv:
4165*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
4166*c0909341SAndroid Build Coastguard Worker    cmp                  wd, 4
4167*c0909341SAndroid Build Coastguard Worker    jg .hv_w8
    ; Horizontal filter: index = low 7 bits of mx, 4 bytes read at offset +2.
4168*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
4169*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
4170*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
    ; Vertical filter index: upper 16 bits of my, replaced by the low 7 bits
    ; of my when h - 6 is negative (cmovs after cmp hd, 6).
4171*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
4172*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4173*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
4174*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4175*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
4176*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
4177*c0909341SAndroid Build Coastguard Worker    mov             strideq, stridem
    ; NOTE(review): regs_used is set to 6 only for the ALLOC_STACK call and
    ; then raised to 7; the reason lives in x86inc - confirm before changing.
4178*c0909341SAndroid Build Coastguard Worker %assign regs_used 6
4179*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*14
4180*c0909341SAndroid Build Coastguard Worker %assign regs_used 7
    ; srcq -= 3*stride + 1
4181*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3+1]
4182*c0909341SAndroid Build Coastguard Worker    sub                srcq, r5
4183*c0909341SAndroid Build Coastguard Worker %define           subpelv0  [rsp+mmsize*0]
4184*c0909341SAndroid Build Coastguard Worker %define           subpelv1  [rsp+mmsize*1]
4185*c0909341SAndroid Build Coastguard Worker %define           subpelv2  [rsp+mmsize*2]
4186*c0909341SAndroid Build Coastguard Worker %define           subpelv3  [rsp+mmsize*3]
    ; Duplicate each byte into a word, then arithmetic-shift right by 8:
    ; this sign-extends the 8 filter bytes to 16-bit words.
4187*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4188*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
    ; Broadcast each coefficient pair and spill it to its stack slot.
4189*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q0000
4190*c0909341SAndroid Build Coastguard Worker    mova           subpelv0, m6
4191*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q1111
4192*c0909341SAndroid Build Coastguard Worker    mova           subpelv1, m6
4193*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q2222
4194*c0909341SAndroid Build Coastguard Worker    mova           subpelv2, m6
4195*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m0, q3333
4196*c0909341SAndroid Build Coastguard Worker    mova           subpelv3, m6
4197*c0909341SAndroid Build Coastguard Worker%else
    ; Same vertical filter selection as the x86-32 branch, using the low byte
    ; of my as the alternative index.
4198*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
4199*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4200*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4201*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
4202*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
4203*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK   mmsize*14, 14
    ; srcq -= 3*stride + 1
4204*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4205*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
4206*c0909341SAndroid Build Coastguard Worker    dec                srcq
4207*c0909341SAndroid Build Coastguard Worker %define           subpelv0  m10
4208*c0909341SAndroid Build Coastguard Worker %define           subpelv1  m11
4209*c0909341SAndroid Build Coastguard Worker %define           subpelv2  m12
4210*c0909341SAndroid Build Coastguard Worker %define           subpelv3  m13
    ; Sign-extend the filter bytes to words (see the x86-32 branch).
4211*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m0
4212*c0909341SAndroid Build Coastguard Worker    psraw                m0, 8
    ; Rounding constants stay in registers on x86-64 (w8192reg / d32reg below).
4213*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_8192]
4214*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_32]
4215*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m0, q0000
4216*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m0, q1111
4217*c0909341SAndroid Build Coastguard Worker    pshufd              m12, m0, q2222
4218*c0909341SAndroid Build Coastguard Worker    pshufd              m13, m0, q3333
4219*c0909341SAndroid Build Coastguard Worker%endif
    ; m7 = horizontal coefficients broadcast to every dword.
4220*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m1, q0000
    ; Slot numbers named hv4_line_<set>_<n>.
    ; NOTE(review): presumably stack slot indices used by SAVELINE_W4 / RESTORELINE_W4 - confirm.
4221*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_0 4
4222*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_1 5
4223*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_2 6
4224*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_3 7
4225*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_4 8
4226*c0909341SAndroid Build Coastguard Worker%define hv4_line_0_5 9
4227*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_0 10
4228*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_1 11
4229*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_2 12
4230*c0909341SAndroid Build Coastguard Worker%define hv4_line_1_3 13
    ; Rounding constants: memory operands on x86-32, m8 / m9 on x86-64.
4231*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4232*c0909341SAndroid Build Coastguard Worker    %define        w8192reg  [base+pw_8192]
4233*c0909341SAndroid Build Coastguard Worker    %define          d32reg  [base+pd_32]
4234*c0909341SAndroid Build Coastguard Worker%else
4235*c0909341SAndroid Build Coastguard Worker    %define        w8192reg  m8
4236*c0909341SAndroid Build Coastguard Worker    %define          d32reg  m9
4237*c0909341SAndroid Build Coastguard Worker%endif
    ; ---- rows 0..3, horizontal pass with the first shuffle mask ----
4238*c0909341SAndroid Build Coastguard Worker    ; lower shuffle 0 1 2 3 4
4239*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
4240*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
4241*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
    ; The x86-32 branch has no stride3q, so it advances srcq two rows at a time.
4242*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4243*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4244*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*0]   ; 2 _ _ _
4245*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+strideq*1]   ; 2 _ 3 _
4246*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4247*c0909341SAndroid Build Coastguard Worker%else
4248*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
4249*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
4250*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*4]
4251*c0909341SAndroid Build Coastguard Worker%endif
4252*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
4253*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
4254*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7                 ;H subpel_filters
4255*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7                 ;H subpel_filters
4256*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0
4257*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, w8192reg
    ; Park the "lower" rows 0..3 result; m2 is reused for the "upper" pass.
4258*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 0
    ; ---- rows 0..3 again, with the second shuffle mask (m5/m4 still hold the raw rows) ----
4259*c0909341SAndroid Build Coastguard Worker    ; upper shuffle 2 3 4 5 6
4260*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
4261*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m5, m6             ;H subpel_h_shuf4 0~1~
4262*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6             ;H subpel_h_shuf4 2~3~
4263*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m7                 ;H subpel_filters
4264*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7                 ;H subpel_filters
4265*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m0                 ;H 0 1 2 3
4266*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, w8192reg
    ; ---- rows 4..6, horizontal pass with the first shuffle mask ----
4267*c0909341SAndroid Build Coastguard Worker    ; lower shuffle
4268*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
4269*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
4270*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
    ; Both branches leave srcq pointing at row 7, the first row the loop loads.
4271*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4272*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4273*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*0]   ; 6 _ _ _
4274*c0909341SAndroid Build Coastguard Worker    add                srcq, strideq
4275*c0909341SAndroid Build Coastguard Worker%else
4276*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
4277*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
4278*c0909341SAndroid Build Coastguard Worker%endif
4279*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
4280*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
4281*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7                 ;H subpel_filters
4282*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7                 ;H subpel_filters
4283*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0                 ;H 4 5 6 7
4284*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, w8192reg
    ; Park the "lower" rows 4..6 result; m3 is reused for the "upper" pass.
4285*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 0
    ; ---- rows 4..6 again, with the second shuffle mask ----
4286*c0909341SAndroid Build Coastguard Worker    ; upper shuffle
4287*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
4288*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m5, m6             ;H subpel_h_shuf4 4~5~
4289*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m4, m6             ;H subpel_h_shuf4 6~6~
4290*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m7                 ;H subpel_filters
4291*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m7                 ;H subpel_filters
4292*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0                 ;H 4 5 6 7
4293*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, w8192reg
    ; Interleave neighbouring rows into word pairs for pmaddwd, using the
    ; "upper" results currently in m2 / m3, then park them as set 1.
4294*c0909341SAndroid Build Coastguard Worker    ;process high
4295*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4;V 1 2 3 4
4296*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4  ; V 01 12
4297*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4      ; V 23 34
4298*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121;V 5 6 5 6
4299*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0      ; V 45 56
4300*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 1
4301*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 1
4302*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 1
4303*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 1
    ; Same interleave for the "lower" results; these stay in m0..m3 for the
    ; first loop iteration.
4304*c0909341SAndroid Build Coastguard Worker    ;process low
4305*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 0
4306*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 0
4307*c0909341SAndroid Build Coastguard Worker    palignr              m4, m3, m2, 4;V 1 2 3 4
4308*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m2, m4  ; V 01 12
4309*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4      ; V 23 34
4310*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q2121;V 5 6 5 6
4311*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0      ; V 45 56
; .hv_w4_loop: main loop of the w <= 4 horizontal + vertical path.
; Each iteration consumes two new source rows and stores 16 bytes of 16-bit
; output to tmpq. The work is done twice per iteration:
;   "low"  - new rows shuffled with the mask at subpel_h_shuf4
;   "high" - new rows shuffled with the mask at subpel_h_shuf4+16
; Both halves use the same register roles:
;   m0 = most recent horizontally filtered rows (carried to the next iteration)
;   m1, m2, m3 = interleaved row pairs used as vertical taps 0..2
;   m5 = 32-bit vertical accumulator, m4 = scratch / tap 3 product
;   m7 = horizontal coefficients, subpelv0..3 = vertical coefficient pairs
; Since one register set serves both halves, the inactive half's m0..m3 are
; swapped through SAVELINE_W4 / RESTORELINE_W4 (set 0 = low, set 1 = high).
4312*c0909341SAndroid Build Coastguard Worker.hv_w4_loop:
4313*c0909341SAndroid Build Coastguard Worker    ;process low
    ; Accumulate taps 0..2 while rotating the row-pair history
    ; (m1 <- old m2, m2 <- old m3).
4314*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, subpelv0 ; V a0 b0
4315*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4316*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, subpelv1; V a1 b1
4317*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
4318*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4319*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv2; V a2 b2
4320*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
    ; Horizontally filter the two new rows.
4321*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4]
4322*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*0] ; 7
4323*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
4324*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
4325*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7               ; H subpel_filters
4326*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4               ; H                7878
4327*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, w8192reg
    ; Combine the previous newest row (from m0) with the new rows to form the
    ; last pair, and keep the new rows in m0 for the next iteration.
4328*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12       ;                  6787
4329*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4330*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
4331*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, subpelv3; a3 b3
    ; Add the rounding bias, then shift right by 6.
4332*c0909341SAndroid Build Coastguard Worker    paddd                m5, d32reg ; pd_32
4333*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
4334*c0909341SAndroid Build Coastguard Worker    psrad                m5, 6
    ; Park the low-half state and its finished result (slot 5) so the
    ; registers can be reused for the high half.
4335*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 0
4336*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 0
4337*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 0
4338*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 0
4339*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m5, 5, 0
4340*c0909341SAndroid Build Coastguard Worker    ;process high
4341*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m0, 0, 1
4342*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m1, 1, 1
4343*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 1
4344*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 1
    ; Same sequence as the low half; only the shuffle mask differs.
4345*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, subpelv0; V a0 b0
4346*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4347*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, subpelv1; V a1 b1
4348*c0909341SAndroid Build Coastguard Worker    paddd                m5, m2
4349*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
4350*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv2; V a2 b2
4351*c0909341SAndroid Build Coastguard Worker    paddd                m5, m3
4352*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shuf4+16]
4353*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+strideq*0] ; 7
4354*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
4355*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6               ; H subpel_h_shuf4 7~8~
4356*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m7               ; H subpel_filters
4357*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4               ; H                7878
4358*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, w8192reg
4359*c0909341SAndroid Build Coastguard Worker    palignr              m3, m4, m0, 12       ;                  6787
4360*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
4361*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4      ; 67 78
4362*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, subpelv3; a3 b3
4363*c0909341SAndroid Build Coastguard Worker    paddd                m5, d32reg ; pd_32
4364*c0909341SAndroid Build Coastguard Worker    paddd                m5, m4
    ; The high result goes to m4 so m5 can receive the parked low result.
4365*c0909341SAndroid Build Coastguard Worker    psrad                m4, m5, 6
4366*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m5, 5, 0
    ; Pack low (m5) and high (m4) dwords to signed words; pshufd q3120 swaps
    ; the two middle dwords of the packed register before the store.
4367*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m4
4368*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3120
4369*c0909341SAndroid Build Coastguard Worker    movu             [tmpq], m5
4370*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4371*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
    ; The flags set here are consumed by the jg at the end of the loop.
    ; NOTE(review): relies on SAVELINE_W4 / RESTORELINE_W4 expanding to
    ; flag-preserving moves - confirm against the macro definitions.
4372*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
    ; Swap state for the next iteration: park high, reload low.
4373*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m0, 0, 1
4374*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m1, 1, 1
4375*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m2, 2, 1
4376*c0909341SAndroid Build Coastguard Worker    SAVELINE_W4          m3, 3, 1
4377*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m0, 0, 0
4378*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m1, 1, 0
4379*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m2, 2, 0
4380*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W4       m3, 3, 0
4381*c0909341SAndroid Build Coastguard Worker    jg .hv_w4_loop
4382*c0909341SAndroid Build Coastguard Worker    RET
4383*c0909341SAndroid Build Coastguard Worker%undef subpelv0
4384*c0909341SAndroid Build Coastguard Worker%undef subpelv1
4385*c0909341SAndroid Build Coastguard Worker%undef subpelv2
4386*c0909341SAndroid Build Coastguard Worker%undef subpelv3
4387*c0909341SAndroid Build Coastguard Worker.hv_w8:
4388*c0909341SAndroid Build Coastguard Worker    RESET_STACK_STATE
4389*c0909341SAndroid Build Coastguard Worker%define hv8_line_1 0
4390*c0909341SAndroid Build Coastguard Worker%define hv8_line_2 1
4391*c0909341SAndroid Build Coastguard Worker%define hv8_line_3 2
4392*c0909341SAndroid Build Coastguard Worker%define hv8_line_4 3
4393*c0909341SAndroid Build Coastguard Worker%define hv8_line_6 4
4394*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 16
4395*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4396*c0909341SAndroid Build Coastguard Worker %define           subpelh0  [rsp+mmsize*5]
4397*c0909341SAndroid Build Coastguard Worker %define           subpelh1  [rsp+mmsize*6]
4398*c0909341SAndroid Build Coastguard Worker %define           subpelv0  [rsp+mmsize*7]
4399*c0909341SAndroid Build Coastguard Worker %define           subpelv1  [rsp+mmsize*8]
4400*c0909341SAndroid Build Coastguard Worker %define           subpelv2  [rsp+mmsize*9]
4401*c0909341SAndroid Build Coastguard Worker %define           subpelv3  [rsp+mmsize*10]
4402*c0909341SAndroid Build Coastguard Worker %define             accuv0  [rsp+mmsize*11]
4403*c0909341SAndroid Build Coastguard Worker %define             accuv1  [rsp+mmsize*12]
4404*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
4405*c0909341SAndroid Build Coastguard Worker    mov                 mxd, myd
4406*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4407*c0909341SAndroid Build Coastguard Worker    and                 mxd, 0x7f
4408*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4409*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
4410*c0909341SAndroid Build Coastguard Worker    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
4411*c0909341SAndroid Build Coastguard Worker    mov             strideq, stridem
4412*c0909341SAndroid Build Coastguard Worker %assign regs_used 6
4413*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK  -mmsize*14
4414*c0909341SAndroid Build Coastguard Worker %assign regs_used 7
4415*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < mmsize
4416*c0909341SAndroid Build Coastguard Worker  %define              tmpm  [rsp+mmsize*13+gprsize*1]
4417*c0909341SAndroid Build Coastguard Worker  %define              srcm  [rsp+mmsize*13+gprsize*2]
4418*c0909341SAndroid Build Coastguard Worker  %define           stridem  [rsp+mmsize*13+gprsize*3]
4419*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
4420*c0909341SAndroid Build Coastguard Worker    mov             stridem, strideq
4421*c0909341SAndroid Build Coastguard Worker %endif
4422*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m1, q0000
4423*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q1111
4424*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m5
4425*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
4426*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m5, q0000
4427*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m5, q1111
4428*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m5, q2222
4429*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3333
4430*c0909341SAndroid Build Coastguard Worker    mova           subpelh0, m0
4431*c0909341SAndroid Build Coastguard Worker    mova           subpelh1, m1
4432*c0909341SAndroid Build Coastguard Worker    mova           subpelv0, m2
4433*c0909341SAndroid Build Coastguard Worker    mova           subpelv1, m3
4434*c0909341SAndroid Build Coastguard Worker    mova           subpelv2, m4
4435*c0909341SAndroid Build Coastguard Worker    mova           subpelv3, m5
4436*c0909341SAndroid Build Coastguard Worker    lea                  r5, [strideq*3+3]
4437*c0909341SAndroid Build Coastguard Worker    sub                srcq, r5
4438*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
4439*c0909341SAndroid Build Coastguard Worker%else
4440*c0909341SAndroid Build Coastguard Worker    ALLOC_STACK    mmsize*5, 16
4441*c0909341SAndroid Build Coastguard Worker %define           subpelh0  m10
4442*c0909341SAndroid Build Coastguard Worker %define           subpelh1  m11
4443*c0909341SAndroid Build Coastguard Worker %define           subpelv0  m12
4444*c0909341SAndroid Build Coastguard Worker %define           subpelv1  m13
4445*c0909341SAndroid Build Coastguard Worker %define           subpelv2  m14
4446*c0909341SAndroid Build Coastguard Worker %define           subpelv3  m15
4447*c0909341SAndroid Build Coastguard Worker %define             accuv0  m8
4448*c0909341SAndroid Build Coastguard Worker %define             accuv1  m9
4449*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
4450*c0909341SAndroid Build Coastguard Worker    movzx               mxd, myb
4451*c0909341SAndroid Build Coastguard Worker    shr                 myd, 16
4452*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4453*c0909341SAndroid Build Coastguard Worker    cmovs               myd, mxd
4454*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
4455*c0909341SAndroid Build Coastguard Worker    pshufd         subpelh0, m0, q0000
4456*c0909341SAndroid Build Coastguard Worker    pshufd         subpelh1, m0, q1111
4457*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m1
4458*c0909341SAndroid Build Coastguard Worker    psraw                m1, 8
4459*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv0, m1, q0000
4460*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv1, m1, q1111
4461*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv2, m1, q2222
4462*c0909341SAndroid Build Coastguard Worker    pshufd         subpelv3, m1, q3333
4463*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3]
4464*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
4465*c0909341SAndroid Build Coastguard Worker    sub                srcq, stride3q
4466*c0909341SAndroid Build Coastguard Worker    mov                  r6, srcq
4467*c0909341SAndroid Build Coastguard Worker    mov                  r8, tmpq
4468*c0909341SAndroid Build Coastguard Worker%endif
4469*c0909341SAndroid Build Coastguard Worker    lea                 r5d, [wq-4]
4470*c0909341SAndroid Build Coastguard Worker    shl                 r5d, 14
4471*c0909341SAndroid Build Coastguard Worker    add                 r5d, hd
4472*c0909341SAndroid Build Coastguard Worker.hv_w8_loop0:
4473*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4474*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufA]
4475*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+subpel_h_shufB]
4476*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+subpel_h_shufC]
4477*c0909341SAndroid Build Coastguard Worker    %define           shufA  m7
4478*c0909341SAndroid Build Coastguard Worker    %define           shufB  m8
4479*c0909341SAndroid Build Coastguard Worker    %define           shufC  m9
4480*c0909341SAndroid Build Coastguard Worker%else
4481*c0909341SAndroid Build Coastguard Worker    %define           shufA  [base+subpel_h_shufA]
4482*c0909341SAndroid Build Coastguard Worker    %define           shufB  [base+subpel_h_shufB]
4483*c0909341SAndroid Build Coastguard Worker    %define           shufC  [base+subpel_h_shufC]
4484*c0909341SAndroid Build Coastguard Worker%endif
; PREP_8TAP_HV dst, src_memloc
; Horizontal filter pass over one source row.
;   dst        - vector register that receives the result
;   src_memloc - address expression; read with an unaligned load (movu)
; The loaded row is byte-shuffled three ways (shufA/shufB/shufC), the shuffled
; copies are multiplied by subpelh0/subpelh1 with pmaddubsw, and the partial
; sums are folded into words in dst with paddw/phaddw.
; m1, m2 and m3 are used as hard-coded scratch registers (they are not macro
; parameters), so dst must not be one of them.
; Uses whichever shufA/shufB/shufC and subpelh0/subpelh1 definitions are in
; effect where the macro is expanded.
4485*c0909341SAndroid Build Coastguard Worker%macro PREP_8TAP_HV 2 ; dst, src_memloc (m1-m3 are implicit scratch)
4486*c0909341SAndroid Build Coastguard Worker    movu                 %1, [%2]
4487*c0909341SAndroid Build Coastguard Worker    pshufb               m2, %1, shufB
4488*c0909341SAndroid Build Coastguard Worker    pshufb               m3, %1, shufC
4489*c0909341SAndroid Build Coastguard Worker    pshufb               %1, shufA
4490*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
4491*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, subpelh0 ; subpel +0 C0
4492*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, subpelh1 ; subpel +4 B4
4493*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, subpelh1 ; C4
4494*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, subpelh0 ; A0
4495*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3       ; C0+B4
4496*c0909341SAndroid Build Coastguard Worker    paddw                %1, m2       ; A0+C4
4497*c0909341SAndroid Build Coastguard Worker    phaddw               %1, m1
4498*c0909341SAndroid Build Coastguard Worker%endmacro
4499*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m4, srcq+strideq*0
4500*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m5, srcq+strideq*1
4501*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4502*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m6, srcq+strideq*2
4503*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
4504*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m0, srcq+strideq*0
4505*c0909341SAndroid Build Coastguard Worker%else
4506*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4507*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m6, srcq+strideq*0
4508*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m0, srcq+strideq*1
4509*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4510*c0909341SAndroid Build Coastguard Worker%endif
4511*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_8192]
4512*c0909341SAndroid Build Coastguard Worker    REPX   {pmulhrsw x, m7}, m4, m5, m6, m0
4513*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4, m5 ; 01
4514*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5, m6 ; 12
4515*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m0 ; 23
4516*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           1, m1
4517*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           2, m2
4518*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           3, m3
4519*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufA]
4520*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4521*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m4, srcq+strideq*1
4522*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m5, srcq+strideq*2
4523*c0909341SAndroid Build Coastguard Worker    add                srcq, stride3q
4524*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m6, srcq+strideq*0
4525*c0909341SAndroid Build Coastguard Worker%else
4526*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m4, srcq+strideq*0
4527*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m5, srcq+strideq*1
4528*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4529*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m6, srcq+strideq*0
4530*c0909341SAndroid Build Coastguard Worker%endif
4531*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+pw_8192]
4532*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m3, m4
4533*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m3, m5
4534*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m6
4535*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m0, m1 ; 34
4536*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m1, m2 ; 45
4537*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m2, m3 ; 56
4538*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           6, m3
4539*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        1, m1
4540*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        2, m2
4541*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        3, m3
4542*c0909341SAndroid Build Coastguard Worker.hv_w8_loop:
4543*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           1, m3
4544*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           2, m4
4545*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           3, m5
4546*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           4, m6
4547*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4548*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1, subpelv0 ; a0
4549*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, subpelv0 ; b0
4550*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv1     ; a1
4551*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, subpelv1     ; b1
4552*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3
4553*c0909341SAndroid Build Coastguard Worker    paddd                m7, m4
4554*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, subpelv2     ; a2
4555*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, subpelv2     ; b2
4556*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
4557*c0909341SAndroid Build Coastguard Worker    paddd                m7, m6
4558*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pd_32]
4559*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
4560*c0909341SAndroid Build Coastguard Worker    paddd                m7, m5
4561*c0909341SAndroid Build Coastguard Worker    mova             accuv0, m0
4562*c0909341SAndroid Build Coastguard Worker    mova             accuv1, m7
4563*c0909341SAndroid Build Coastguard Worker%else
4564*c0909341SAndroid Build Coastguard Worker    pmaddwd          accuv0, m1, subpelv0 ; a0
4565*c0909341SAndroid Build Coastguard Worker    pmaddwd          accuv1, m2, subpelv0 ; b0
4566*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, subpelv1     ; a1
4567*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, subpelv1     ; b1
4568*c0909341SAndroid Build Coastguard Worker    paddd            accuv0, m3
4569*c0909341SAndroid Build Coastguard Worker    paddd            accuv1, m4
4570*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, subpelv2     ; a2
4571*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, subpelv2     ; b2
4572*c0909341SAndroid Build Coastguard Worker    paddd            accuv0, m5
4573*c0909341SAndroid Build Coastguard Worker    paddd            accuv1, m6
4574*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pd_32]
4575*c0909341SAndroid Build Coastguard Worker    paddd            accuv0, m7
4576*c0909341SAndroid Build Coastguard Worker    paddd            accuv1, m7
4577*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+subpel_h_shufB]
4578*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_h_shufC]
4579*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+subpel_h_shufA]
4580*c0909341SAndroid Build Coastguard Worker    %define           shufA  m5
4581*c0909341SAndroid Build Coastguard Worker    %define           shufB  m7
4582*c0909341SAndroid Build Coastguard Worker    %define           shufC  m6
4583*c0909341SAndroid Build Coastguard Worker%endif
4584*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m0, srcq+strideq*1
4585*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+strideq*2]
4586*c0909341SAndroid Build Coastguard Worker    PREP_8TAP_HV         m4, srcq+strideq*0
4587*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_8192]
4588*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
4589*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m5
4590*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        6, m6
4591*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m6, m0 ; 67
4592*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m4 ; 78
4593*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5, subpelv3 ; a3
4594*c0909341SAndroid Build Coastguard Worker    paddd                m2, m1, accuv0
4595*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6, subpelv3 ; b3
4596*c0909341SAndroid Build Coastguard Worker    paddd                m1, m1, accuv1
4597*c0909341SAndroid Build Coastguard Worker    psrad                m2, 6
4598*c0909341SAndroid Build Coastguard Worker    psrad                m1, 6
4599*c0909341SAndroid Build Coastguard Worker    packssdw             m2, m1
4600*c0909341SAndroid Build Coastguard Worker    movq        [tmpq+wq*0], m2
4601*c0909341SAndroid Build Coastguard Worker    movhps      [tmpq+wq*2], m2
4602*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+wq*4]
4603*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
4604*c0909341SAndroid Build Coastguard Worker    jle .hv_w8_outer
4605*c0909341SAndroid Build Coastguard Worker    SAVELINE_W8           6, m4
4606*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        1, m1
4607*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        2, m2
4608*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        3, m3
4609*c0909341SAndroid Build Coastguard Worker    RESTORELINE_W8        4, m4
4610*c0909341SAndroid Build Coastguard Worker    jmp .hv_w8_loop
4611*c0909341SAndroid Build Coastguard Worker.hv_w8_outer:
4612*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4613*c0909341SAndroid Build Coastguard Worker    mov                srcq, srcm
4614*c0909341SAndroid Build Coastguard Worker    mov                tmpq, tmpm
4615*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5w
4616*c0909341SAndroid Build Coastguard Worker    add                srcq, 4
4617*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
4618*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcq
4619*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpq
4620*c0909341SAndroid Build Coastguard Worker%else
4621*c0909341SAndroid Build Coastguard Worker    add                  r6, 4
4622*c0909341SAndroid Build Coastguard Worker    add                  r8, 8
4623*c0909341SAndroid Build Coastguard Worker    movzx                hd, r5b
4624*c0909341SAndroid Build Coastguard Worker    mov                srcq, r6
4625*c0909341SAndroid Build Coastguard Worker    mov                tmpq, r8
4626*c0909341SAndroid Build Coastguard Worker%endif
4627*c0909341SAndroid Build Coastguard Worker    sub                 r5d, 1<<16
4628*c0909341SAndroid Build Coastguard Worker    jg .hv_w8_loop0
4629*c0909341SAndroid Build Coastguard Worker    RET
4630*c0909341SAndroid Build Coastguard Worker
; movifprep dst, src
; Emits "mov dst, src" only when the assemble-time flag isprep is nonzero;
; expands to nothing otherwise.
4631*c0909341SAndroid Build Coastguard Worker%macro movifprep 2
4632*c0909341SAndroid Build Coastguard Worker %if isprep
4633*c0909341SAndroid Build Coastguard Worker    mov %1, %2
4634*c0909341SAndroid Build Coastguard Worker %endif
4635*c0909341SAndroid Build Coastguard Worker%endmacro
4636*c0909341SAndroid Build Coastguard Worker
; SAVE_REG n
; Snapshots the current expansions of the register aliases rN, rNq and rNd
; into rN_save, rNq_save and rNd_save. %xdefine expands the right-hand side
; immediately, so the copies keep their values even if rN is redefined later.
; On x86-32 it additionally defines rNm_save as the stack slot
; [rstk+stack_offset+(n+1)*4]; that one uses %define, so rstk/stack_offset are
; expanded at the point of use, not here.
; Assemble-time only: emits no instructions.
4637*c0909341SAndroid Build Coastguard Worker%macro SAVE_REG 1
4638*c0909341SAndroid Build Coastguard Worker %xdefine r%1_save  r%1
4639*c0909341SAndroid Build Coastguard Worker %xdefine r%1q_save r%1q
4640*c0909341SAndroid Build Coastguard Worker %xdefine r%1d_save r%1d
4641*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4642*c0909341SAndroid Build Coastguard Worker  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
4643*c0909341SAndroid Build Coastguard Worker %endif
4644*c0909341SAndroid Build Coastguard Worker%endmacro
4645*c0909341SAndroid Build Coastguard Worker
; LOAD_REG n
; Counterpart of SAVE_REG: redefines rN, rNq and rNd from the rN_save,
; rNq_save and rNd_save copies, then undefines those three copies.
; On x86-32 rNm is defined (lazily, via %define) as rNm_save; rNm_save itself
; is not undefined here, so rNm keeps resolving through it.
; Assemble-time only: emits no instructions.
4646*c0909341SAndroid Build Coastguard Worker%macro LOAD_REG 1
4647*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%1_save
4648*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%1q_save
4649*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%1d_save
4650*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4651*c0909341SAndroid Build Coastguard Worker  %define r%1m r%1m_save
4652*c0909341SAndroid Build Coastguard Worker %endif
4653*c0909341SAndroid Build Coastguard Worker %undef r%1d_save
4654*c0909341SAndroid Build Coastguard Worker %undef r%1q_save
4655*c0909341SAndroid Build Coastguard Worker %undef r%1_save
4656*c0909341SAndroid Build Coastguard Worker%endmacro
4657*c0909341SAndroid Build Coastguard Worker
; REMAP_REG dst, src[, stack_slot_mode]
; Redefines the aliases r<dst>, r<dst>q and r<dst>d to whatever r<src>,
; r<src>q and r<src>d expand to right now (%xdefine = immediate expansion).
; x86-32 only: the third argument selects how r<dst>m is defined:
;   0       - r<dst>m becomes a copy of the current r<src>m
;   nonzero - r<dst>m becomes the stack slot [rstk+stack_offset+(dst+1)*4]
; The third argument has no default; it is only evaluated under ARCH_X86_32,
; so x86-32 callers must pass it (the callers in this part of the file do).
; Assemble-time only: emits no instructions.
4658*c0909341SAndroid Build Coastguard Worker%macro REMAP_REG 2-3
4659*c0909341SAndroid Build Coastguard Worker %xdefine r%1  r%2
4660*c0909341SAndroid Build Coastguard Worker %xdefine r%1q r%2q
4661*c0909341SAndroid Build Coastguard Worker %xdefine r%1d r%2d
4662*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
4663*c0909341SAndroid Build Coastguard Worker  %if %3 == 0
4664*c0909341SAndroid Build Coastguard Worker   %xdefine r%1m r%2m
4665*c0909341SAndroid Build Coastguard Worker  %else
4666*c0909341SAndroid Build Coastguard Worker   %define r%1m [rstk+stack_offset+(%1+1)*4]
4667*c0909341SAndroid Build Coastguard Worker  %endif
4668*c0909341SAndroid Build Coastguard Worker %endif
4669*c0909341SAndroid Build Coastguard Worker%endmacro
4670*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; Does nothing unless isprep is nonzero. For prep it shifts the numbered
; register aliases down by one: the highest alias is saved with SAVE_REG and
; then, walking from the highest index to 1, each r<i> is redefined to what
; r<i-1> currently names (x86-64: i = 14..1, x86-32: i = 5..1, where the
; r<i>m aliases are shifted as well). Walking downwards matters: each
; r<i-1> is read before it is itself redefined.
; NOTE(review): presumably this lets code written against put's argument
; numbering (dst, ds, src, ...) be reused for prep, which has one leading
; argument fewer (tmp, src, ...) - confirm against the macro body that uses it.
; Assemble-time only: emits no instructions.
4671*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
4672*c0909341SAndroid Build Coastguard Worker %if isprep
4673*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
4674*c0909341SAndroid Build Coastguard Worker   SAVE_REG 14
4675*c0909341SAndroid Build Coastguard Worker   %assign %%i 14
4676*c0909341SAndroid Build Coastguard Worker   %rep 14
4677*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i-1
4678*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j
4679*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i-1
4680*c0909341SAndroid Build Coastguard Worker   %endrep
4681*c0909341SAndroid Build Coastguard Worker  %else
4682*c0909341SAndroid Build Coastguard Worker   SAVE_REG 5
4683*c0909341SAndroid Build Coastguard Worker   %assign %%i 5
4684*c0909341SAndroid Build Coastguard Worker   %rep 5
4685*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i-1
4686*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j, 0
4687*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i-1
4688*c0909341SAndroid Build Coastguard Worker   %endrep
4689*c0909341SAndroid Build Coastguard Worker  %endif
4690*c0909341SAndroid Build Coastguard Worker %endif
4691*c0909341SAndroid Build Coastguard Worker%endmacro
4692*c0909341SAndroid Build Coastguard Worker
; MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
; Does nothing unless isprep is nonzero. For prep it undoes the downward shift
; applied by MCT_8TAP_SCALED_REMAP_REGS_TO_PREV: walking upwards from index 1,
; each r<i> is redefined to what r<i+1> currently names (x86-64: i = 1..13,
; x86-32: i = 1..4), and the highest alias (r14 / r5) is restored from its
; saved copy with LOAD_REG.
; On x86-32 REMAP_REG is called with mode 1, so each r<i>m is redefined as its
; own stack slot instead of being copied from r<i+1>m.
; Assemble-time only: emits no instructions.
4693*c0909341SAndroid Build Coastguard Worker%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
4694*c0909341SAndroid Build Coastguard Worker %if isprep
4695*c0909341SAndroid Build Coastguard Worker  %assign %%i 1
4696*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
4697*c0909341SAndroid Build Coastguard Worker   %rep 13
4698*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i+1
4699*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j
4700*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i+1
4701*c0909341SAndroid Build Coastguard Worker   %endrep
4702*c0909341SAndroid Build Coastguard Worker   LOAD_REG 14
4703*c0909341SAndroid Build Coastguard Worker  %else
4704*c0909341SAndroid Build Coastguard Worker   %rep 4
4705*c0909341SAndroid Build Coastguard Worker    %assign %%j %%i+1
4706*c0909341SAndroid Build Coastguard Worker    REMAP_REG %%i, %%j, 1
4707*c0909341SAndroid Build Coastguard Worker    %assign %%i %%i+1
4708*c0909341SAndroid Build Coastguard Worker   %endrep
4709*c0909341SAndroid Build Coastguard Worker   LOAD_REG 5
4710*c0909341SAndroid Build Coastguard Worker  %endif
4711*c0909341SAndroid Build Coastguard Worker %endif
4712*c0909341SAndroid Build Coastguard Worker%endmacro
4713*c0909341SAndroid Build Coastguard Worker
; MC_8TAP_SCALED_RET [leave_mapping_unchanged = 1]
; Return helper for code assembled under the shifted (prep) register naming:
; switches the aliases back to the default mapping, emits RET, and then, if
; the argument is nonzero (the default), re-applies the shifted mapping so the
; code that follows this expansion sees the same names as the code before it.
; Pass 0 to leave the default mapping in effect after the return.
; NOTE(review): the default mapping is presumably restored first because RET
; expands in terms of the register aliases - confirm against x86inc.
4714*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
4715*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4716*c0909341SAndroid Build Coastguard Worker    RET
4717*c0909341SAndroid Build Coastguard Worker %if %1
4718*c0909341SAndroid Build Coastguard Worker    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4719*c0909341SAndroid Build Coastguard Worker %endif
4720*c0909341SAndroid Build Coastguard Worker%endmacro
4721*c0909341SAndroid Build Coastguard Worker
; MC_8TAP_SCALED_H - horizontal filter pass over two consecutive source rows,
; gathering eight 8-byte groups per row from independent byte offsets.
; Both variants advance srcq by 2*ssq in total and scale the results with
; pmulhrsw by m12 (m12 is set up by the caller; its value is not visible here).
;
; x86-64 variant, 12 numeric arguments (vector register indices):
;   args 1-2  : destination registers (row 0 result, row 1 result)
;   args 3-8  : scratch registers
;   args 9-12 : per-group filter weights, each applied with pmaddubsw
;   The eight byte offsets relative to srcq are taken from
;   r4, r6, r7, r9, r10, r11, r13 and rX.
;
; x86-32 variant, 2-3 arguments:
;   arg 1 (weights_mem_start): esp-relative offset of four 16-byte weight rows
;   arg 2 (h_mem_start)      : if nonzero, esp-relative offset where the two
;                              results (m0, m4) are stored; 0 = leave them in
;                              m0/m4 only
;   arg 3 (load_fh_offsets)  : default 1; when 1, the first four offsets are
;                              loaded from [esp+0..12] into r0, rX, r4, r5
;   Uses m0-m7 and overwrites r0, rX, r4, r5 (on exit they hold the second set
;   of offsets, read from [esp+16..28]).
4722*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4723*c0909341SAndroid Build Coastguard Worker %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3]
    ; Swap the names of args 2 and 5 for the duration of the macro; the
    ; matching SWAP at the end leaves the row 1 result under arg 2's name.
4724*c0909341SAndroid Build Coastguard Worker    SWAP                m%2, m%5
    ; Row 0: four registers, each holding two 8-byte groups (low/high half).
4725*c0909341SAndroid Build Coastguard Worker    movq                m%1, [srcq+ r4]
4726*c0909341SAndroid Build Coastguard Worker    movq                m%2, [srcq+ r6]
4727*c0909341SAndroid Build Coastguard Worker    movhps              m%1, [srcq+ r7]
4728*c0909341SAndroid Build Coastguard Worker    movhps              m%2, [srcq+ r9]
4729*c0909341SAndroid Build Coastguard Worker    movq                m%3, [srcq+r10]
4730*c0909341SAndroid Build Coastguard Worker    movq                m%4, [srcq+r11]
4731*c0909341SAndroid Build Coastguard Worker    movhps              m%3, [srcq+r13]
4732*c0909341SAndroid Build Coastguard Worker    movhps              m%4, [srcq+ rX]
4733*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Row 1: same offsets, one row further down.
4734*c0909341SAndroid Build Coastguard Worker    movq                m%5, [srcq+ r4]
4735*c0909341SAndroid Build Coastguard Worker    movq                m%6, [srcq+ r6]
4736*c0909341SAndroid Build Coastguard Worker    movhps              m%5, [srcq+ r7]
4737*c0909341SAndroid Build Coastguard Worker    movhps              m%6, [srcq+ r9]
4738*c0909341SAndroid Build Coastguard Worker    movq                m%7, [srcq+r10]
4739*c0909341SAndroid Build Coastguard Worker    movq                m%8, [srcq+r11]
4740*c0909341SAndroid Build Coastguard Worker    movhps              m%7, [srcq+r13]
4741*c0909341SAndroid Build Coastguard Worker    movhps              m%8, [srcq+ rX]
4742*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Multiply by the weights, then three phaddw levels reduce every 8-byte
    ; group to a single word.
4743*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%1, m%9
4744*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%5, m%9
4745*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%2, m%10
4746*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%6, m%10
4747*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%3, m%11
4748*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%7, m%11
4749*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%4, m%12
4750*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m%8, m%12
4751*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m%2
4752*c0909341SAndroid Build Coastguard Worker    phaddw              m%5, m%6
4753*c0909341SAndroid Build Coastguard Worker    phaddw              m%3, m%4
4754*c0909341SAndroid Build Coastguard Worker    phaddw              m%7, m%8
4755*c0909341SAndroid Build Coastguard Worker    phaddw              m%1, m%3
4756*c0909341SAndroid Build Coastguard Worker    phaddw              m%5, m%7
4757*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%1, m12
4758*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m%5, m12
4759*c0909341SAndroid Build Coastguard Worker    SWAP                m%2, m%5
4760*c0909341SAndroid Build Coastguard Worker %endmacro
4761*c0909341SAndroid Build Coastguard Worker%else
4762*c0909341SAndroid Build Coastguard Worker %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets
4763*c0909341SAndroid Build Coastguard Worker  %if %3 == 1
4764*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0]
4765*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
4766*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
4767*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
4768*c0909341SAndroid Build Coastguard Worker  %endif
    ; First four offsets: row 0 into m0/m1, row 1 into m4/m5.
4769*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+r0]
4770*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+rX]
4771*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+r4]
4772*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+r5]
4773*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4774*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+r0]
4775*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+rX]
4776*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+r4]
4777*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+r5]
    ; Reload the same four GPRs with the remaining offsets and step back one
    ; row so the second half of row 0 can be gathered.
4778*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+16]
4779*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+24]
4780*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+20]
4781*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+28]
4782*c0909341SAndroid Build Coastguard Worker    sub                srcq, ssq
4783*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+r0]
4784*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+rX]
4785*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+r4]
4786*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+r5]
4787*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
4788*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+r0]
4789*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+rX]
4790*c0909341SAndroid Build Coastguard Worker    movhps               m6, [srcq+r4]
4791*c0909341SAndroid Build Coastguard Worker    movhps               m7, [srcq+r5]
4792*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
    ; Weights come from four 16-byte stack rows starting at esp + arg 1.
4793*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, [esp+%1+ 0]
4794*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, [esp+%1+ 0]
4795*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, [esp+%1+16]
4796*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [esp+%1+16]
4797*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, [esp+%1+32]
4798*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, [esp+%1+32]
4799*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, [esp+%1+48]
4800*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, [esp+%1+48]
4801*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
4802*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
4803*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
4804*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7
4805*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2
4806*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m6
    ; NOTE(review): only m0-m7 are used elsewhere in this variant, so m12 is
    ; presumably defined by the caller as a memory operand - confirm.
4807*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12
4808*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
4809*c0909341SAndroid Build Coastguard Worker  %if %2 != 0
4810*c0909341SAndroid Build Coastguard Worker    mova        [esp+%2+ 0], m0
4811*c0909341SAndroid Build Coastguard Worker    mova        [esp+%2+16], m4
4812*c0909341SAndroid Build Coastguard Worker  %endif
4813*c0909341SAndroid Build Coastguard Worker %endmacro
4814*c0909341SAndroid Build Coastguard Worker%endif
4815*c0909341SAndroid Build Coastguard Worker
4816*c0909341SAndroid Build Coastguard Worker%macro MC_8TAP_SCALED 1
4817*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4818*c0909341SAndroid Build Coastguard Worker %assign isprep 0
4819*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4820*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4821*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
4822*c0909341SAndroid Build Coastguard Worker  %else
4823*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy
4824*c0909341SAndroid Build Coastguard Worker  %endif
4825*c0909341SAndroid Build Coastguard Worker %else ; ARCH_X86_32
4826*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4827*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy
4828*c0909341SAndroid Build Coastguard Worker  %else
4829*c0909341SAndroid Build Coastguard Workercglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy
4830*c0909341SAndroid Build Coastguard Worker  %endif
4831*c0909341SAndroid Build Coastguard Worker %endif
4832*c0909341SAndroid Build Coastguard Worker %xdefine base_reg r12
4833*c0909341SAndroid Build Coastguard Worker %define rndshift 10
4834*c0909341SAndroid Build Coastguard Worker%else ; prep
4835*c0909341SAndroid Build Coastguard Worker %assign isprep 1
4836*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4837*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4838*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
4839*c0909341SAndroid Build Coastguard Worker   %xdefine tmp_stridem r14q
4840*c0909341SAndroid Build Coastguard Worker  %else
4841*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy
4842*c0909341SAndroid Build Coastguard Worker   %define tmp_stridem qword [rsp+0x138]
4843*c0909341SAndroid Build Coastguard Worker  %endif
4844*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r11
4845*c0909341SAndroid Build Coastguard Worker %else ; ARCH_X86_32
4846*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment <= STACK_ALIGNMENT
4847*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
4848*c0909341SAndroid Build Coastguard Worker  %else
4849*c0909341SAndroid Build Coastguard Workercglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy
4850*c0909341SAndroid Build Coastguard Worker  %endif
4851*c0909341SAndroid Build Coastguard Worker  %define tmp_stridem dword [esp+0x138]
4852*c0909341SAndroid Build Coastguard Worker %endif
4853*c0909341SAndroid Build Coastguard Worker %define rndshift 6
4854*c0909341SAndroid Build Coastguard Worker%endif
; x86-32 entry: preserve the t0/t1 temporaries on the stack and, for the put
; variant with required_stack_alignment > STACK_ALIGNMENT, copy the arguments
; into fixed esp-relative slots and rebind the *m names to those slots.
; NOTE(review): t0/t1 are initialised before this point (not visible here);
; presumably they carry the packed filter-type offsets - confirm in the prologue.
4855*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4856*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f0], t0d ; spill t0d; its low byte is read back from this slot in .w2/.w4
4857*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f4], t1d ; spill t1d; reloaded from this slot further down
4858*c0909341SAndroid Build Coastguard Worker %if !isprep && required_stack_alignment > STACK_ALIGNMENT
; Load each argument through its current *m name first; the %defines below
; rebind those names, so the loads must precede them.
4859*c0909341SAndroid Build Coastguard Worker    mov                dstd, dstm
4860*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
4861*c0909341SAndroid Build Coastguard Worker    mov                srcd, srcm
4862*c0909341SAndroid Build Coastguard Worker    mov                 ssd, ssm
4863*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
4864*c0909341SAndroid Build Coastguard Worker    mov                  r4, mxm
; From here on r0m/dsm/r2m/ssm/hm/mxm name slots at [esp+0x200..0x214].
4865*c0909341SAndroid Build Coastguard Worker  %define r0m  [esp+0x200]
4866*c0909341SAndroid Build Coastguard Worker  %define dsm  [esp+0x204]
4867*c0909341SAndroid Build Coastguard Worker  %define dsmp dsm
4868*c0909341SAndroid Build Coastguard Worker  %define r1m  dsm
4869*c0909341SAndroid Build Coastguard Worker  %define r2m  [esp+0x208]
4870*c0909341SAndroid Build Coastguard Worker  %define ssm  [esp+0x20c]
4871*c0909341SAndroid Build Coastguard Worker  %define r3m  ssm
4872*c0909341SAndroid Build Coastguard Worker  %define hm   [esp+0x210]
4873*c0909341SAndroid Build Coastguard Worker  %define mxm  [esp+0x214]
4874*c0909341SAndroid Build Coastguard Worker    mov                 r0m, dstd
4875*c0909341SAndroid Build Coastguard Worker    mov                 dsm, dsd
4876*c0909341SAndroid Build Coastguard Worker    mov                 r2m, srcd
4877*c0909341SAndroid Build Coastguard Worker    mov                 ssm, ssd
4878*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
; Same load / rebind / store sequence for my, dx and dy, using r0-r2 as scratch
; (mx is still held in r4 from above).
4879*c0909341SAndroid Build Coastguard Worker    mov                  r0, mym
4880*c0909341SAndroid Build Coastguard Worker    mov                  r1, dxm
4881*c0909341SAndroid Build Coastguard Worker    mov                  r2, dym
4882*c0909341SAndroid Build Coastguard Worker  %define mym [esp+0x218]
4883*c0909341SAndroid Build Coastguard Worker  %define dxm [esp+0x09c]
4884*c0909341SAndroid Build Coastguard Worker  %define dym [esp+0x21c]
4885*c0909341SAndroid Build Coastguard Worker    mov                 mxm, r4
4886*c0909341SAndroid Build Coastguard Worker    mov                 mym, r0
4887*c0909341SAndroid Build Coastguard Worker    mov                 dxm, r1
4888*c0909341SAndroid Build Coastguard Worker    mov                 dym, r2
4889*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; w = trailing-zero count of the width (done here only for this configuration)
4890*c0909341SAndroid Build Coastguard Worker %endif
; base_reg is r5 for prep when required_stack_alignment > STACK_ALIGNMENT
; (the prep cglobal declares 6 GPRs in that configuration), r6 otherwise.
4891*c0909341SAndroid Build Coastguard Worker %if isprep && required_stack_alignment > STACK_ALIGNMENT
4892*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r5
4893*c0909341SAndroid Build Coastguard Worker %else
4894*c0909341SAndroid Build Coastguard Worker  %xdefine base_reg r6
4895*c0909341SAndroid Build Coastguard Worker %endif
4896*c0909341SAndroid Build Coastguard Worker    mov                 ssd, ssm ; (re)load the source stride into its register
4897*c0909341SAndroid Build Coastguard Worker%endif
; Common setup for put/prep: establish the table base register, broadcast dx
; and mx, bind the per-ABI argument names, load the shared constants, choose
; one of the two values packed in t1 based on h, back the source pointer up by
; three rows, then dispatch on dy (1024 -> .dy1, 2048 -> .dy2) or, otherwise,
; on the width through the jump table.
4898*c0909341SAndroid Build Coastguard Worker    LEA            base_reg, %1_8tap_scaled_8bpc_ssse3
; "base" lets [base+symbol] address any symbol relative to base_reg.
4899*c0909341SAndroid Build Coastguard Worker%xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3
; This condition is the complement of the x86-32 put/realigned-stack case,
; which already executed tzcnt in its own setup.
4900*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
4901*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; w = trailing-zero count of the width, used as the jump-table index below
4902*c0909341SAndroid Build Coastguard Worker%endif
; x86-32: the names m8/m9/m14/m15 are mapped onto low xmm registers.
4903*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
4904*c0909341SAndroid Build Coastguard Worker %define m8  m0
4905*c0909341SAndroid Build Coastguard Worker %define m9  m1
4906*c0909341SAndroid Build Coastguard Worker %define m14 m4
4907*c0909341SAndroid Build Coastguard Worker %define m15 m3
4908*c0909341SAndroid Build Coastguard Worker%endif
4909*c0909341SAndroid Build Coastguard Worker    movd                 m8, dxm
4910*c0909341SAndroid Build Coastguard Worker    movd                m14, mxm
4911*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m8, q0000 ; m8  = dx in every dword lane
4912*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m14, q0000 ; m14 = mx in every dword lane
; prep on UNIX64: copy t0 into r5 and redeclare the temporaries as r5/r7.
4913*c0909341SAndroid Build Coastguard Worker%if isprep && UNIX64
4914*c0909341SAndroid Build Coastguard Worker    mov                 r5d, t0d
4915*c0909341SAndroid Build Coastguard Worker DECLARE_REG_TMP 5, 7
4916*c0909341SAndroid Build Coastguard Worker%endif
4917*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4918*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
4919*c0909341SAndroid Build Coastguard Worker%endif
; Rebind argument names for the rest of the function. rX/rXd name a scratch
; GPR whose identity depends on variant and ABI.
4920*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
4921*c0909341SAndroid Build Coastguard Worker %if WIN64
4922*c0909341SAndroid Build Coastguard Worker    mov                 r8d, hm ; load h into r8, which the DEFINE_ARGS below names "h"
4923*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
4924*c0909341SAndroid Build Coastguard Worker  %define hm r5m
4925*c0909341SAndroid Build Coastguard Worker  %define dxm r8m
4926*c0909341SAndroid Build Coastguard Worker %elif ARCH_X86_64
4927*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
4928*c0909341SAndroid Build Coastguard Worker  %define hm r6m
4929*c0909341SAndroid Build Coastguard Worker %endif
4930*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4931*c0909341SAndroid Build Coastguard Worker  %if required_stack_alignment > STACK_ALIGNMENT
4932*c0909341SAndroid Build Coastguard Worker   %define dsm [rsp+0x138]
4933*c0909341SAndroid Build Coastguard Worker   %define rX r1
4934*c0909341SAndroid Build Coastguard Worker   %define rXd r1d
4935*c0909341SAndroid Build Coastguard Worker  %else
4936*c0909341SAndroid Build Coastguard Worker   %define dsm dsq
4937*c0909341SAndroid Build Coastguard Worker   %define rX r14
4938*c0909341SAndroid Build Coastguard Worker   %define rXd r14d
4939*c0909341SAndroid Build Coastguard Worker  %endif
4940*c0909341SAndroid Build Coastguard Worker %else
4941*c0909341SAndroid Build Coastguard Worker  %define rX r1
4942*c0909341SAndroid Build Coastguard Worker %endif
4943*c0909341SAndroid Build Coastguard Worker%else ; prep
4944*c0909341SAndroid Build Coastguard Worker %if WIN64
4945*c0909341SAndroid Build Coastguard Worker    mov                 r7d, hm ; load h into r7, which the DEFINE_ARGS below names "h"
4946*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
4947*c0909341SAndroid Build Coastguard Worker  %define hm r4m
4948*c0909341SAndroid Build Coastguard Worker  %define dxm r7m
4949*c0909341SAndroid Build Coastguard Worker %elif ARCH_X86_64
4950*c0909341SAndroid Build Coastguard Worker  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
4951*c0909341SAndroid Build Coastguard Worker  %define hm [rsp+0x94]
4952*c0909341SAndroid Build Coastguard Worker %endif
4953*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
4954*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
4955*c0909341SAndroid Build Coastguard Worker  %define rX r14
4956*c0909341SAndroid Build Coastguard Worker  %define rXd r14d
4957*c0909341SAndroid Build Coastguard Worker %else
4958*c0909341SAndroid Build Coastguard Worker  %define rX r3
4959*c0909341SAndroid Build Coastguard Worker %endif
4960*c0909341SAndroid Build Coastguard Worker%endif
; Shared constants: kept in registers on x86-64, used as memory operands on
; x86-32. m13 is the rounding term added before the final shift (pd_512 for
; put, pd_32 for prep).
4961*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4962*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
4963*c0909341SAndroid Build Coastguard Worker    mova                m12, [base+pw_8192]
4964*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
4965*c0909341SAndroid Build Coastguard Worker    mova                m13, [base+pd_512]
4966*c0909341SAndroid Build Coastguard Worker %else
4967*c0909341SAndroid Build Coastguard Worker    mova                m13, [base+pd_32]
4968*c0909341SAndroid Build Coastguard Worker %endif
4969*c0909341SAndroid Build Coastguard Worker%else
4970*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
4971*c0909341SAndroid Build Coastguard Worker %define m12 [base+pw_8192]
4972*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
4973*c0909341SAndroid Build Coastguard Worker  %define m13 [base+pd_512]
4974*c0909341SAndroid Build Coastguard Worker %else
4975*c0909341SAndroid Build Coastguard Worker  %define m13 [base+pd_32]
4976*c0909341SAndroid Build Coastguard Worker %endif
4977*c0909341SAndroid Build Coastguard Worker%endif
4978*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9 ; m9 = 0
4979*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
4980*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
4981*c0909341SAndroid Build Coastguard Worker    movzx               r7d, t1b ; r7d = low byte of t1
4982*c0909341SAndroid Build Coastguard Worker    shr                 t1d, 16 ; t1d = upper 16 bits of t1
4983*c0909341SAndroid Build Coastguard Worker    cmp                  hd, 6
4984*c0909341SAndroid Build Coastguard Worker    cmovs               t1d, r7d ; h - 6 negative (h < 6): use the low-byte value instead
4985*c0909341SAndroid Build Coastguard Worker    sub                srcq, ss3q ; src -= 3 rows
4986*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same t1 selection, performed on the copy spilled at [esp+0x1f4];
; the result is written back to that slot.
4987*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
4988*c0909341SAndroid Build Coastguard Worker    mov                  r1, [esp+0x1f4]
4989*c0909341SAndroid Build Coastguard Worker    lea                  r0, [ssq*3]
4990*c0909341SAndroid Build Coastguard Worker    movzx                r2, r1b
4991*c0909341SAndroid Build Coastguard Worker    shr                  r1, 16
4992*c0909341SAndroid Build Coastguard Worker    cmp            dword hm, 6
4993*c0909341SAndroid Build Coastguard Worker    cmovs                r1, r2
4994*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x1f4], r1
4995*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m ; restore r1/r2, which were used as scratch above
4996*c0909341SAndroid Build Coastguard Worker    mov                  r2, r2m
4997*c0909341SAndroid Build Coastguard Worker    sub                srcq, r0 ; src -= 3 rows
4998*c0909341SAndroid Build Coastguard Worker MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
; On x86-32 ss3 lives in r0, my in r4, and dy/h are memory operands.
4999*c0909341SAndroid Build Coastguard Worker %define ss3q r0
5000*c0909341SAndroid Build Coastguard Worker %define myd r4
5001*c0909341SAndroid Build Coastguard Worker %define dyd dword dym
5002*c0909341SAndroid Build Coastguard Worker %define hd  dword hm
5003*c0909341SAndroid Build Coastguard Worker%endif
; Dispatch: dedicated paths for dy == 1024 and dy == 2048, otherwise an
; indirect jump through the table of 16-bit offsets relative to base_reg.
5004*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 1024
5005*c0909341SAndroid Build Coastguard Worker    je .dy1
5006*c0909341SAndroid Build Coastguard Worker    cmp                 dyd, 2048
5007*c0909341SAndroid Build Coastguard Worker    je .dy2
5008*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
5009*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg
5010*c0909341SAndroid Build Coastguard Worker    jmp                  wq
5011*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; .w2 (put only): two output pixels per row. Builds the per-pixel horizontal
; shuffle (m14) and 4-tap horizontal filters (m15), then loads and horizontally
; filters the first 8 source rows, leaving them interleaved as row pairs
; (01 12 / 23 34 / 45 56 / 67 __) for the vertical pass in .w2_loop.
5012*c0909341SAndroid Build Coastguard Worker.w2:
5013*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5014*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5015*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b ; keep only the low byte of t0
5016*c0909341SAndroid Build Coastguard Worker    dec                srcq ; src -= 1 pixel
5017*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
5018*c0909341SAndroid Build Coastguard Worker %else
5019*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0] ; low byte of the spilled t0
5020*c0909341SAndroid Build Coastguard Worker    dec                srcq ; src -= 1 pixel
5021*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
5022*c0909341SAndroid Build Coastguard Worker %endif
; m9 is zero and m8 holds dx in every lane, so this yields {0, dx, 0, dx}.
5023*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
5024*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m9
5025*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-1]
5026*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5027*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
5028*c0909341SAndroid Build Coastguard Worker %else
5029*c0909341SAndroid Build Coastguard Worker  %define m11 [base+pd_0x4000]
5030*c0909341SAndroid Build Coastguard Worker %endif
; Filter index per pixel = (low byte of t0) + ((position & 0x3ff) >> 6).
5031*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
5032*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
5033*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
5034*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
5035*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15 ; index for pixel 0
5036*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
5037*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5038*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15 ; index for pixel 1
5039*c0909341SAndroid Build Coastguard Worker %else
5040*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15 ; index for pixel 1
5041*c0909341SAndroid Build Coastguard Worker %endif
5042*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_dw]
5043*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_s_shuf2]
; Load 4 bytes at offset +2 of each 8-byte filter entry (bytes 2..5).
5044*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2]
5045*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5046*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
5047*c0909341SAndroid Build Coastguard Worker %else
5048*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
5049*c0909341SAndroid Build Coastguard Worker %endif
5050*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5051*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9 ; mask: lanes whose (position & 0x3ff) >> 6 is zero
5052*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10 ; integer part of each pixel position
; pshufb with bdct_lb_dw followed by paddb with subpel_s_shuf2 turns the
; integer positions into the byte shuffle applied to every source row.
; NOTE(review): the table contents are defined elsewhere - confirm the layout.
5053*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5054*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5055*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
5056*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
5057*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x180], m14 ; keep the shuffle on the stack for .w2_loop
5058*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
5059*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
5060*c0909341SAndroid Build Coastguard Worker  %define m8  m5
5061*c0909341SAndroid Build Coastguard Worker  %define m15 m6
5062*c0909341SAndroid Build Coastguard Worker %endif
; Load the first 8 source rows, two rows per register (low / high qword).
5063*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
5064*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*2]
5065*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*1]
5066*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ss3q ]
5067*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5068*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5069*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
5070*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
5071*c0909341SAndroid Build Coastguard Worker %endif
5072*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
5073*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*2]
5074*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*1]
5075*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+ss3q ]
5076*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5077*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7 ; filters for pixel 0 and pixel 1
5078*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m15 ; duplicate into the high qword (second row in the register)
; Blend: lanes selected by the mask take pd_0x4000 instead of the table filter,
; i.e. m15 = (mask & pd_0x4000) | (~mask & filter).
5079*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5080*c0909341SAndroid Build Coastguard Worker    pand                m11, m8
5081*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
5082*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
5083*c0909341SAndroid Build Coastguard Worker    por                 m15, m11
5084*c0909341SAndroid Build Coastguard Worker %else
5085*c0909341SAndroid Build Coastguard Worker    pand                 m7, m8, m11
5086*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
5087*c0909341SAndroid Build Coastguard Worker  %define m8  m6
5088*c0909341SAndroid Build Coastguard Worker  %define m15 m5
5089*c0909341SAndroid Build Coastguard Worker    por                 m15, m7
5090*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x190], m15 ; keep the filters on the stack for .w2_loop
5091*c0909341SAndroid Build Coastguard Worker %endif
; Horizontal pass over all 8 rows: gather bytes, multiply-accumulate with the
; filters, sum adjacent pairs, then round with pmulhrsw by m12 (pw_8192).
5092*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
5093*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
5094*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
5095*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
5096*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
5097*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
5098*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
5099*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
5100*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2
5101*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m3
5102*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12       ; 0 1 2 3
5103*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12       ; 4 5 6 7
; Interleave consecutive rows so that pmaddwd can apply two vertical taps at once.
5104*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m0, 4 ; 1 2 3 4
5105*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m2    ; 01 12
5106*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2        ; 23 34
5107*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m1, q0321 ; 5 6 7 _
5108*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m5    ; 45 56
5109*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m1, m5    ; 67 __
; x86-32: reload my and the r0m slot, and spill the row pairs because
; .w2_loop overwrites m3/m0/m2/m4 in place.
5110*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5111*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5112*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
5113*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1a0], m3
5114*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1b0], m0
5115*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1c0], m2
5116*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1d0], m4
5117*c0909341SAndroid Build Coastguard Worker %endif
; .w2_loop: produce one output row of two pixels. Selects the vertical filter
; from the fractional part of my, applies it to the four row pairs, rounds,
; packs to bytes and stores 2 bytes to dst. Then my += dy; if the integer part
; did not advance, the same rows are reused, otherwise one new source row is
; appended here (bit 10 of my set) or two in .w2_skip_line (bit 10 clear).
5118*c0909341SAndroid Build Coastguard Worker.w2_loop:
5119*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff ; keep the 10-bit fractional part
5120*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5121*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24 ; default filter used when (my >> 6) == 0: only byte 3 is nonzero (64)
5122*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
5123*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; sets ZF when the filter fraction is zero
5124*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4] ; lea leaves the flags from shr intact
5125*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8] ; nonzero fraction: take the 8-byte table entry
5126*c0909341SAndroid Build Coastguard Worker    movq                m11, r6q
5127*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m11
5128*c0909341SAndroid Build Coastguard Worker    psraw               m11, 8 ; sign-extend the 8 filter bytes to words
5129*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m11, q0000 ; taps 0,1
5130*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1111 ; taps 2,3
5131*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m11, q2222 ; taps 4,5
5132*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333 ; taps 6,7
5133*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3, m8
5134*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m0, m9
5135*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, m10
5136*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m4, m11
5137*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
5138*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
5139*c0909341SAndroid Build Coastguard Worker %else
; x86-32: same computation with the filter fetched as two 32-bit halves
; (r4 = low half defaulting to 64 << 24, r3 = high half defaulting to 0).
; The multiplies overwrite m3/m0/m2/m4; they are reloaded from the stack below.
5140*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
5141*c0909341SAndroid Build Coastguard Worker    mov                  r1, [esp+0x1f4]
5142*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5143*c0909341SAndroid Build Coastguard Worker    shr                  r4, 6
5144*c0909341SAndroid Build Coastguard Worker    lea                  r1, [r1+r4]
5145*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
5146*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r1*8+0]
5147*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r1*8+4]
5148*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
5149*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
5150*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6
5151*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7
5152*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
5153*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q0000
5154*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q1111
5155*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5
5156*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m6
5157*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q2222
5158*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333
5159*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5
5160*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7
5161*c0909341SAndroid Build Coastguard Worker    paddd                m3, m0
5162*c0909341SAndroid Build Coastguard Worker    paddd                m2, m4
5163*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m3
5164*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m2
5165*c0909341SAndroid Build Coastguard Worker %endif
; Add the rounding term (m13), combine both partial sums, shift right by 10
; and saturate down to unsigned bytes.
5166*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
5167*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5168*c0909341SAndroid Build Coastguard Worker    psrad                m5, 10
5169*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
5170*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5
5171*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
5172*c0909341SAndroid Build Coastguard Worker    pextrw              r6d, m5, 0
5173*c0909341SAndroid Build Coastguard Worker    mov              [dstq], r6w ; store the two output pixels
5174*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
5175*c0909341SAndroid Build Coastguard Worker    dec                  hd
5176*c0909341SAndroid Build Coastguard Worker    jz .ret
5177*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
5178*c0909341SAndroid Build Coastguard Worker %else
5179*c0909341SAndroid Build Coastguard Worker    pextrw              r3d, m5, 0
5180*c0909341SAndroid Build Coastguard Worker    mov              [dstq], r3w ; store the two output pixels
5181*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
5182*c0909341SAndroid Build Coastguard Worker    dec                  hd
5183*c0909341SAndroid Build Coastguard Worker    jz .ret
5184*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5185*c0909341SAndroid Build Coastguard Worker    add                 myd, dym
5186*c0909341SAndroid Build Coastguard Worker %endif
; ZF = 1 when my gained no integer part, i.e. no new source row is needed.
; Nothing between here and the jz modifies the flags.
5187*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
5188*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
; Undo the SWAPs from the filter step and reload the row pairs that the
; in-place multiplies destroyed; m14/m15 now name their stack copies.
5189*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5
5190*c0909341SAndroid Build Coastguard Worker    SWAP                 m2, m7
5191*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x1a0]
5192*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x1b0]
5193*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x1c0]
5194*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+0x1d0]
5195*c0909341SAndroid Build Coastguard Worker  %define m14 [esp+0x180]
5196*c0909341SAndroid Build Coastguard Worker  %define m15 [esp+0x190]
5197*c0909341SAndroid Build Coastguard Worker %endif
5198*c0909341SAndroid Build Coastguard Worker    jz .w2_loop
5199*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5200*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5201*c0909341SAndroid Build Coastguard Worker %endif
5202*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq] ; next source row
5203*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
5204*c0909341SAndroid Build Coastguard Worker    jz .w2_skip_line ; bit 10 clear: handled by the two-row path
; One-row advance: slide every row pair down by one row and append the
; horizontally filtered new row.
5205*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
5206*c0909341SAndroid Build Coastguard Worker    shufps               m3, m0, q1032      ; 01 12
5207*c0909341SAndroid Build Coastguard Worker    shufps               m0, m2, q1032      ; 23 34
5208*c0909341SAndroid Build Coastguard Worker    shufps               m2, m4, q1032      ; 45 56
5209*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5210*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
5211*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m5
5212*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m12
5213*c0909341SAndroid Build Coastguard Worker    palignr              m4, m5, m1, 12 ; last dword of m1 followed by the new row
5214*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m1, m4, m4         ; 6 7 6 7
5215*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m1, m5         ; 67 __
5216*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5217*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1a0], m3
5218*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1b0], m0
5219*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1c0], m2
5220*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1d0], m4
5221*c0909341SAndroid Build Coastguard Worker %endif
5222*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
; .w2_skip_line: two-row advance. On entry the low qword of m5 already holds
; the next source row (loaded in .w2_loop); the row after it is loaded into the
; high qword, the row pairs are shifted down by two rows and the last two pairs
; are rebuilt from the newly filtered rows.
5223*c0909341SAndroid Build Coastguard Worker.w2_skip_line:
5224*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+ssq*1] ; second new row into the high qword
5225*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2] ; src += 2 rows
5226*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0             ; 01 12
5227*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2             ; 23 34
; Horizontal filter on both new rows at once.
5228*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5229*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
5230*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m5
5231*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m12            ; 6 7 6 7
5232*c0909341SAndroid Build Coastguard Worker    palignr              m4, m5, m1, 8      ; 4 5 6 7
5233*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q0321      ; 5 6 7 _
5234*c0909341SAndroid Build Coastguard Worker    mova                 m1, m4 ; m1 keeps the newest four rows for the next advance
5235*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m4, m5         ; 45 56
5236*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5             ; 67 __
; x86-32: spill the updated row pairs, since .w2_loop reloads them from the stack.
5237*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
5238*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1a0], m3
5239*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1b0], m0
5240*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1c0], m2
5241*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x1d0], m4
5242*c0909341SAndroid Build Coastguard Worker %endif
5243*c0909341SAndroid Build Coastguard Worker    jmp .w2_loop
5244*c0909341SAndroid Build Coastguard Worker%endif
5245*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
5246*c0909341SAndroid Build Coastguard Worker.w4:
5247*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5248*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5249*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
5250*c0909341SAndroid Build Coastguard Worker    dec                srcq
5251*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
5252*c0909341SAndroid Build Coastguard Worker%else
5253*c0909341SAndroid Build Coastguard Worker %define m8  m0
5254*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
5255*c0909341SAndroid Build Coastguard Worker %define m15 m3
5256*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0]
5257*c0909341SAndroid Build Coastguard Worker    dec                srcq
5258*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
5259*c0909341SAndroid Build Coastguard Worker%endif
5260*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
5261*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5262*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
5263*c0909341SAndroid Build Coastguard Worker%else
5264*c0909341SAndroid Build Coastguard Worker  %define m11 [base+pd_0x4000]
5265*c0909341SAndroid Build Coastguard Worker%endif
5266*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
5267*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
5268*c0909341SAndroid Build Coastguard Worker    pand                 m0, m14, m10
5269*c0909341SAndroid Build Coastguard Worker    psrld                m0, 6
5270*c0909341SAndroid Build Coastguard Worker    paddd               m15, m0
5271*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m15, 8
5272*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5273*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
5274*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
5275*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
5276*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
5277*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
5278*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
5279*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+ r4*8+2]
5280*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r11*8+2]
5281*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+ r6*8+2]
5282*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
5283*c0909341SAndroid Build Coastguard Worker%else
5284*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
5285*c0909341SAndroid Build Coastguard Worker    movd                 rX, m7
5286*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
5287*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
5288*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
5289*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
5290*c0909341SAndroid Build Coastguard Worker    movd                 m1, [base+subpel_filters+r0*8+2]
5291*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+rX*8+2]
5292*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
5293*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r5*8+2]
5294*c0909341SAndroid Build Coastguard Worker    movifprep            r3, r3m
5295*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
5296*c0909341SAndroid Build Coastguard Worker %define m15 m1
5297*c0909341SAndroid Build Coastguard Worker%endif
5298*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_dw]
5299*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+subpel_s_shuf2]
5300*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5301*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m3
5302*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4
5303*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m2
5304*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
5305*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
5306*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
5307*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5308*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m9
5309*c0909341SAndroid Build Coastguard Worker    pand                m11, m0
5310*c0909341SAndroid Build Coastguard Worker%else
5311*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m14
5312*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m4
5313*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
5314*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m0, m3
5315*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m0
5316*c0909341SAndroid Build Coastguard Worker %define m11 m2
5317*c0909341SAndroid Build Coastguard Worker%endif
5318*c0909341SAndroid Build Coastguard Worker    pandn                m0, m15
5319*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5320*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m0
5321*c0909341SAndroid Build Coastguard Worker%else
5322*c0909341SAndroid Build Coastguard Worker %define m15 m0
5323*c0909341SAndroid Build Coastguard Worker%endif
5324*c0909341SAndroid Build Coastguard Worker    por                 m15, m11
5325*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5326*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*0]
5327*c0909341SAndroid Build Coastguard Worker    movu                 m9, [srcq+ssq*1]
5328*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+ssq*2]
5329*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq+ss3q ]
5330*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5331*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
5332*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*1]
5333*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
5334*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ss3q ]
5335*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5336*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
5337*c0909341SAndroid Build Coastguard Worker    pshufb               m9, m14
5338*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m14
5339*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m14
5340*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
5341*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
5342*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
5343*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5344*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
5345*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m9, m15
5346*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m15
5347*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m15
5348*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
5349*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
5350*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
5351*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
5352*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m9
5353*c0909341SAndroid Build Coastguard Worker    phaddw               m8, m10
5354*c0909341SAndroid Build Coastguard Worker    phaddw               m9, m2, m4
5355*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m5
5356*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m12            ; 0 1
5357*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m8, m12            ; 2 3
5358*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m9, m12            ; 4 5
5359*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12            ; 6 7
5360*c0909341SAndroid Build Coastguard Worker    shufps               m4, m7, m8, q1032  ; 1 2
5361*c0909341SAndroid Build Coastguard Worker    shufps               m5, m8, m9, q1032  ; 3 4
5362*c0909341SAndroid Build Coastguard Worker    shufps               m6, m9, m3, q1032  ; 5 6
5363*c0909341SAndroid Build Coastguard Worker    psrldq              m11, m3, 8          ; 7 _
5364*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m7, m4 ; 01
5365*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m4     ; 12
5366*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m8, m5 ; 23
5367*c0909341SAndroid Build Coastguard Worker    punpckhwd            m8, m5     ; 34
5368*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m9, m6 ; 45
5369*c0909341SAndroid Build Coastguard Worker    punpckhwd            m9, m6     ; 56
5370*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m11    ; 67
5371*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m7
5372*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m8
5373*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m9
5374*c0909341SAndroid Build Coastguard Worker%else
5375*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m15
5376*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
5377*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*0]
5378*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
5379*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*2]
5380*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ss3q ]
5381*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5382*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
5383*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
5384*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
5385*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
5386*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
5387*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
5388*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
5389*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
5390*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
5391*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m6
5392*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
5393*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
5394*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
5395*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ss3q ]
5396*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
5397*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
5398*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5399*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
5400*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
5401*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
5402*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
5403*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
5404*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
5405*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m5
5406*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m6
5407*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m12
5408*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m12
5409*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12
5410*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12
5411*c0909341SAndroid Build Coastguard Worker    shufps               m4, m2, m7, q1032  ; 1 2
5412*c0909341SAndroid Build Coastguard Worker    shufps               m5, m7, m1, q1032  ; 3 4
5413*c0909341SAndroid Build Coastguard Worker    shufps               m6, m1, m3, q1032  ; 5 6
5414*c0909341SAndroid Build Coastguard Worker    psrldq               m0, m3, 8          ; 7 _
5415*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m0
5416*c0909341SAndroid Build Coastguard Worker %define m11 [esp+0x1a0]
5417*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m2, m4      ; 01
5418*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m4          ; 12
5419*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m7, m5      ; 23
5420*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m5          ; 34
5421*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m1, m6      ; 45
5422*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m6          ; 56
5423*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, [esp+0x1a0] ; 67
5424*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5425*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
5426*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m0 ; 01
5427*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m4 ; 23
5428*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m5 ; 45
5429*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1e0], m3 ; 67
5430*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m2 ; 12
5431*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m7 ; 34
5432*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m1 ; 56
5433*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m4
5434*c0909341SAndroid Build Coastguard Worker    SWAP                 m2, m5
5435*c0909341SAndroid Build Coastguard Worker%endif
; .w4_loop: vertical 8-tap pass for the 4-pixel-wide case; one output row per
; iteration. m0-m3 hold the four interleaved row pairs of the current 8-row
; window (01/23/45/67 right after setup; x86-32 mirrors them at
; [esp+0x1b0..0x1e0]); [rsp+0x00..0x20] holds the pairs offset by one row
; (12/34/56 right after setup).
5436*c0909341SAndroid Build Coastguard Worker.w4_loop:
; Keep only the low 10 bits of the vertical position.
5437*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff
5438*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; r6 = default coefficients (only byte 3 is set, to 64). r4d = my >> 6; the shr
; sets ZF and the following lea leaves flags untouched, so cmovnz replaces the
; default with a table entry only when my >> 6 is non-zero.
; t1 is set outside this chunk - presumably the base index of the vertical
; filter set; TODO confirm.
5439*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24
5440*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
5441*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6
5442*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4]
5443*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8]
; Sign-extend the 8 coefficient bytes to words, then broadcast each pair of
; coefficients (0-1, 2-3, 4-5, 6-7) across its own register.
5444*c0909341SAndroid Build Coastguard Worker    movq                m10, r6q
5445*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
5446*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
5447*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m10, q0000
5448*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q1111
5449*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q2222
5450*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
; Multiply each row pair by its coefficient pair and sum the four products
; into m4; m0-m3 are left intact for reuse.
; m13 is set outside this chunk - presumably the rounding bias for the shift
; by rndshift below; TODO confirm.
5451*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m7
5452*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8
5453*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m9
5454*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m10
5455*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
5456*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
5457*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
5458*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5459*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same computation with 32-bit GPRs. my is saved to mym first, and the
; 8 coefficient bytes are fetched as two dwords (r4 = low half, defaulting to
; 64 << 24; r3 = high half, defaulting to 0). [esp+0x1f4] plays the role that
; t1 has in the 64-bit path.
; NOTE(review): r4 appears to alias myd here (it is shifted right after myd is
; saved) - confirm against the register assignments outside this chunk.
5460*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
5461*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
5462*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
5463*c0909341SAndroid Build Coastguard Worker    shr                  r4, 6
5464*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+r4]
5465*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
5466*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0]
5467*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4]
; Rebuild the 8-byte coefficient vector from the two dwords, sign-extend to
; words and broadcast the four coefficient pairs into m4-m7.
5468*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
5469*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
5470*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6
5471*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7
5472*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8
5473*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000
5474*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q1111
5475*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q2222
5476*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333
; Accumulate in place: m0-m3 are overwritten here and are reloaded from their
; stack copies before the next iteration.
5477*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4
5478*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m5
5479*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m6
5480*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m7
5481*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
5482*c0909341SAndroid Build Coastguard Worker    paddd                m2, m3
5483*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
5484*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
; Rename so that the shared tail below finds the sum under the name m4.
5485*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m0
5486*c0909341SAndroid Build Coastguard Worker%endif
; Shared tail: shift the 32-bit sums right by rndshift and pack to 16 bits.
5487*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
5488*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
5489*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; put: saturate to unsigned bytes, store 4 pixels, step dst by dsmp.
5490*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
5491*c0909341SAndroid Build Coastguard Worker    movd             [dstq], m4
5492*c0909341SAndroid Build Coastguard Worker    add                dstq, dsmp
5493*c0909341SAndroid Build Coastguard Worker%else
; otherwise: store four 16-bit values, step tmp by 8 bytes.
5494*c0909341SAndroid Build Coastguard Worker    movq             [tmpq], m4
5495*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
5496*c0909341SAndroid Build Coastguard Worker%endif
; One row done; leave through .ret (defined outside this chunk) when h hits 0.
5497*c0909341SAndroid Build Coastguard Worker    dec                  hd
5498*c0909341SAndroid Build Coastguard Worker    jz .ret
5499*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Advance the vertical position. If no bit above the low 10 is set, the same
; source rows are reused and only the coefficients change.
5500*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
5501*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
5502*c0909341SAndroid Build Coastguard Worker    jz .w4_loop
5503*c0909341SAndroid Build Coastguard Worker%else
; x86-32: undo the m0/m4 renaming, reload my and r3 (r3 was used as scratch
; for the coefficients), then advance the vertical position by dym.
5504*c0909341SAndroid Build Coastguard Worker    SWAP                 m0, m4
5505*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5506*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5507*c0909341SAndroid Build Coastguard Worker    add                 myd, dym
5508*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
5509*c0909341SAndroid Build Coastguard Worker    jnz .w4_next_line
; Same source rows again: restore the row pairs clobbered by the in-place
; pmaddwd above.
5510*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1b0]
5511*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x1c0]
5512*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x1d0]
5513*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x1e0]
5514*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
5515*c0909341SAndroid Build Coastguard Worker.w4_next_line:
; x86-32: from here on m14 and m15 name the stack slots they were spilled to
; during setup ([esp+0x180] and [esp+0x190]).
5516*c0909341SAndroid Build Coastguard Worker  %define m14 [esp+0x180]
5517*c0909341SAndroid Build Coastguard Worker  %define m15 [esp+0x190]
5518*c0909341SAndroid Build Coastguard Worker%endif
; At least one new source row is needed. Load the next row; bit 10 of my set
; means a one-row advance (handled below), otherwise .w4_skip_line is taken.
5519*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq]
5520*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400
5521*c0909341SAndroid Build Coastguard Worker    jz .w4_skip_line
5522*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; Slide the window by one row: m0-m2 take the one-row-offset pairs from the
; stack, and the stack receives the old m1-m3.
5523*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x00]
5524*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m1
5525*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x10]
5526*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m2
5527*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x20]
5528*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m3
5529*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same exchange, done through memory. The old pairs at
; [esp+0x1c0..0x1e0] move to [rsp+0x00..0x20], and the pairs read from
; [rsp+0x00..0x20] become m0-m2 and are mirrored to [esp+0x1b0..0x1d0].
5530*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1c0]
5531*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x000]
5532*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m5
5533*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m0
5534*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+0x1d0]
5535*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x010]
5536*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m6
5537*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m1
5538*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1e0]
5539*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x020]
5540*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m7
5541*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m2
5542*c0909341SAndroid Build Coastguard Worker%endif
; Horizontally filter the new row: gather bytes with the m14 shuffle mask,
; multiply by the m15 coefficients, add adjacent products, scale by m12
; (m12 is set outside this chunk).
5543*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
5544*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
5545*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m4
5546*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
; m3 = newest pair: the previous last row (m11) interleaved with the new row.
5547*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m11, m4
5548*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
5549*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1e0], m3
5550*c0909341SAndroid Build Coastguard Worker%endif
; Remember the new row as the last row, step src one row down, and loop.
5551*c0909341SAndroid Build Coastguard Worker    mova                m11, m4
5552*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
5553*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
; .w4_skip_line: two-row advance for the 4-pixel-wide case. Reached from the
; .w4_loop tail with the first new row already loaded into m4 from [srcq].
; Loads the second new row, filters both horizontally, and slides both the
; register window and the one-row-offset window on the stack by two rows.
5554*c0909341SAndroid Build Coastguard Worker.w4_skip_line:
5555*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32: the row pairs were clobbered by the vertical filter, so fetch the
; already-shifted set directly (old pairs 1..3 become the new m0..m2).
5556*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1c0]
5557*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x1d0]
5558*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x1e0]
5559*c0909341SAndroid Build Coastguard Worker%endif
; Second new row, then step src two rows down.
5560*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
5561*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
; Pick up the two stack pairs that survive the shift.
5562*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+0x10]
5563*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x20]
; Horizontally filter both new rows. After phaddw the low half of m4 holds
; the first new row and the high half the second; m12 scales the result
; (m12 is set outside this chunk).
5564*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
5565*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
5566*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
5567*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
5568*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
5569*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12
; m5 = previous last row (m11) interleaved with the first new row; it becomes
; the newest entry of the stack window, whose older entries move down a slot.
5570*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m11, m4
5571*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m6
5572*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m7
5573*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m5
5574*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m11 = second new row (now the last row). Shift the register pairs down one
; slot and build the newest pair from the two new rows.
5575*c0909341SAndroid Build Coastguard Worker    psrldq              m11, m4, 8
5576*c0909341SAndroid Build Coastguard Worker    mova                 m0, m1
5577*c0909341SAndroid Build Coastguard Worker    mova                 m1, m2
5578*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
5579*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m11
5580*c0909341SAndroid Build Coastguard Worker%else
; x86-32: same result. The last row goes to its stack slot [esp+0x1a0], and
; all four register pairs are mirrored to [esp+0x1b0..0x1e0] for reloads.
5581*c0909341SAndroid Build Coastguard Worker    psrldq               m6, m4, 8
5582*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m4, m6
5583*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m6
5584*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m0
5585*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m1
5586*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m2
5587*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1e0], m3
5588*c0909341SAndroid Build Coastguard Worker%endif
5589*c0909341SAndroid Build Coastguard Worker    jmp .w4_loop
; Entry stubs for widths 8, 16, 32, 64 and 128. Each stores width/8 in the
; dword at [rsp+0x90]; .hloop_prep decrements that counter once per pass and
; returns when it reaches zero. Each stub also invokes
; movifprep tmp_stridem, 2*width (movifprep is defined outside this chunk -
; presumably it only emits the store in the prep variant; TODO confirm) and
; then continues at .w_start.
; INIT_XMM is an x86inc macro - presumably used here to reset the register
; naming after the SWAPs in the preceding code; TODO confirm.
5590*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
5591*c0909341SAndroid Build Coastguard Worker.w8:
5592*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 1
5593*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
5594*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5595*c0909341SAndroid Build Coastguard Worker.w16:
5596*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 2
5597*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
5598*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5599*c0909341SAndroid Build Coastguard Worker.w32:
5600*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 4
5601*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
5602*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5603*c0909341SAndroid Build Coastguard Worker.w64:
5604*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 8
5605*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
5606*c0909341SAndroid Build Coastguard Worker    jmp .w_start
5607*c0909341SAndroid Build Coastguard Worker.w128:
5608*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 16
5609*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; .w128 has no jmp: it falls through into .w_start.
; .w_start: common setup for widths >= 8. Computes the per-column horizontal
; positions for the first four columns and saves the values that
; .hloop_prep reloads, then enters .hloop.
5610*c0909341SAndroid Build Coastguard Worker.w_start:
5611*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
5612*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
5613*c0909341SAndroid Build Coastguard Worker%endif
5614*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; m15 (lane 0) = t0d >> 16. t0 is set outside this chunk - presumably it packs
; the horizontal filter base index in its upper half; TODO confirm.
5615*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
5616*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
5617*c0909341SAndroid Build Coastguard Worker%else
; x86-32 has only eight XMM registers: map the high register names used by
; the shared code below onto m0/m4/m3, and (prep only) read the source stride
; from its memory slot.
5618*c0909341SAndroid Build Coastguard Worker %define m8  m0
5619*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
5620*c0909341SAndroid Build Coastguard Worker %define m15 m3
5621*c0909341SAndroid Build Coastguard Worker %if isprep
5622*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
5623*c0909341SAndroid Build Coastguard Worker %endif
; [esp+0x1f0] is the stack counterpart of t0 here; r0 and my are reloaded
; from their memory slots.
5624*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+0x1f0]
5625*c0909341SAndroid Build Coastguard Worker    shr                  r4, 16
5626*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
5627*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
5628*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5629*c0909341SAndroid Build Coastguard Worker%endif
; Move src back 3 pixels - presumably so the 8-tap window starts 3 pixels left
; of the sample position; TODO confirm.
5630*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
5631*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; dx*4
5632*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
; Broadcast the value loaded into m15 above to all four lanes.
5633*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
5634*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; Save state that .hloop_prep reloads for every further pass:
;   [rsp+0x100] = dx*4, [rsp+0x120] = broadcast m15,
;   [rsp+0x098] = src pointer, [rsp+0x130] = dst/tmp pointer.
5635*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x100], m7
5636*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x120], m15
5637*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x098], srcq
5638*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x130], r0q ; dstq / tmpq
; Also save the row count (and, on x86-32, my) so each pass can restart with
; the original values.
5639*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 && UNIX64
5640*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
5641*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
5642*c0909341SAndroid Build Coastguard Worker    mov                  r5, hm
5643*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x094], myd
5644*c0909341SAndroid Build Coastguard Worker    mov         [esp+0x134], r5
5645*c0909341SAndroid Build Coastguard Worker%endif
5646*c0909341SAndroid Build Coastguard Worker    jmp .hloop
; .hloop_prep: prepare the next pass of the wide (w >= 8) path. Decrements
; the pass counter written by the .w8-.w128 stubs, restores the state saved
; by .w_start, and advances the output pointer and horizontal positions
; before falling through into .hloop.
5647*c0909341SAndroid Build Coastguard Worker.hloop_prep:
; Return once the pass counter at [rsp+0x90] reaches zero.
5648*c0909341SAndroid Build Coastguard Worker    dec   dword [rsp+0x090]
5649*c0909341SAndroid Build Coastguard Worker    jz .ret
; Advance the saved dst/tmp pointer by 8*(isprep+1) bytes - i.e. 8 bytes for
; put and 16 for prep, assuming isprep is 0 or 1 - and restore the row count.
5650*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5651*c0909341SAndroid Build Coastguard Worker    add   qword [rsp+0x130], 8*(isprep+1)
5652*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
5653*c0909341SAndroid Build Coastguard Worker%else
5654*c0909341SAndroid Build Coastguard Worker    add   dword [esp+0x130], 8*(isprep+1)
5655*c0909341SAndroid Build Coastguard Worker    mov                 myd, [esp+0x094]
5656*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x134]
5657*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+0x130]
5658*c0909341SAndroid Build Coastguard Worker%endif
; Reload dx*4 (m7) and the horizontal positions stored by the previous pass
; at [rsp+0x110] (mx+dx*[4-7], written in .hloop).
5659*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x100]
5660*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+0x110]
5661*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5662*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
5663*c0909341SAndroid Build Coastguard Worker%endif
; Restore the broadcast value saved from m15, clear m9, and rewind src to the
; pointer saved by .w_start.
5664*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x120]
5665*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
5666*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0x098]
5667*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5668*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0x130] ; dstq / tmpq
5669*c0909341SAndroid Build Coastguard Worker%else
; x86-32: write the restored my, h and output pointer back to their memory
; slots and reload r3 from r3m.
5670*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
5671*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5
5672*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
5673*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
5674*c0909341SAndroid Build Coastguard Worker%endif
; Step the positions by another dx*4: m14 now holds the positions of the
; first four columns of the next pass.
5675*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
5676*c0909341SAndroid Build Coastguard Worker.hloop:
5677*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5678*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pq_0x40000000]
5679*c0909341SAndroid Build Coastguard Worker%else
5680*c0909341SAndroid Build Coastguard Worker %define m11 [base+pq_0x40000000]
5681*c0909341SAndroid Build Coastguard Worker%endif
5682*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
5683*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m2
5684*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
5685*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
5686*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
5687*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
5688*c0909341SAndroid Build Coastguard Worker    psrldq               m2, m5, 8
5689*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5690*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
5691*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
5692*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
5693*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
5694*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
5695*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
5696*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
5697*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
5698*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
5699*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
5700*c0909341SAndroid Build Coastguard Worker%else
5701*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
5702*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
5703*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
5704*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
5705*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
5706*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
5707*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
5708*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
5709*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
5710*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
5711*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
5712*c0909341SAndroid Build Coastguard Worker %define m9 m2
5713*c0909341SAndroid Build Coastguard Worker%endif
5714*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
5715*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
5716*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
5717*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
5718*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
5719*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x110], m14
5720*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m15, 8
5721*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
5722*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
5723*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
5724*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
5725*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
5726*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
5727*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
5728*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
5729*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
5730*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
5731*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
5732*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5733*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m14, 8
5734*c0909341SAndroid Build Coastguard Worker    movd               r10d, m14
5735*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
5736*c0909341SAndroid Build Coastguard Worker    psrldq              m14, 4
5737*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
5738*c0909341SAndroid Build Coastguard Worker    movd               r13d, m14
5739*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
5740*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+ 0]
5741*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+ 8]
5742*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+ 4]
5743*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+12]
5744*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
5745*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
5746*c0909341SAndroid Build Coastguard Worker    pshufd              m14, m5, q1100
5747*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
5748*c0909341SAndroid Build Coastguard Worker    pand                 m7, m11, m4
5749*c0909341SAndroid Build Coastguard Worker    pand                 m8, m11, m6
5750*c0909341SAndroid Build Coastguard Worker    pand                m15, m11, m14
5751*c0909341SAndroid Build Coastguard Worker    pand                m11, m11, m5
5752*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
5753*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
5754*c0909341SAndroid Build Coastguard Worker    pandn               m14, m2
5755*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
5756*c0909341SAndroid Build Coastguard Worker    por                  m7, m4
5757*c0909341SAndroid Build Coastguard Worker    por                  m8, m6
5758*c0909341SAndroid Build Coastguard Worker    por                 m15, m14
5759*c0909341SAndroid Build Coastguard Worker    por                 m11, m5
5760*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m7
5761*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m8
5762*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m15
5763*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m11
5764*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1
5765*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m1
5766*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m2
5767*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3
5768*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m3
5769*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m4
5770*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5
5771*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7
5772*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m0
5773*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m14
5774*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x50]
5775*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x60]
5776*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x70]
5777*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+0x80]
5778*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5779*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
5780*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
5781*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
5782*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
5783*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
5784*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
5785*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
5786*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m9 ; 23a
5787*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m9     ; 23b
5788*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m4
5789*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m5
5790*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m6
5791*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m7
5792*c0909341SAndroid Build Coastguard Worker    SWAP                m14, m8
; x86-64 vertical pass for one output row of 8 columns.
; On entry m0/m1 and m2/m3 hold the first two interleaved row pairs (a/b
; halves) and [rsp+0x50..0x80] hold the last two pairs. The sums of the
; four pair*coefficient products plus m13 are left in m4 (a half) and
; m5 (b half). m0-m3 are not modified here.
; NOTE(review): m13 is presumably the rounding constant and t1 the filter-set
; offset into subpel_filters; both are set up outside this chunk - confirm.
5793*c0909341SAndroid Build Coastguard Worker.vloop:
5794*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff ; keep only the low 10 bits of my
5795*c0909341SAndroid Build Coastguard Worker    mov                 r6d, 64 << 24 ; fallback filter qword: byte 3 = 64, every other byte 0
5796*c0909341SAndroid Build Coastguard Worker    mov                 r4d, myd
5797*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 6 ; r4 = my >> 6; ZF is set when that index is 0
5798*c0909341SAndroid Build Coastguard Worker    lea                 r4d, [t1+r4] ; lea does not touch the flags from shr
5799*c0909341SAndroid Build Coastguard Worker    cmovnz              r6q, [base+subpel_filters+r4*8] ; index != 0: take the 8 coefficient bytes from the table
5800*c0909341SAndroid Build Coastguard Worker    movq                m11, r6q
5801*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m11 ; put each coefficient byte in both halves of a word
5802*c0909341SAndroid Build Coastguard Worker    psraw               m11, 8 ; arithmetic shift -> sign-extended 16-bit coefficients
5803*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m11, q0000 ; broadcast coefficients 0,1
5804*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m11, q1111 ; broadcast coefficients 2,3
5805*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m11, q2222 ; broadcast coefficients 4,5
5806*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333 ; broadcast coefficients 6,7
5807*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5, m0 ; first row pair, a half
5808*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m5, m1 ; first row pair, b half
5809*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m7, m2 ; second row pair, a half
5810*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m7, m3 ; second row pair, b half
5811*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
5812*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
5813*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5814*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5815*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x50], m10 ; third row pair, a half (from the stack)
5816*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x60], m10 ; third row pair, b half
5817*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [rsp+0x70], m11 ; fourth row pair, a half
5818*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, [rsp+0x80], m11 ; fourth row pair, b half
5819*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
5820*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
5821*c0909341SAndroid Build Coastguard Worker    paddd                m4, m8
5822*c0909341SAndroid Build Coastguard Worker    paddd                m5, m9
5823*c0909341SAndroid Build Coastguard Worker%else
5824*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
5825*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
5826*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
5827*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
5828*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
5829*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
5830*c0909341SAndroid Build Coastguard Worker    mova                m14, [esp+0x110]
5831*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
5832*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
5833*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
5834*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
5835*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
5836*c0909341SAndroid Build Coastguard Worker    mova           [esp+16], m14
5837*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0]
5838*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
5839*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
5840*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
5841*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x20], m0
5842*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m1
5843*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m2
5844*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m3
5845*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
5846*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
5847*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
5848*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
5849*c0909341SAndroid Build Coastguard Worker    pand                 m0, m11, m4
5850*c0909341SAndroid Build Coastguard Worker    pand                 m1, m11, m6
5851*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m7
5852*c0909341SAndroid Build Coastguard Worker    pand                 m3, m11, m5
5853*c0909341SAndroid Build Coastguard Worker    pandn                m4, [esp+0x20]
5854*c0909341SAndroid Build Coastguard Worker    pandn                m6, [esp+0x30]
5855*c0909341SAndroid Build Coastguard Worker    pandn                m7, [esp+0x40]
5856*c0909341SAndroid Build Coastguard Worker    pandn                m5, [esp+0x50]
5857*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
5858*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
5859*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
5860*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
5861*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x20], m0
5862*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m1
5863*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m2
5864*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m3
5865*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x140, 0 ; 0-1
5866*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x160    ; 2-3
5867*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x180    ; 4-5
5868*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1a0    ; 6-7
5869*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x180]
5870*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+0x190]
5871*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1a0]
5872*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1b0]
5873*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
5874*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
5875*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
5876*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
5877*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
5878*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m4
5879*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m5
5880*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m6
5881*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m7
5882*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x140]
5883*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x150]
5884*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x160]
5885*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x170]
5886*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
5887*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
5888*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
5889*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
5890*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x140], m0
5891*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x150], m1
5892*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x160], m2
5893*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x170], m3
; x86-32 vertical pass for one output row of 8 columns.
; On entry m0-m3 hold the first two interleaved row pairs and
; [esp+0x180..0x1b0] hold the last two. Unlike the x86-64 variant, m0-m3 are
; multiplied in place and therefore destroyed; the callers of this label
; reload them from [esp+0x140..0x170]. The sums end up in m4 (a half) and
; m5 (b half).
; NOTE(review): the code masks myd and then shifts r4 without a copy, so myd
; is presumably an alias of r4 in this build - confirm against the defines.
; NOTE(review): [esp+0x1f4] presumably holds the filter-set offset - confirm.
5894*c0909341SAndroid Build Coastguard Worker.vloop:
5895*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; reload r0 from its stack slot
5896*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
5897*c0909341SAndroid Build Coastguard Worker    and                 myd, 0x3ff ; keep only the low 10 bits of my
5898*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd ; store the masked value back to memory
5899*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3 ; fallback high dword of the filter = 0
5900*c0909341SAndroid Build Coastguard Worker    shr                  r4, 6 ; filter index; ZF is set when it is 0
5901*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+r4] ; lea does not touch the flags from shr
5902*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24 ; fallback low dword: byte 3 = 64 (mov keeps the flags)
5903*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0] ; index != 0: low 4 coefficient bytes from the table
5904*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4] ; index != 0: high 4 coefficient bytes from the table
5905*c0909341SAndroid Build Coastguard Worker    movd                 m7, r4
5906*c0909341SAndroid Build Coastguard Worker    movd                 m6, r3
5907*c0909341SAndroid Build Coastguard Worker    punpckldq            m7, m6 ; join both dwords into one 8-byte filter
5908*c0909341SAndroid Build Coastguard Worker    punpcklbw            m7, m7 ; put each coefficient byte in both halves of a word
5909*c0909341SAndroid Build Coastguard Worker    psraw                m7, 8 ; arithmetic shift -> sign-extended 16-bit coefficients
5910*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m7, q0000 ; broadcast coefficients 0,1
5911*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m7, q1111 ; broadcast coefficients 2,3
5912*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m4 ; first row pair, a half (in place)
5913*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m4 ; first row pair, b half (in place)
5914*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m5 ; second row pair, a half (in place)
5915*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m5 ; second row pair, b half (in place)
5916*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m7, q2222 ; broadcast coefficients 4,5
5917*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3333 ; broadcast coefficients 6,7
5918*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
5919*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
5920*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [esp+0x180], m6 ; third row pair, a half
5921*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [esp+0x190], m6 ; third row pair, b half
5922*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [esp+0x1a0], m7 ; fourth row pair, a half
5923*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, [esp+0x1b0], m7 ; fourth row pair, b half
5924*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
5925*c0909341SAndroid Build Coastguard Worker    paddd                m1, m3
5926*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
5927*c0909341SAndroid Build Coastguard Worker    paddd                m1, m13
5928*c0909341SAndroid Build Coastguard Worker    paddd                m4, m0 ; final sums go to m4/m5, as the shared code after %endif expects
5929*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1
5930*c0909341SAndroid Build Coastguard Worker%endif
; Shared by both architectures: scale the 32-bit sums in m4/m5, pack them and
; store one 8-pixel output row, then count the row down.
5931*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift ; arithmetic right shift of the dword sums
5932*c0909341SAndroid Build Coastguard Worker    psrad                m5, rndshift
5933*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5 ; 8 dwords -> 8 signed-saturated words
5934*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
5935*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4 ; words -> unsigned-saturated bytes
5936*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m4 ; write 8 bytes
5937*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm ; step dst by the value held in dsm
5938*c0909341SAndroid Build Coastguard Worker%else
5939*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4 ; write 8 words with an aligned store
5940*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem ; step tmp by the value held in tmp_stridem
5941*c0909341SAndroid Build Coastguard Worker%endif
5942*c0909341SAndroid Build Coastguard Worker    dec                  hd ; one fewer row left
5943*c0909341SAndroid Build Coastguard Worker    jz .hloop_prep ; no rows left: continue at .hloop_prep (defined outside this chunk)
5944*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; x86-64: advance my by dy and decide how the row window moves.
;   - no bit above the low 10 set   -> same rows, back to .vloop
;   - bit 0x400 clear (a higher bit set) -> .skip_line
;   - bit 0x400 set                 -> fall through: filter one new source row
;     horizontally and slide the interleaved row pairs down by one row
;     (01/23/45/67 become 12/34/56/78).
; NOTE(review): this reads as "step by one row" vs "step by two rows", which
; assumes my + dy never reaches 3 << 10 - confirm against the dy range.
5945*c0909341SAndroid Build Coastguard Worker    add                 myd, dyd
5946*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
5947*c0909341SAndroid Build Coastguard Worker    jz .vloop
5948*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400 ; result is consumed by `jz .skip_line`; the movs in between keep the flags
5949*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x140], myd ; spill my, reloaded below once the loads are done
5950*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+ 0] ; reload the four dwords kept at rsp+0..12, used as source offsets below
5951*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+ 8]
5952*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+ 4]
5953*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+12]
5954*c0909341SAndroid Build Coastguard Worker    jz .skip_line
5955*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+unpckw] ; byte-shuffle mask (table defined outside this chunk)
5956*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+r10] ; 8 source bytes at each of the eight per-column offsets
5957*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+r11]
5958*c0909341SAndroid Build Coastguard Worker    movhps               m6, [srcq+r13]
5959*c0909341SAndroid Build Coastguard Worker    movhps               m7, [srcq+ rX]
5960*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ r4]
5961*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ r6]
5962*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ r7]
5963*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+ r9]
5964*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance the source pointer by ssq
5965*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+0x140]
5966*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
5967*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m14, q1032 ; same mask with its two qword halves swapped
5968*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14                ; 0a 1a
5969*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14                ; 0b 1b
5970*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m9                 ; 3a 2a
5971*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m9                 ; 3b 2b
5972*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, [rsp+0x30] ; multiply by the horizontal filters stored at rsp+0x10..0x40
5973*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, [rsp+0x40]
5974*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, [rsp+0x10]
5975*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [rsp+0x20]
5976*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7 ; horizontal adds reduce the products to one word per column
5977*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
5978*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m6
5979*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12 ; rounded scale by m12 -> the new row (row 8 in the comments below)
5980*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [rsp+0x50], m14    ; 4a 5a
5981*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [rsp+0x60], m14    ; 4b 5b
5982*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [rsp+0x70], m9     ; 7a 6a
5983*c0909341SAndroid Build Coastguard Worker    pshufb               m8, [rsp+0x80], m9     ; 7b 6b
5984*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2 ; 12a
5985*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3 ; 12b
5986*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m5 ; 34a
5987*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6 ; 34b
5988*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m7 ; 56a
5989*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m8 ; 56b
5990*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4 ; 78a
5991*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4 ; bring the high four words of the new row down
5992*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m4 ; 78b
5993*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m5 ; the last two pairs live on the stack; m0-m3 keep the first two
5994*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m6
5995*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m7
5996*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m8
5997*c0909341SAndroid Build Coastguard Worker    jmp .vloop
; x86-64: reached when my crossed the 10-bit range with bit 0x400 clear.
; Runs the horizontal filter macro once more and moves the row window by a
; whole pair: old 23 -> 01, old 45 -> 23, old 67 -> 45, new rows -> 67.
; Execution then continues past the x86-32 branch to the shared `jmp .vloop`.
; NOTE(review): MC_8TAP_SCALED_H is defined outside this chunk; it presumably
; leaves the two new filtered rows in m4 and m8 (they are interleaved into
; 67a/67b below) and uses m0, m1, m14, m15 as its filter operands - confirm.
5998*c0909341SAndroid Build Coastguard Worker.skip_line:
5999*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x10] ; horizontal filters saved during setup
6000*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x20]
6001*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+0x30]
6002*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x40]
6003*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15
6004*c0909341SAndroid Build Coastguard Worker    mov                 myd, [rsp+0x140] ; restore my spilled before the branch here
6005*c0909341SAndroid Build Coastguard Worker    mov                 dyd, dym
6006*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2         ; 01a
6007*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3         ; 01b
6008*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x50] ; 23a
6009*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x60] ; 23b
6010*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x70] ; 45a
6011*c0909341SAndroid Build Coastguard Worker    mova                 m6, [rsp+0x80] ; 45b
6012*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4, m8     ; 67a
6013*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m8         ; 67b
6014*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m5 ; write the last two pairs back to their stack slots
6015*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m6
6016*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m7
6017*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m4
6018*c0909341SAndroid Build Coastguard Worker%else
; x86-32: advance my by dy. When no bit above the low 10 is set the row
; window is unchanged, but the x86-32 .vloop multiplies m0-m3 in place, so the
; first two row pairs are reloaded from [esp+0x140..0x170] before looping.
6019*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0 ; save r0 to its stack slot; .vloop reloads it from there
6020*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6021*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6022*c0909341SAndroid Build Coastguard Worker    add                 myd, dym
6023*c0909341SAndroid Build Coastguard Worker    test                myd, ~0x3ff
6024*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd ; mov keeps the flags from test for the jnz below
6025*c0909341SAndroid Build Coastguard Worker    jnz .next_line
6026*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x140] ; first pair, a half
6027*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x150] ; first pair, b half
6028*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x160] ; second pair, a half
6029*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x170] ; second pair, b half
6030*c0909341SAndroid Build Coastguard Worker    jmp .vloop
; x86-32: my crossed the 10-bit range. With bit 0x400 clear, branch to
; .skip_line; otherwise filter one new source row horizontally and slide the
; interleaved row pairs down by one row (01/23/45/67 become 12/34/56/78).
; All eight pairs live on the stack at [esp+0x140..0x1b0]; m0-m3 are left
; holding the first two pairs for .vloop.
6031*c0909341SAndroid Build Coastguard Worker.next_line:
6032*c0909341SAndroid Build Coastguard Worker    test                myd, 0x400 ; result is consumed by `jz .skip_line`; the movs in between keep the flags
6033*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0] ; first four source offsets, kept at esp+0..12
6034*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
6035*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
6036*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
6037*c0909341SAndroid Build Coastguard Worker    jz .skip_line
6038*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+unpckw] ; byte-shuffle mask (table defined outside this chunk)
6039*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x140] ; 01a
6040*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x150] ; 01b
6041*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x180] ; 45a
6042*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+r0] ; 8 source bytes at each of the first four offsets
6043*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+rX]
6044*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+r4]
6045*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+r5]
6046*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6         ; 0a 1a
6047*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6         ; 0b 1b
6048*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6         ; 4a 5a
6049*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+16] ; second four source offsets, kept at esp+16..28
6050*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+24]
6051*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+20]
6052*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+28]
6053*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+r0]
6054*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+rX]
6055*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+r4]
6056*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+r5]
6057*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq ; advance the source pointer by ssq
6058*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, [esp+0x20] ; multiply by the horizontal filters stored at esp+0x20..0x50
6059*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [esp+0x30]
6060*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, [esp+0x40]
6061*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, [esp+0x50]
6062*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5 ; horizontal adds reduce the products to one word per column
6063*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m2
6064*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x190] ; 45b
6065*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x160] ; 23a
6066*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m3
6067*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x170] ; 23b
6068*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12        ; 8a 8b
6069*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6070*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m6         ; 4b 5b
6071*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1032 ; swap the mask's two qword halves for the reversed-order shuffles
6072*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6         ; 3a 2a
6073*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6         ; 3b 2b
6074*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2         ; 12a
6075*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3         ; 12b
6076*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x140], m0
6077*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x150], m1
6078*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1a0] ; 67a
6079*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x1b0] ; 67b
6080*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m7         ; 34a
6081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5         ; 34b
6082*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x160], m2
6083*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x170], m3
6084*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6         ; 7a 6a
6085*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6         ; 7b 6b
6086*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0         ; 56a
6087*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1         ; 56b
6088*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4 ; 78a
6089*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4 ; bring the high four words of the new row down
6090*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4 ; 78b
6091*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m7
6092*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m5
6093*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m0
6094*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m1
6095*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x140] ; m0/m1 were reused as scratch; reload the first pair (m2/m3 still hold the second)
6096*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x150]
6097*c0909341SAndroid Build Coastguard Worker    jmp .vloop
; x86-32: reached when my crossed the 10-bit range with bit 0x400 clear.
; Runs the horizontal filter macro once more and moves the row window by a
; whole pair through the stack slots: 0x160/0x170 -> 0x140/0x150,
; 0x180/0x190 -> 0x160/0x170, 0x1a0/0x1b0 -> 0x180/0x190, and the newly
; filtered rows, interleaved, -> 0x1a0/0x1b0. m0-m3 end up holding the new
; first two pairs, and execution falls through to the shared `jmp .vloop`.
; NOTE(review): MC_8TAP_SCALED_H is defined outside this chunk; it presumably
; writes its two output rows to [esp+0x1c0] and [esp+0x1d0], which are read
; back below - confirm.
6098*c0909341SAndroid Build Coastguard Worker.skip_line:
6099*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1c0, 0
6100*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6101*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x160] ; old 23a, becomes 01a
6102*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x170] ; old 23b, becomes 01b
6103*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x180] ; old 45a, becomes 23a
6104*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x190] ; old 45b, becomes 23b
6105*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x140], m0
6106*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x150], m1
6107*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x1a0] ; old 67a, becomes 45a
6108*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1b0] ; old 67b, becomes 45b
6109*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x160], m2
6110*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x170], m3
6111*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+0x1c0] ; the two rows just filtered
6112*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1d0]
6113*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m4
6114*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m5
6115*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m6, m7 ; new 67a
6116*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m7 ; new 67b
6117*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m4
6118*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m6
6119*c0909341SAndroid Build Coastguard Worker%endif
6120*c0909341SAndroid Build Coastguard Worker    jmp .vloop
6121*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
    ; INIT_XMM re-initialises x86inc's XMM register naming.
    ; NOTE(review): presumably placed here to discard SWAP permutations left
    ; over from the code above -- confirm.
    ;
    ; .dy1: dispatch through a table of 16-bit entries. The entry selected by
    ; wq (scaled by 2 bytes) is zero-extended, base_reg is added to turn it
    ; into an absolute address, and control jumps there.
    ; NOTE(review): wq is presumably a width-derived index selecting one of
    ; the .dy1_w* labels -- the table contents are defined elsewhere.
6122*c0909341SAndroid Build Coastguard Worker.dy1:
6123*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
6124*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg   ; table entry + base_reg = jump target
6125*c0909341SAndroid Build Coastguard Worker    jmp                  wq
6126*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
    ; .dy1_w2 -- two-pixel-wide path; only assembled when macro argument %1
    ; is "put".
    ;
    ; Outline:
    ;   1. Build per-pixel horizontal positions in m14, derive two filter
    ;      indices and a byte-gather shuffle from them.
    ;   2. Load seven source rows, apply the horizontal filter, and arrange
    ;      the results as interleaved row pairs (01 12 / 23 34 / 45 56).
    ;   3. Loop: fetch two more rows per iteration, form the 67 78 pair,
    ;      accumulate the 8-tap vertical sum, round, pack to bytes and store
    ;      two pixels to each of two destination rows.
    ;
    ; Values consumed here but set up before this label: m8, m9, m10, m12,
    ; m13, m14, t0/t1 (x86-64) or [esp+0x1f0]/[esp+0x1f4] (x86-32).
    ; NOTE(review): m12/m13 look like the rounding multiplier/bias and m10 a
    ; fraction mask -- confirm against the function prologue.
6127*c0909341SAndroid Build Coastguard Worker.dy1_w2:
6128*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6129*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6130*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b        ; keep only the low byte of t0
6131*c0909341SAndroid Build Coastguard Worker    dec                srcq             ; step src back by one byte
6132*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d        ; base filter index (broadcast below)
6133*c0909341SAndroid Build Coastguard Worker %else
    ; x86-32 has only xmm0-7: map the high register names used by the shared
    ; code onto low registers.
6134*c0909341SAndroid Build Coastguard Worker  %define m8  m0
6135*c0909341SAndroid Build Coastguard Worker  %define m9  m1
6136*c0909341SAndroid Build Coastguard Worker  %define m14 m4
6137*c0909341SAndroid Build Coastguard Worker  %define m15 m3
6138*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0] ; same role as t0b in the x86-64 branch
6139*c0909341SAndroid Build Coastguard Worker    dec                srcd
6140*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
6141*c0909341SAndroid Build Coastguard Worker %endif
    ; Interleave the low dwords of m9 and m8; SWAP only renames registers so
    ; that the interleaved result is called m8 from here on.
6142*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
6143*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m9
6144*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-1]
6145*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6146*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
6147*c0909341SAndroid Build Coastguard Worker %else
    ; No spare register on x86-32: use the constant as a memory operand.
6148*c0909341SAndroid Build Coastguard Worker  %define m11 [base+pd_0x4000]
6149*c0909341SAndroid Build Coastguard Worker %endif
6150*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000 ; broadcast the base filter index
6151*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10   ; m8 = position & m10
6152*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6          ; ... >> 6 = filter index within the set
6153*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8         ; absolute index into subpel_filters
    ; Extract the two per-pixel filter indices into GPRs.
6154*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
6155*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
6156*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6157*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
6158*c0909341SAndroid Build Coastguard Worker %else
    ; r3 is borrowed as a temporary here and restored from r3m further down.
6159*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15
6160*c0909341SAndroid Build Coastguard Worker %endif
6161*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_dw]
6162*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_s_shuf2]
    ; Table entries are 8 bytes apart; movd at +2 fetches bytes 2..5 of an
    ; entry, i.e. four filter bytes per pixel.
6163*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2]
6164*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6165*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
6166*c0909341SAndroid Build Coastguard Worker %else
6167*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
6168*c0909341SAndroid Build Coastguard Worker %endif
6169*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
6170*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9         ; all-ones in lanes whose index part is 0
6171*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10         ; drop the low 10 bits of each position
6172*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
6173*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m        ; restore r3 after its use as an index
    ; Build the source-byte gather shuffle now and spill it, since m14 must
    ; live in memory on x86-32.
6174*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
6175*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6176*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x00], m14
6177*c0909341SAndroid Build Coastguard Worker  %define m14 [esp+0x00]
    ; Pure renaming (SWAP emits no instructions).
    ; NOTE(review): this appears to free the names m0/m3 for the source rows
    ; loaded below while the mask/filter stay reachable as m8/m15 -- confirm.
6178*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
6179*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
6180*c0909341SAndroid Build Coastguard Worker  %define m8  m5
6181*c0909341SAndroid Build Coastguard Worker  %define m15 m6
6182*c0909341SAndroid Build Coastguard Worker %endif
    ; Rows 0..3: two 8-byte rows per register (even row low, odd row high).
6183*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]
6184*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+ssq*2]
6185*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*1]
6186*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+ss3q ]
6187*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
6188*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
    ; Vertical filter selection. The flags written by shr survive the mov and
    ; lea that follow, so cmovnz tests (my >> 6) != 0. When it is zero, r4
    ; keeps 64 << 24 (byte 3 = 64, all other filter bytes 0).
6189*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6190*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
6191*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6192*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
    ; Gather shuffle: replicate the per-pixel offsets with bdct_lb_dw, then
    ; add the subpel_s_shuf2 byte pattern.
6193*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5
6194*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6195*c0909341SAndroid Build Coastguard Worker    movq                m10, r4         ; 8 vertical filter bytes
6196*c0909341SAndroid Build Coastguard Worker %else
    ; Same selection as the x86-64 branch, done in two 32-bit halves:
    ; r4 = low four filter bytes, r3 = high four (zero for the default).
    ; [esp+0x1f4] plays the role t1 has in the x86-64 branch.
6197*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6198*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f4]
6199*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3
6200*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6          ; sets ZF for both cmovnz below
6201*c0909341SAndroid Build Coastguard Worker    lea                  r5, [r5+myd]
6202*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6203*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r5*8+0]
6204*c0909341SAndroid Build Coastguard Worker    cmovnz               r3, [base+subpel_filters+r5*8+4]
6205*c0909341SAndroid Build Coastguard Worker  %define m10 m4
6206*c0909341SAndroid Build Coastguard Worker    movd                m10, r4
6207*c0909341SAndroid Build Coastguard Worker    movd                 m3, r3
6208*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m        ; restore r3 again
6209*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m3         ; low qword = 8 vertical filter bytes
6210*c0909341SAndroid Build Coastguard Worker %endif
    ; Rows 4..6 (m3 holds row 6 in its low half only).
6211*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
6212*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*2]
6213*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*1]
6214*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
    ; Sign-extend the 8 vertical filter bytes to 16-bit words.
6215*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
6216*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
    ; Pack both pixels' horizontal filter bytes into one qword and duplicate
    ; it to the upper qword (each register holds two rows).
6217*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7
6218*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m15
    ; Per-lane select: lanes flagged in m8 take the pd_0x4000 constant instead
    ; of the table filter: m15 = (m8 & 0x4000) | (~m8 & filter).
6219*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6220*c0909341SAndroid Build Coastguard Worker    pand                m11, m8
6221*c0909341SAndroid Build Coastguard Worker %else
6222*c0909341SAndroid Build Coastguard Worker    pand                 m7, m11, m8
6223*c0909341SAndroid Build Coastguard Worker  %define m11 m7
6224*c0909341SAndroid Build Coastguard Worker %endif
6225*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
6226*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8         ; rename: the pandn result is now m15
6227*c0909341SAndroid Build Coastguard Worker    por                 m15, m11
    ; Broadcast the vertical filter as word pairs:
    ; m8 = words 0,1  m9 = words 2,3  m10 = words 4,5  m11 = words 6,7.
6228*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
6229*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q0000
6230*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q1111
6231*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m10, q3333
6232*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q2222
6233*c0909341SAndroid Build Coastguard Worker %else
    ; x86-32: spill the horizontal filter and the four vertical word pairs to
    ; the stack; m8..m11 and m15 become memory operands from here on.
6234*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x10], m15
6235*c0909341SAndroid Build Coastguard Worker  %define m15 [esp+0x10]
6236*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m        ; reload r0 from its argument slot
6237*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q0000
6238*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m4, q1111
6239*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m4, q2222
6240*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q3333
6241*c0909341SAndroid Build Coastguard Worker  %define m8  [esp+0x20]
6242*c0909341SAndroid Build Coastguard Worker  %define m9  [esp+0x30]
6243*c0909341SAndroid Build Coastguard Worker  %define m10 [esp+0x40]
6244*c0909341SAndroid Build Coastguard Worker  %define m11 [esp+0x50]
6245*c0909341SAndroid Build Coastguard Worker    mova                 m8, m5
6246*c0909341SAndroid Build Coastguard Worker    mova                 m9, m6
6247*c0909341SAndroid Build Coastguard Worker    mova                m10, m7
6248*c0909341SAndroid Build Coastguard Worker    mova                m11, m4
6249*c0909341SAndroid Build Coastguard Worker %endif
    ; Horizontal pass over rows 0..6: gather the source bytes per pixel,
    ; multiply-accumulate with the filter bytes, sum adjacent words, then
    ; scale by m12 with rounding (pmulhrsw).
6250*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
6251*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
6252*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
6253*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
6254*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
6255*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
6256*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
6257*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
6258*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2         ; rows 0 1 2 3 (one dword per row)
6259*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m3         ; rows 4 5 6 _
6260*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12
6261*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12
6262*c0909341SAndroid Build Coastguard Worker    palignr              m2, m1, m0, 4  ; rows 1 2 3 4
6263*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m1, q2121  ; rows 5 6 5 6 (row 6 in the top dword)
6264*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m0, m2     ; 01 12
6265*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2         ; 23 34
6266*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m1, m4     ; 45 56
    ; Loop state on entry: m3 = 01 12, m0 = 23 34, m2 = 45 56, and the top
    ; dword of m4 holds the newest filtered row. Each pass yields two output
    ; rows of two pixels. (Row numbers in the comments are those of the first
    ; pass.)
6267*c0909341SAndroid Build Coastguard Worker.dy1_w2_loop:
6268*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*0]
6269*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ssq*1]
6270*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
6271*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m3, m8     ; first row pair  * filter words 0,1
6272*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m0, m9     ; second row pair * filter words 2,3
6273*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, m10    ; third row pair  * filter words 4,5
6274*c0909341SAndroid Build Coastguard Worker    mova                 m3, m0         ; slide the window by two rows
6275*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2
6276*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13        ; add the constant held in m13
6277*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
    ; Horizontal filter for the two new rows; phaddw m1, m1 leaves them as
    ; [7 8 7 8].
6278*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
6279*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
6280*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m1
6281*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12
6282*c0909341SAndroid Build Coastguard Worker    palignr              m7, m1, m4, 12 ; [6 7 8 7]: row 6 is m4's top dword
6283*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m7, m1     ; 67 78
6284*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, m11    ; fourth row pair * filter words 6,7
6285*c0909341SAndroid Build Coastguard Worker    mova                 m4, m1         ; keep the newest row for the next pass
6286*c0909341SAndroid Build Coastguard Worker    paddd                m5, m6
6287*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6288*c0909341SAndroid Build Coastguard Worker    psrad                m5, rndshift
    ; Narrow dwords -> words -> bytes; the final pack saturates to unsigned
    ; bytes.
6289*c0909341SAndroid Build Coastguard Worker    packssdw             m5, m5
6290*c0909341SAndroid Build Coastguard Worker    packuswb             m5, m5
    ; Low 16 bits = two pixels of the first output row, next 16 bits = the
    ; second output row.
6291*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
6292*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r4w
6293*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 16
6294*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r4w
6295*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6296*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2          ; two rows done
6297*c0909341SAndroid Build Coastguard Worker    jg .dy1_w2_loop
6298*c0909341SAndroid Build Coastguard Worker    RET
6299*c0909341SAndroid Build Coastguard Worker%endif
6300*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
6301*c0909341SAndroid Build Coastguard Worker.dy1_w4:
6302*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6303*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6304*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
6305*c0909341SAndroid Build Coastguard Worker    dec                srcq
6306*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
6307*c0909341SAndroid Build Coastguard Worker%else
6308*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
6309*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0x4000]
6310*c0909341SAndroid Build Coastguard Worker %define m8  m0
6311*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
6312*c0909341SAndroid Build Coastguard Worker %define m15 m3
6313*c0909341SAndroid Build Coastguard Worker %if isprep
6314*c0909341SAndroid Build Coastguard Worker  %define ssq r3
6315*c0909341SAndroid Build Coastguard Worker %endif
6316*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0]
6317*c0909341SAndroid Build Coastguard Worker    dec                srcq
6318*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
6319*c0909341SAndroid Build Coastguard Worker%endif
6320*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
6321*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6322*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
6323*c0909341SAndroid Build Coastguard Worker%endif
6324*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
6325*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
6326*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
6327*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
6328*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
6329*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m15, 8
6330*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6331*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
6332*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
6333*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
6334*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
6335*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
6336*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
6337*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+ r4*8+2]
6338*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r11*8+2]
6339*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+ r6*8+2]
6340*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
6341*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6342*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
6343*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6344*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
6345*c0909341SAndroid Build Coastguard Worker%else
6346*c0909341SAndroid Build Coastguard Worker    movd                 r1, m15
6347*c0909341SAndroid Build Coastguard Worker    movd                 r3, m7
6348*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
6349*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
6350*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
6351*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
6352*c0909341SAndroid Build Coastguard Worker %define m15 m5
6353*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
6354*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r1*8+2]
6355*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r3*8+2]
6356*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
6357*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r5*8+2]
6358*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6359*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+0x1f4]
6360*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
6361*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6362*c0909341SAndroid Build Coastguard Worker    lea                  rX, [rX+myd]
6363*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6364*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+rX*8+0]
6365*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+rX*8+4]
6366*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6367*c0909341SAndroid Build Coastguard Worker %if isprep
6368*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
6369*c0909341SAndroid Build Coastguard Worker %endif
6370*c0909341SAndroid Build Coastguard Worker%endif
6371*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m3
6372*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4
6373*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m2
6374*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+subpel_s_shuf2]
6375*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6376*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9
6377*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6378*c0909341SAndroid Build Coastguard Worker    pshufb              m14, [base+bdct_lb_dw]
6379*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
6380*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
6381*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2]
6382*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ss3q ]
6383*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
6384*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
6385*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
6386*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
6387*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*2]
6388*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
6389*c0909341SAndroid Build Coastguard Worker    pand                m11, m8
6390*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
6391*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
6392*c0909341SAndroid Build Coastguard Worker    por                 m15, m11
6393*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6394*c0909341SAndroid Build Coastguard Worker    movq                m10, r4q
6395*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10
6396*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
6397*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
6398*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
6399*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
6400*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
6401*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
6402*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
6403*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
6404*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
6405*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
6406*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
6407*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
6408*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
6409*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
6410*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
6411*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m1
6412*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3
6413*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
6414*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7, m7
6415*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12    ; 0 1
6416*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m12    ; 2 3
6417*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12    ; 4 5
6418*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m12    ; 6 _
6419*c0909341SAndroid Build Coastguard Worker    shufps               m1, m0, m2, q1032  ; 1 2
6420*c0909341SAndroid Build Coastguard Worker    shufps               m3, m2, m4, q1032  ; 3 4
6421*c0909341SAndroid Build Coastguard Worker    shufps               m5, m4, m6, q1032  ; 5 6
6422*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m0, m1 ; 01
6423*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m1     ; 12
6424*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m2, m3 ; 23
6425*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m3     ; 34
6426*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m4, m5 ; 45
6427*c0909341SAndroid Build Coastguard Worker    punpckhwd            m4, m5     ; 56
6428*c0909341SAndroid Build Coastguard Worker%else
6429*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
6430*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m3
6431*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6432*c0909341SAndroid Build Coastguard Worker    pshufb              m14, [base+bdct_lb_dw]
6433*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
6434*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*1]
6435*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
6436*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
6437*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
6438*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
6439*c0909341SAndroid Build Coastguard Worker    pand                 m7, m11, m8
6440*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
6441*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
6442*c0909341SAndroid Build Coastguard Worker    por                 m15, m7
6443*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
6444*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
6445*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
6446*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
6447*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
6448*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
6449*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
6450*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
6451*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
6452*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
6453*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
6454*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
6455*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
6456*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x00], m14
6457*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x10], m15
6458*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
6459*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
6460*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
6461*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m2
6462*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ss3q ]
6463*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
6464*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6465*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0
6466*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
6467*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
6468*c0909341SAndroid Build Coastguard Worker %define m14 [esp+0x00]
6469*c0909341SAndroid Build Coastguard Worker %define m15 [esp+0x10]
6470*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m6
6471*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m2
6472*c0909341SAndroid Build Coastguard Worker    movd                 m6, r4
6473*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5
6474*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m0
6475*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m6
6476*c0909341SAndroid Build Coastguard Worker    psraw                m6, 8
6477*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x20], m6
6478*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12 ; 0 1
6479*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12 ; 2 3
6480*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m12 ; 4 5
6481*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m2, m12 ; 6 _
6482*c0909341SAndroid Build Coastguard Worker    shufps               m0, m1, m3, q1032  ; 1 2
6483*c0909341SAndroid Build Coastguard Worker    shufps               m4, m3, m7, q1032  ; 3 4
6484*c0909341SAndroid Build Coastguard Worker    shufps               m5, m7, m2, q1032  ; 5 6
6485*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m1, m0 ; 01
6486*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m0     ; 12
6487*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m1
6488*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m3, m4 ; 23
6489*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4     ; 34
6490*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m3
6491*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m7, m5 ; 45
6492*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m5     ; 56
6493*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m7
6494*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x60], m2
6495*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x20]
6496*c0909341SAndroid Build Coastguard Worker %xdefine m8 m1
6497*c0909341SAndroid Build Coastguard Worker %xdefine m9 m3
6498*c0909341SAndroid Build Coastguard Worker %xdefine m10 m0
6499*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m6
6500*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m4
6501*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m2
6502*c0909341SAndroid Build Coastguard Worker%endif
6503*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m10, q0000
6504*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m10, q1111
6505*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m10, q2222
6506*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q3333
6507*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6508*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m8
6509*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m2
6510*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m9
6511*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m4
6512*c0909341SAndroid Build Coastguard Worker%else
6513*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x70], m8
6514*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x80], m9
6515*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x90], m1
6516*c0909341SAndroid Build Coastguard Worker    mova         [esp+0xa0], m3
6517*c0909341SAndroid Build Coastguard Worker    mova         [esp+0xb0], m5
6518*c0909341SAndroid Build Coastguard Worker    mova         [esp+0xc0], m10
6519*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
6520*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
6521*c0909341SAndroid Build Coastguard Worker %endif
6522*c0909341SAndroid Build Coastguard Worker %define m11 m6
6523*c0909341SAndroid Build Coastguard Worker%endif
; .dy1_w4_loop: row loop of the 4-pixel-wide dy1 path.
; Each pass loads two new source rows (srcq+ssq*0 and srcq+ssq*1), runs the
; pshufb/pmaddubsw/phaddw/pmulhrsw sequence on them, combines the result with
; the interleaved row pairs kept from earlier passes, and writes two output
; rows; hd drops by 2 per pass and the loop exits to .ret when it reaches 0.
; Two 32-bit accumulators are built: m7 for the first output row and m0 for
; the second. The four multiplier vectors are the q0000..q3333 splats of m10
; made just before the loop (kept in m1/m3/m5/m10 on x86-64, and in
; [esp+0x90..0xc0] on x86-32).
; NOTE(review): m12, m13, m14 and m15 are set up outside this view; m13 looks
; like the rounding bias added before the final shift -- confirm.
6524*c0909341SAndroid Build Coastguard Worker.dy1_w4_loop:
6525*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; First three multiplier pairs: m7/m8/m9 feed accumulator m7 and
    ; m0/m2/m4 feed accumulator m0. The loads of the two new rows are
    ; interleaved with the arithmetic.
6526*c0909341SAndroid Build Coastguard Worker    movu                m11, [srcq+ssq*0]
6527*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m1
6528*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, m3
6529*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m1
6530*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3
6531*c0909341SAndroid Build Coastguard Worker    pmaddwd              m9, m5
6532*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m5
6533*c0909341SAndroid Build Coastguard Worker    paddd                m7, m8
6534*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
6535*c0909341SAndroid Build Coastguard Worker    movu                 m8, [srcq+ssq*1]
    ; Advance the source pointer by the two rows just loaded.
6536*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
6537*c0909341SAndroid Build Coastguard Worker    pshufb              m11, m14
6538*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m11, m15
6539*c0909341SAndroid Build Coastguard Worker    paddd                m7, m13
6540*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
6541*c0909341SAndroid Build Coastguard Worker    paddd                m7, m9
6542*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
6543*c0909341SAndroid Build Coastguard Worker    pshufb               m8, m14
6544*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m8, m15
    ; m11 = both new rows after the horizontal add (first row in the low
    ; half, second row in the high half).
6545*c0909341SAndroid Build Coastguard Worker    phaddw              m11, m8
    ; m8 is free again: reload the pair spilled at [rsp+0x20]; it is written
    ; back to [rsp+0x00] at the bottom of the loop.
6546*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x20]
6547*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m11, m12
    ; Interleave the previous newest row (m6) with the two new rows to form
    ; the last multiplier pair for each accumulator; m6 then becomes the
    ; newest row for the next pass.
6548*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m6, m11    ; 67
6549*c0909341SAndroid Build Coastguard Worker    psrldq               m6, m11, 8
6550*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m11, m6    ; 78
6551*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9, m10
6552*c0909341SAndroid Build Coastguard Worker    pmaddwd             m11, m4, m10
6553*c0909341SAndroid Build Coastguard Worker    paddd                m7, m2
6554*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x30]
6555*c0909341SAndroid Build Coastguard Worker    paddd                m0, m11
6556*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: only m0-m7 exist, so the row pairs and multiplier vectors are
    ; kept on the stack and used as memory operands. The SWAPs only rename
    ; registers at assembly time (x86inc); no instructions are emitted.
6557*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m6
6558*c0909341SAndroid Build Coastguard Worker    SWAP                 m1, m4
6559*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m2
6560*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*0]
6561*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x30]
6562*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x40]
6563*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x50]
6564*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [esp+0x90]
6565*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, [esp+0xa0]
6566*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, [esp+0x90]
6567*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, [esp+0xa0]
6568*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, [esp+0xb0]
6569*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, [esp+0xb0]
6570*c0909341SAndroid Build Coastguard Worker    paddd                m6, m1
6571*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2
6572*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
6573*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
6574*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
6575*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
6576*c0909341SAndroid Build Coastguard Worker    paddd                m6, m13
6577*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
6578*c0909341SAndroid Build Coastguard Worker    paddd                m6, m3
6579*c0909341SAndroid Build Coastguard Worker    paddd                m0, m4
6580*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
6581*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
6582*c0909341SAndroid Build Coastguard Worker    phaddw               m5, m7
    ; rsp and esp name the same register here (x86inc maps rsp to esp on
    ; x86-32); this reloads the pair stored at [esp+0x80].
6583*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x80]
6584*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m5, m12
    ; [esp+0x60] holds the previous newest row; it is replaced by the new
    ; newest row (m1) a few lines below.
6585*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, [esp+0x60], m5 ; 67
6586*c0909341SAndroid Build Coastguard Worker    psrldq               m1, m5, 8
6587*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m1         ; 78
6588*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m3, [esp+0xc0]
6589*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m4, [esp+0xc0]
6590*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x60], m1
6591*c0909341SAndroid Build Coastguard Worker    paddd                m6, m2
6592*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x50]
6593*c0909341SAndroid Build Coastguard Worker    paddd                m0, m5
    ; Rename so the first-row accumulator is called m7 again, matching the
    ; code shared with x86-64 below.
6594*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m6
6595*c0909341SAndroid Build Coastguard Worker%endif
    ; Scale both accumulators down by rndshift and pack them to signed
    ; 16-bit: low four words = first output row, high four = second.
6596*c0909341SAndroid Build Coastguard Worker    psrad                m7, rndshift
6597*c0909341SAndroid Build Coastguard Worker    psrad                m0, rndshift
6598*c0909341SAndroid Build Coastguard Worker    packssdw             m7, m0
    ; m0 is free after the pack: reload a stored row pair into it for the
    ; next pass.
6599*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6600*c0909341SAndroid Build Coastguard Worker    mova                 m0, [rsp+0x10]
6601*c0909341SAndroid Build Coastguard Worker%else
6602*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x40]
    ; On x86-32, m11 is aliased to m5 so the put store below has a scratch
    ; register.
6603*c0909341SAndroid Build Coastguard Worker%define m11 m5
6604*c0909341SAndroid Build Coastguard Worker%endif
6605*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
    ; put: saturate to unsigned bytes and store 4 bytes to each of the two
    ; destination rows, then advance dstq by two rows.
6606*c0909341SAndroid Build Coastguard Worker    packuswb             m7, m7
6607*c0909341SAndroid Build Coastguard Worker    psrldq              m11, m7, 4
6608*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m7
6609*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m11
6610*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
6611*c0909341SAndroid Build Coastguard Worker%else
    ; Non-put variant: store the eight 16-bit results (two rows of four)
    ; with one aligned store and advance tmpq by 16 bytes.
6612*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m7
6613*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
6614*c0909341SAndroid Build Coastguard Worker%endif
6615*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
6616*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; Rotate the stored row pairs for the next pass: reload the pair that
    ; becomes the oldest into m7, and spill the pairs currently held in
    ; registers back to their stack slots.
6617*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6618*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x00]
6619*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x00], m8
6620*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m2
6621*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m9
6622*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m4
6623*c0909341SAndroid Build Coastguard Worker%else
6624*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x70] ; 01
6625*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x80] ; 23
6626*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x50] ; 34
6627*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m0
6628*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x70], m1
6629*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m2
6630*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x80], m3
6631*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m4
6632*c0909341SAndroid Build Coastguard Worker%endif
6633*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w4_loop
; Width-specific entry points for the dy1 path, widths 8 to 128.
; Each entry stores a column count in the dword at [rsp+0x90] (width / 8:
; 1, 2, 4, 8, 16) and then joins the common setup at .dy1_w_start.
; .dy1_hloop_prep decrements that counter once per 8-pixel column and leaves
; through .ret when it reaches zero.
; NOTE(review): movifprep is defined outside this view; it presumably emits
; the store only for the prep variant, and its value is 2*width, which looks
; like the tmp row stride in bytes for 16-bit output -- confirm.
; INIT_XMM is the x86inc macro that re-selects the XMM/SSSE3 register
; mapping; presumably it also resets the m# renames made by the SWAPs in the
; w4 path above -- confirm against x86inc.asm.
6634*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
6635*c0909341SAndroid Build Coastguard Worker.dy1_w8:
6636*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 1
6637*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
6638*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6639*c0909341SAndroid Build Coastguard Worker.dy1_w16:
6640*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 2
6641*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
6642*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6643*c0909341SAndroid Build Coastguard Worker.dy1_w32:
6644*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 4
6645*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
6646*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
6647*c0909341SAndroid Build Coastguard Worker.dy1_w64:
6648*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 8
6649*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
6650*c0909341SAndroid Build Coastguard Worker    jmp .dy1_w_start
    ; The widest case needs no jmp: it falls through into .dy1_w_start.
6651*c0909341SAndroid Build Coastguard Worker.dy1_w128:
6652*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 16
6653*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256
; .dy1_w_start: one-time setup shared by the w8..w128 dy1 entry points.
; It selects the 8-byte vertical filter, widens it to 16-bit words, splats
; the four adjacent-tap pairs into [rsp+0x140..0x170], and saves the values
; that .dy1_hloop_prep reloads for every 8-pixel column:
;   [rsp+0x098] srcq           [rsp+0x100] dx*4
;   [rsp+0x120] splat of m15   [rsp+0x130] r0 (dstq / tmpq)
; It then jumps into .dy1_hloop for the first column.
6654*c0909341SAndroid Build Coastguard Worker.dy1_w_start:
6655*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
6656*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
6657*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
6658*c0909341SAndroid Build Coastguard Worker%endif
6659*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6660*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
    ; NOTE(review): backing srcq up by 3 bytes presumably points it at the
    ; first tap of the 8-tap horizontal window -- confirm.
6661*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
    ; ZF from this shift is consumed by the cmovnz below; the mov and lea in
    ; between do not modify flags. If my>>6 is zero, r4 keeps 64<<24, i.e.
    ; an 8-byte filter whose only nonzero byte is 64 at index 3. Otherwise
    ; the filter is loaded from subpel_filters at index t1 + (my>>6).
6662*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6663*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
6664*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
6665*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
6666*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
6667*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32 has only m0-m7, so the high register names are aliased onto
    ; low ones for this section.
6668*c0909341SAndroid Build Coastguard Worker %define m8   m0
6669*c0909341SAndroid Build Coastguard Worker %define m9   m1
6670*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
6671*c0909341SAndroid Build Coastguard Worker %xdefine m15 m3
6672*c0909341SAndroid Build Coastguard Worker %if isprep
6673*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
6674*c0909341SAndroid Build Coastguard Worker %endif
    ; NOTE(review): [esp+0x1f0] and [esp+0x1f4] are filled outside this
    ; view; they are used here the way t0 and t1 are used on x86-64.
6675*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f0]
6676*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]
6677*c0909341SAndroid Build Coastguard Worker    shr                  r5, 16
6678*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3
6679*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
    ; r5 is zeroed before the shift because xor writes flags; ZF from the
    ; shr must survive until the two cmovnz instructions. The default filter
    ; is r5:r4 = 0 : 64<<24; otherwise both halves of the 8-byte filter are
    ; loaded.
6680*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
6681*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
6682*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]
6683*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
6684*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0]
6685*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4]
    ; r0 and r3 were used as scratch above; reload them from their stack
    ; slots.
6686*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
6687*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6688*c0909341SAndroid Build Coastguard Worker%endif
6689*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; dx*4
6690*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    ; Broadcast the low dword of m15 to all four lanes.
6691*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
6692*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
    ; Move the 8 filter bytes into an XMM register and sign-extend each to
    ; a 16-bit word (duplicate the bytes, then arithmetic shift right by 8).
6693*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6694*c0909341SAndroid Build Coastguard Worker    movq                 m3, r4q
6695*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3
6696*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8
6697*c0909341SAndroid Build Coastguard Worker%else
6698*c0909341SAndroid Build Coastguard Worker    movd                 m5, r4
6699*c0909341SAndroid Build Coastguard Worker    movd                 m6, r5
6700*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6
6701*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m5
6702*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
6703*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5
6704*c0909341SAndroid Build Coastguard Worker%endif
    ; Save the per-column state reloaded by .dy1_hloop_prep.
6705*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x100], m7
6706*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x120], m15
6707*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x098], srcq
6708*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x130], r0q ; dstq / tmpq
    ; Splat each pair of adjacent filter words across a full vector; the
    ; four results are read back later as m8..m11 from [rsp+0x140..0x170].
6709*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000
6710*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111
6711*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222
6712*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333
6713*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x140], m0
6714*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x150], m1
6715*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x160], m2
6716*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x170], m3
    ; Save the row count so it can be restored for each column: to hm on
    ; UNIX64, and to [esp+0x134] on x86-32.
6717*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 && UNIX64
6718*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
6719*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
6720*c0909341SAndroid Build Coastguard Worker    SWAP                  m5, m3
6721*c0909341SAndroid Build Coastguard Worker    mov                   r5, hm
6722*c0909341SAndroid Build Coastguard Worker    mov          [esp+0x134], r5
6723*c0909341SAndroid Build Coastguard Worker%endif
    ; Skip the per-column reload code for the first column.
6724*c0909341SAndroid Build Coastguard Worker    jmp .dy1_hloop
; .dy1_hloop_prep: advance to the next 8-pixel column.
; Decrements the column counter at [rsp+0x90] (set by the .dy1_wN entry
; points) and leaves through .ret when it reaches zero. Otherwise it bumps
; the saved output pointer, restores the row count, source pointer and
; position vectors, and falls through into .dy1_hloop.
6725*c0909341SAndroid Build Coastguard Worker.dy1_hloop_prep:
6726*c0909341SAndroid Build Coastguard Worker    dec   dword [rsp+0x090]
6727*c0909341SAndroid Build Coastguard Worker    jz .ret
    ; Advance the saved dstq / tmpq by one column: 8*(isprep+1) bytes.
    ; NOTE(review): assuming isprep is 0 or 1, that is 8 bytes for put and
    ; 16 bytes for prep -- confirm.
6728*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6729*c0909341SAndroid Build Coastguard Worker    add   qword [rsp+0x130], 8*(isprep+1)
6730*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm
6731*c0909341SAndroid Build Coastguard Worker%else
6732*c0909341SAndroid Build Coastguard Worker    add   dword [rsp+0x130], 8*(isprep+1)
6733*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x134]
6734*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+0x130]
6735*c0909341SAndroid Build Coastguard Worker%endif
    ; m7 = dx*4 (saved by .dy1_w_start); m14 = the mx+dx*[4-7] vector that
    ; .dy1_hloop stored at [rsp+0x110] while processing the previous column.
6736*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x100]
6737*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+0x110]
6738*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6739*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
6740*c0909341SAndroid Build Coastguard Worker%else
    ; On x86-32 the constant is used directly as a memory operand instead of
    ; occupying a register.
6741*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
6742*c0909341SAndroid Build Coastguard Worker%endif
6743*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x120]
6744*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0x098]
6745*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6746*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0x130] ; dstq / tmpq
6747*c0909341SAndroid Build Coastguard Worker%else
    ; Write the restored row count and output pointer back to their stack
    ; slots, and reload r3 from its slot.
6748*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5
6749*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
6750*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
6751*c0909341SAndroid Build Coastguard Worker%endif
    ; Step the positions forward by dx*4 so m14 holds the first four
    ; positions of the new column.
6752*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7
; .dy1_hloop: per-column setup for one 8-pixel-wide strip.
; From the eight positions (m14 = first four, m14 + dx*4 = last four) it
; derives:
;   * position >> 10, extracted into scalar registers for the
;     MC_8TAP_SCALED_H invocations below;
;   * (position & pd_0x3ff) >> 6, added to the splat in m15, used as an
;     index into subpel_filters (8 bytes per entry);
;   * a mask of lanes where that masked-and-shifted value is zero; those
;     lanes take the constant pq_0x40000000 instead of the table entry.
; It then runs MC_8TAP_SCALED_H for rows 0-7, interleaves adjacent rows
; (01/23/45/67, a = low half, b = high half), and falls through into
; .dy1_vloop.
; NOTE(review): MC_8TAP_SCALED_H, rX, pd_0x3ff, pq_0x40000000 and unpckw are
; defined outside this view; statements about them are inferred from how
; they are used here -- confirm.
6753*c0909341SAndroid Build Coastguard Worker.dy1_hloop:
    ; m9 = 0, used as the compare operand for the zero-lane masks.
6754*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
6755*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6756*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pq_0x40000000]
6757*c0909341SAndroid Build Coastguard Worker%else
6758*c0909341SAndroid Build Coastguard Worker %define m11 [base+pq_0x40000000]
6759*c0909341SAndroid Build Coastguard Worker%endif
    ; Store position >> 10 for pixels 0-3 at [rsp]; it is read back as four
    ; scalar dwords further down.
6760*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
6761*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m2
    ; m6 = (position & mask) >> 6; m5 = table index (m15 + m6); m6 is then
    ; turned into an all-ones mask for lanes where the value was zero.
6762*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
6763*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
6764*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
6765*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
6766*c0909341SAndroid Build Coastguard Worker    psrldq               m2, m5, 8
    ; Extract the four indices to scalar registers and load the 8-byte
    ; filters: m0 = lanes 0 (low half) and 1 (high half), m1 = lanes 2 and 3.
6767*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6768*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
6769*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
6770*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
6771*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
6772*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
6773*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
6774*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
6775*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
6776*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
6777*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
6778*c0909341SAndroid Build Coastguard Worker%else
6779*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
6780*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
6781*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
6782*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
6783*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
6784*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
6785*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
6786*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
6787*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
6788*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
    ; The register that held the zero vector was overwritten by the filter
    ; loads above (m9 was aliased to m1 in .dy1_w_start), so create a fresh
    ; zero in m2 and point m9 at it.
6789*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
6790*c0909341SAndroid Build Coastguard Worker %define m9 m2
6791*c0909341SAndroid Build Coastguard Worker%endif
    ; Repeat for pixels 4-7. m15 is overwritten with the indices, and the
    ; advanced positions are saved at [rsp+0x110], which .dy1_hloop_prep
    ; reads for the next column.
6792*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
6793*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
6794*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
6795*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
6796*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
6797*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x110], m14
6798*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m15, 8
6799*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
    ; Load filters for pixels 4-7: m2 = lanes 0 and 1 of m15, m3 = lanes 2
    ; and 3.
6800*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
6801*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
6802*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
6803*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
6804*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
6805*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
6806*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
6807*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
6808*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
6809*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
    ; The index registers are free again: reuse r10/r11/r13/rX for
    ; position >> 10 of pixels 4-7, and reload the values for pixels 0-3
    ; from [rsp] into r4/r6/r7/r9 (lanes 0, 2, 1, 3 respectively).
6810*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6811*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m14, 8
6812*c0909341SAndroid Build Coastguard Worker    movd               r10d, m14
6813*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
6814*c0909341SAndroid Build Coastguard Worker    psrldq              m14, 4
6815*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
6816*c0909341SAndroid Build Coastguard Worker    movd               r13d, m14
6817*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
6818*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+ 0]
6819*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+ 8]
6820*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+ 4]
6821*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+12]
    ; Widen each per-dword mask to cover a whole 8-byte filter (two pixels
    ; per vector), then blend: masked lanes take the m11 constant, the other
    ; lanes keep the table filter. Results are m8/m9/m15/m11.
6822*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
6823*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
6824*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
6825*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
6826*c0909341SAndroid Build Coastguard Worker    pand                 m8, m11, m4
6827*c0909341SAndroid Build Coastguard Worker    pand                 m9, m11, m6
6828*c0909341SAndroid Build Coastguard Worker    pand                m15, m11, m7
6829*c0909341SAndroid Build Coastguard Worker    pand                m11, m11, m5
6830*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
6831*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
6832*c0909341SAndroid Build Coastguard Worker    pandn                m7, m2
6833*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
6834*c0909341SAndroid Build Coastguard Worker    por                  m8, m4
6835*c0909341SAndroid Build Coastguard Worker    por                  m9, m6
6836*c0909341SAndroid Build Coastguard Worker    por                 m15, m7
6837*c0909341SAndroid Build Coastguard Worker    por                 m11, m5
    ; Keep a stack copy of the four blended filter vectors.
6838*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m8
6839*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m9
6840*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m15
6841*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m11
    ; Four MC_8TAP_SCALED_H invocations, two rows each (per the trailing
    ; row-number comments). Rows 0-3 are spilled to [rsp+0x50..0x80] in
    ; between because later invocations reuse m1-m4 in their argument lists.
6842*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
6843*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m1
6844*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m2
6845*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
6846*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m3
6847*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m4
6848*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
6849*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
    ; Rename so rows 6 and 7 are called m7 and m8 (assembly-time SWAP, no
    ; instructions emitted), then reload rows 0-3 from the spill slots.
6850*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m0
6851*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m14
6852*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x50]
6853*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x60]
6854*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x70]
6855*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x80]
6856*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
6857*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
6858*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
6859*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
6860*c0909341SAndroid Build Coastguard Worker    SWAP                m14, m8
    ; m8..m11 = the four splatted multiplier vectors that .dy1_w_start
    ; stored at [rsp+0x140..0x170].
6861*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x140]
6862*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+0x150]
6863*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x160]
6864*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x170]
6865*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
6866*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
6867*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m15; 23a
6868*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m15    ; 23b
    ; Pairs 45 and 67 are also written to the stack; 01 and 23 stay in
    ; m0-m3 for .dy1_vloop.
6869*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m4
6870*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m5
6871*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m6
6872*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m7
6873*c0909341SAndroid Build Coastguard Worker    mova                m14, [base+unpckw]
6874*c0909341SAndroid Build Coastguard Worker%else
    ; x86-32: same steps, with the stack standing in for the missing
    ; registers. Load filters for pixels 4-7 into m2/m3.
6875*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
6876*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
6877*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
6878*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
6879*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
6880*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
    ; Reload mx+dx*[4-7] from the copy saved at [esp+0x110] above.
6881*c0909341SAndroid Build Coastguard Worker    mova                m14, [esp+0x110]
6882*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
6883*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
6884*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
6885*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
    ; position >> 10 for pixels 4-7 goes to [esp+16], right after the values
    ; for pixels 0-3 at [esp]; the scalar registers then get the values for
    ; pixels 0-3.
6886*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
6887*c0909341SAndroid Build Coastguard Worker    mova           [esp+16], m14
6888*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0]
6889*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
6890*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
6891*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
    ; Spill the four table filters so m0-m3 can receive the blended results.
6892*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x20], m0
6893*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m1
6894*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m2
6895*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m3
    ; Same mask widening and pand/pandn/por blend as on x86-64, with the
    ; table filters read from the stack; the blended filters overwrite
    ; [esp+0x20..0x50].
6896*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
6897*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
6898*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
6899*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
6900*c0909341SAndroid Build Coastguard Worker    pand                 m0, m11, m4
6901*c0909341SAndroid Build Coastguard Worker    pand                 m1, m11, m6
6902*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m7
6903*c0909341SAndroid Build Coastguard Worker    pand                 m3, m11, m5
6904*c0909341SAndroid Build Coastguard Worker    pandn                m4, [esp+0x20]
6905*c0909341SAndroid Build Coastguard Worker    pandn                m6, [esp+0x30]
6906*c0909341SAndroid Build Coastguard Worker    pandn                m7, [esp+0x40]
6907*c0909341SAndroid Build Coastguard Worker    pandn                m5, [esp+0x50]
6908*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
6909*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
6910*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
6911*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
6912*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x20], m0
6913*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x30], m1
6914*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x40], m2
6915*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x50], m3
    ; NOTE(review): the x86-32 form of the macro takes stack offsets;
    ; judging by the loads that follow, the second argument is where the
    ; two output rows land (e.g. 0x60 -> rows at [esp+0x60] and
    ; [esp+0x70]) -- confirm against the macro definition.
6916*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
6917*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
6918*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
6919*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
    ; Interleave rows 4-7 into pairs and write them back over the same
    ; slots.
6920*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1a0]
6921*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+0x1b0]
6922*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1c0]
6923*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1d0]
6924*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
6925*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
6926*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
6927*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
6928*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m4
6929*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m5
6930*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m6
6931*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m7
    ; Same for rows 0-3; the pairs are stored and also remain in m0-m3.
6932*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x060]
6933*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x070]
6934*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x180]
6935*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x190]
6936*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
6937*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
6938*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
6939*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
6940*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x060], m0
6941*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x070], m1
6942*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m2
6943*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m3
    ; From here on m8..m11 name the multiplier vectors stored by
    ; .dy1_w_start, used as memory operands.
6944*c0909341SAndroid Build Coastguard Worker %define m8  [esp+0x140]
6945*c0909341SAndroid Build Coastguard Worker %define m9  [esp+0x150]
6946*c0909341SAndroid Build Coastguard Worker %define m10 [esp+0x160]
6947*c0909341SAndroid Build Coastguard Worker %define m11 [esp+0x170]
6948*c0909341SAndroid Build Coastguard Worker%endif
; .dy1_vloop: row loop of the dy1 path (the source pointer advances by one
; row, ssq, per output row). Each pass:
;   1. multiplies the interleaved row pairs 01/23/45/67 by the coefficient
;      operands m8-m11, adds m13, shifts right by rndshift and packs to
;      words, giving 8 results (a = low four, b = high four);
;   2. stores them (put: 8 saturated bytes at dstq, otherwise 8 words at
;      tmpq) and leaves through .dy1_hloop_prep once hd reaches zero;
;   3. loads and horizontally filters one new source row, then slides the
;      row-pair window down one row (01,23,45,67 becomes 12,34,56,78).
; Row pairs 01/23 stay in m0-m3, 45/67 live in stack slots (x86-64: rsp+0x50
; to rsp+0x80, x86-32: esp+0x1a0 to esp+0x1d0).
6949*c0909341SAndroid Build Coastguard Worker.dy1_vloop:
6950*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6951*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m        ; r0 is reused as a load offset below, so its loop value is kept in r0m
6952*c0909341SAndroid Build Coastguard Worker%endif
6953*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8     ; 01a
6954*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8     ; 01b
6955*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m9     ; 23a
6956*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m9     ; 23b
6957*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13        ; m13: presumably the rounding bias for the shift below
6958*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
6959*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
6960*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6961*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6962*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x50], m10    ; 45a
6963*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x60], m10    ; 45b
6964*c0909341SAndroid Build Coastguard Worker%else
6965*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x1a0], m10   ; 45a
6966*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x1b0], m10   ; 45b
6967*c0909341SAndroid Build Coastguard Worker%endif
6968*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
6969*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6970*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6971*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x70], m11    ; 67a
6972*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x80], m11    ; 67b
6973*c0909341SAndroid Build Coastguard Worker%else
6974*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x1c0], m11   ; 67a
6975*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x1d0], m11   ; 67b
6976*c0909341SAndroid Build Coastguard Worker%endif
6977*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
6978*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
6979*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
6980*c0909341SAndroid Build Coastguard Worker    psrad                m5, rndshift
6981*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5         ; a results in the low four words, b in the high four
6982*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
6983*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4         ; saturate to unsigned bytes
6984*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m4         ; write 8 pixels
6985*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
6986*c0909341SAndroid Build Coastguard Worker%else
6987*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4         ; write 8 words
6988*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
6989*c0909341SAndroid Build Coastguard Worker%endif
6990*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
6991*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0         ; spill r0 again before it is overwritten below
6992*c0909341SAndroid Build Coastguard Worker%endif
6993*c0909341SAndroid Build Coastguard Worker    dec                  hd
6994*c0909341SAndroid Build Coastguard Worker    jz .dy1_hloop_prep                  ; hd reached zero: no rows left in this pass
6995*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
6996*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+ r4] ; eight 8-byte loads from the new row, one per offset register
6997*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+ r6]
6998*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+ r7]
6999*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+ r9]
7000*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+r10]
7001*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+r11]
7002*c0909341SAndroid Build Coastguard Worker    movhps               m6, [srcq+r13]
7003*c0909341SAndroid Build Coastguard Worker    movhps               m7, [srcq+ rX]
7004*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq        ; advance one source row
7005*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m14, q1032 ; m15 = shuffle mask m14 with its 64-bit halves swapped
7006*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14                ; 0a 1a
7007*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14                ; 0b 1b
7008*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m15                ; 3a 2a
7009*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m15                ; 3b 2b
7010*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, [rsp+0x10] ; stack slots: presumably the per-column horizontal filters
7011*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [rsp+0x20]
7012*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, [rsp+0x30]
7013*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, [rsp+0x40]
7014*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
7015*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7
7016*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m6         ; one summed word per 8-byte load
7017*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12        ; 8a 8b (m12: presumably the horizontal rounding scale)
7018*c0909341SAndroid Build Coastguard Worker    pshufb               m5, [rsp+0x70], m15    ; 7a 6a
7019*c0909341SAndroid Build Coastguard Worker    pshufb               m7, [rsp+0x80], m15    ; 7b 6b
7020*c0909341SAndroid Build Coastguard Worker    pshufb               m6, [rsp+0x50], m14    ; 4a 5a
7021*c0909341SAndroid Build Coastguard Worker    pshufb              m15, [rsp+0x60], m14    ; 4b 5b
7022*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2  ; 12a
7023*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3  ; 12b
7024*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m6  ; 34a
7025*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m15 ; 34b
7026*c0909341SAndroid Build Coastguard Worker    punpckhwd            m6, m5  ; 56a
7027*c0909341SAndroid Build Coastguard Worker    punpckhwd           m15, m7  ; 56b
7028*c0909341SAndroid Build Coastguard Worker    punpcklwd            m5, m4  ; 78a
7029*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 8   ; move 8b into the low half
7030*c0909341SAndroid Build Coastguard Worker    punpcklwd            m7, m4  ; 78b
7031*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m6  ; write the updated 56/78 pairs back to their stack slots
7032*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m15
7033*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m5
7034*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m7
7035*c0909341SAndroid Build Coastguard Worker%else
7036*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0]   ; first four load offsets, kept at esp+0 to esp+12
7037*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
7038*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
7039*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
7040*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+unpckw]      ; shuffle mask, same role as m14 in the x86-64 branch
7041*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x060]
7042*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x070]
7043*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1a0]
7044*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+r0]
7045*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+rX]
7046*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+r4]
7047*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+r5]
7048*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6         ; 0a 1a
7049*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6         ; 0b 1b
7050*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m6         ; 4a 5a
7051*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+16]   ; remaining four load offsets, kept at esp+16 to esp+28
7052*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+24]
7053*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+20]
7054*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+28]
7055*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+r0]
7056*c0909341SAndroid Build Coastguard Worker    movq                 m2, [srcq+rX]
7057*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+r4]
7058*c0909341SAndroid Build Coastguard Worker    movhps               m2, [srcq+r5]
7059*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq        ; advance one source row
7060*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, [esp+0x20] ; stack slots: presumably the per-column horizontal filters
7061*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, [esp+0x30]
7062*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, [esp+0x40]
7063*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, [esp+0x50]
7064*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
7065*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m2
7066*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1b0]
7067*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x180]
7068*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m3         ; one summed word per 8-byte load
7069*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x190]
7070*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12        ; 8a 8b
7071*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m6         ; 4b 5b
7072*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q1032  ; swap the mask halves, same role as m15 in the x86-64 branch
7073*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m6         ; 3a 2a
7074*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m6         ; 3b 2b
7075*c0909341SAndroid Build Coastguard Worker    punpckhwd            m0, m2         ; 12a
7076*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m3         ; 12b
7077*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x60], m0
7078*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x70], m1
7079*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1c0]
7080*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x1d0]
7081*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m7         ; 34a
7082*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m5         ; 34b
7083*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m2
7084*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m3
7085*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m6         ; 7a 6a
7086*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m6         ; 7b 6b
7087*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0         ; 56a
7088*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m1         ; 56b
7089*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m4         ; 78a
7090*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m4, m4         ; move 8b into the low half
7091*c0909341SAndroid Build Coastguard Worker    punpcklwd            m1, m4         ; 78b
7092*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m7
7093*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m5
7094*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m0
7095*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m1
7096*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x60] ; reload 12a and 12b for the next pass
7097*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x70]
7098*c0909341SAndroid Build Coastguard Worker%endif
7099*c0909341SAndroid Build Coastguard Worker    jmp .dy1_vloop
; .dy2: entry of the dy2 path. Reads a 16-bit entry from the dy2 jump table
; at index wq, adds base_reg to it and jumps to the resulting address
; (presumably one of the .dy2_w* labels, selected by block width).
7100*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
7101*c0909341SAndroid Build Coastguard Worker.dy2:
7102*c0909341SAndroid Build Coastguard Worker    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] ; zero-extended 16-bit table entry
7103*c0909341SAndroid Build Coastguard Worker    add                  wq, base_reg   ; entry is relative to base_reg
7104*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; .dy2_w2 (assembled for put only): 2-pixel-wide case of the dy2 path.
; Setup: derives two filter indices and a source byte shuffle from
; mx+dx*[0-1], loads a 4-byte horizontal filter for each of the two pixels,
; loads the 8-byte vertical filter selected by my, and horizontally filters
; source rows 0-5.
; .dy2_w2_loop: each iteration reads four new source rows and writes two
; output rows of two pixels each, so the source advances two rows per output
; row.
7105*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
7106*c0909341SAndroid Build Coastguard Worker.dy2_w2:
7107*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7108*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7109*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b        ; keep only the low byte of t0
7110*c0909341SAndroid Build Coastguard Worker    dec                srcq
7111*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
7112*c0909341SAndroid Build Coastguard Worker %else
7113*c0909341SAndroid Build Coastguard Worker  %define m10 [base+pd_0x3ff]
7114*c0909341SAndroid Build Coastguard Worker  %define m11 [base+pd_0x4000]
7115*c0909341SAndroid Build Coastguard Worker  %define m8  m0
7116*c0909341SAndroid Build Coastguard Worker  %define m9  m1
7117*c0909341SAndroid Build Coastguard Worker  %define m14 m4
7118*c0909341SAndroid Build Coastguard Worker  %define m15 m3
7119*c0909341SAndroid Build Coastguard Worker    movzx                r5, byte [esp+0x1f0]   ; presumably the stack counterpart of t0b
7120*c0909341SAndroid Build Coastguard Worker    dec                srcd
7121*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
7122*c0909341SAndroid Build Coastguard Worker %endif
7123*c0909341SAndroid Build Coastguard Worker    punpckldq            m9, m8
7124*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m9
7125*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-1]
7126*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7127*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
7128*c0909341SAndroid Build Coastguard Worker %endif
7129*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000 ; broadcast the byte value to every dword
7130*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10   ; mask with m10 (pd_0x3ff on x86-32)
7131*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
7132*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8         ; per-pixel index into subpel_filters
7133*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15        ; index for pixel 0
7134*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
7135*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7136*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15        ; index for pixel 1
7137*c0909341SAndroid Build Coastguard Worker %else
7138*c0909341SAndroid Build Coastguard Worker    movd                r3d, m15        ; index for pixel 1
7139*c0909341SAndroid Build Coastguard Worker %endif
7140*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+bdct_lb_dw]
7141*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+subpel_s_shuf2]
7142*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r4*8+2] ; bytes 2-5 of the 8-byte filter entry
7143*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7144*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r6*8+2]
7145*c0909341SAndroid Build Coastguard Worker %else
7146*c0909341SAndroid Build Coastguard Worker    movd                 m7, [base+subpel_filters+r3*8+2]
7147*c0909341SAndroid Build Coastguard Worker %endif
7148*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
7149*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9         ; all-ones in lanes whose masked fraction is zero
7150*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10         ; drop the low 10 bits of mx+dx*n
7151*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
7152*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7153*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5         ; build the byte shuffle later applied to the source rows
7154*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
7155*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x00], m14
7156*c0909341SAndroid Build Coastguard Worker  %define m14 [esp+0x00]
7157*c0909341SAndroid Build Coastguard Worker    SWAP                 m5, m0
7158*c0909341SAndroid Build Coastguard Worker    SWAP                 m6, m3
7159*c0909341SAndroid Build Coastguard Worker  %define m8  m5
7160*c0909341SAndroid Build Coastguard Worker  %define m15 m6
7161*c0909341SAndroid Build Coastguard Worker %endif
7162*c0909341SAndroid Build Coastguard Worker    movq                 m0, [srcq+ssq*0]       ; m0 = rows 0 and 2
7163*c0909341SAndroid Build Coastguard Worker    movq                 m1, [srcq+ssq*1]       ; m1 = rows 1 and 3
7164*c0909341SAndroid Build Coastguard Worker    movhps               m0, [srcq+ssq*2]
7165*c0909341SAndroid Build Coastguard Worker    movhps               m1, [srcq+ss3q ]
7166*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
7167*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7168*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6          ; sets ZF, which the cmovnz below consumes (mov and lea leave flags alone)
7169*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24   ; default used when my >> 6 is zero
7170*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
7171*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8] ; otherwise the 8-byte vertical filter
7172*c0909341SAndroid Build Coastguard Worker    pshufb              m14, m5         ; build the byte shuffle later applied to the source rows
7173*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
7174*c0909341SAndroid Build Coastguard Worker    movq                m10, r4q
7175*c0909341SAndroid Build Coastguard Worker %else
7176*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7177*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]        ; presumably the stack counterpart of t1
7178*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5         ; default high half: zero
7179*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6          ; sets ZF, which both cmovnz below consume (lea and mov leave flags alone)
7180*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]
7181*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24   ; default low half used when my >> 6 is zero
7182*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0]
7183*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4]
7184*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7185*c0909341SAndroid Build Coastguard Worker  %define m10 m4
7186*c0909341SAndroid Build Coastguard Worker    movd                m10, r4
7187*c0909341SAndroid Build Coastguard Worker    movd                 m3, r5
7188*c0909341SAndroid Build Coastguard Worker    punpckldq           m10, m3         ; join the two halves into the 8-byte vertical filter
7189*c0909341SAndroid Build Coastguard Worker %endif
7190*c0909341SAndroid Build Coastguard Worker    movq                 m3, [srcq+ssq*0]       ; m3 = rows 4 and 5
7191*c0909341SAndroid Build Coastguard Worker    movhps               m3, [srcq+ssq*1]
7192*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
7193*c0909341SAndroid Build Coastguard Worker    punpcklbw           m10, m10        ; with the psraw below: sign-extend the 8 filter bytes to words
7194*c0909341SAndroid Build Coastguard Worker    psraw               m10, 8
7195*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m7         ; both 4-byte horizontal filters side by side
7196*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m15        ; duplicated into the high half
7197*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7198*c0909341SAndroid Build Coastguard Worker    pand                m11, m8         ; pd_0x4000 in the zero-fraction lanes
7199*c0909341SAndroid Build Coastguard Worker %else
7200*c0909341SAndroid Build Coastguard Worker    pand                 m7, m11, m8    ; pd_0x4000 in the zero-fraction lanes
7201*c0909341SAndroid Build Coastguard Worker  %define m11 m7
7202*c0909341SAndroid Build Coastguard Worker %endif
7203*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15        ; table filter in all other lanes
7204*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
7205*c0909341SAndroid Build Coastguard Worker    por                 m15, m11        ; m15 = final horizontal filters
7206*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
7207*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m10, q0000 ; vertical words 0-1 broadcast
7208*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m10, q1111 ; vertical words 2-3 broadcast
7209*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m10, q3333 ; vertical words 6-7 broadcast
7210*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m10, q2222 ; vertical words 4-5 broadcast
7211*c0909341SAndroid Build Coastguard Worker %else
7212*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x10], m15
7213*c0909341SAndroid Build Coastguard Worker  %define m15 [esp+0x10]
7214*c0909341SAndroid Build Coastguard Worker    mov                  r5, r0m
7215*c0909341SAndroid Build Coastguard Worker  %define dstq r5
7216*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
7217*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m4, q0000  ; vertical words 0-1 broadcast
7218*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m4, q1111  ; vertical words 2-3 broadcast
7219*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m4, q2222  ; vertical words 4-5 broadcast
7220*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q3333  ; vertical words 6-7 broadcast
7221*c0909341SAndroid Build Coastguard Worker  %define m8  [esp+0x20]
7222*c0909341SAndroid Build Coastguard Worker  %define m9  [esp+0x30]
7223*c0909341SAndroid Build Coastguard Worker  %define m10 [esp+0x40]
7224*c0909341SAndroid Build Coastguard Worker  %define m11 [esp+0x50]
7225*c0909341SAndroid Build Coastguard Worker    mova                 m8, m5         ; from here m8-m11 name stack slots on x86-32
7226*c0909341SAndroid Build Coastguard Worker    mova                 m9, m6
7227*c0909341SAndroid Build Coastguard Worker    mova                m10, m7
7228*c0909341SAndroid Build Coastguard Worker    mova                m11, m4
7229*c0909341SAndroid Build Coastguard Worker %endif
7230*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14        ; gather the source bytes for both pixels of rows 0-5
7231*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
7232*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
7233*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15        ; apply the horizontal filters
7234*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
7235*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
7236*c0909341SAndroid Build Coastguard Worker    pslldq               m2, m3, 8
7237*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2
7238*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m3
7239*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12            ; 0 2 _ 4
7240*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12            ; 1 3 _ 5
7241*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m0, q3110      ; 0 2 2 4
7242*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m1, q3110      ; 1 3 3 5
7243*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m2, m1         ; 01 23
7244*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m1             ; 23 45
7245*c0909341SAndroid Build Coastguard Worker.dy2_w2_loop:
7246*c0909341SAndroid Build Coastguard Worker    movq                 m6, [srcq+ssq*0]   ; m6 = new rows +0 and +2
7247*c0909341SAndroid Build Coastguard Worker    movq                 m7, [srcq+ssq*1]   ; m7 = new rows +1 and +3
7248*c0909341SAndroid Build Coastguard Worker    movhps               m6, [srcq+ssq*2]
7249*c0909341SAndroid Build Coastguard Worker    movhps               m7, [srcq+ss3q ]
7250*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
7251*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m3, m8         ; older row pairs: start both output rows' sums
7252*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2, m9
7253*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
7254*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
7255*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
7256*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
7257*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7
7258*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m12
7259*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m6, 8          ; split off the odd new rows
7260*c0909341SAndroid Build Coastguard Worker    palignr              m6, m0, 8          ; prepend the high half kept in m0
7261*c0909341SAndroid Build Coastguard Worker    palignr              m7, m1, 8          ; prepend the high half kept in m1
7262*c0909341SAndroid Build Coastguard Worker    mova                 m0, m6             ; keep for the next iteration
7263*c0909341SAndroid Build Coastguard Worker    mova                 m1, m7
7264*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3221
7265*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q3221
7266*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m6, m7       ; 45 67
7267*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6, m7       ; 67 89
7268*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m3, m10
7269*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m2, m11
7270*c0909341SAndroid Build Coastguard Worker    paddd                m4, m5
7271*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13            ; m13: presumably the rounding bias for the shift below
7272*c0909341SAndroid Build Coastguard Worker    paddd                m6, m7
7273*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7274*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
7275*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m4
7276*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4             ; saturate to unsigned bytes
7277*c0909341SAndroid Build Coastguard Worker    movd                r4d, m4             ; four bytes: two pixels for each of two rows
7278*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r4w            ; first output row, 2 pixels
7279*c0909341SAndroid Build Coastguard Worker    shr                 r4d, 16
7280*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r4w            ; second output row, 2 pixels
7281*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
7282*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2              ; two output rows done
7283*c0909341SAndroid Build Coastguard Worker    jg .dy2_w2_loop
7284*c0909341SAndroid Build Coastguard Worker    RET
7285*c0909341SAndroid Build Coastguard Worker%endif
7286*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
7287*c0909341SAndroid Build Coastguard Worker.dy2_w4:
7288*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7289*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7290*c0909341SAndroid Build Coastguard Worker    movzx               t0d, t0b
7291*c0909341SAndroid Build Coastguard Worker    dec                srcq
7292*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d
7293*c0909341SAndroid Build Coastguard Worker%else
7294*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
7295*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0x4000]
7296*c0909341SAndroid Build Coastguard Worker %define m8  m0
7297*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
7298*c0909341SAndroid Build Coastguard Worker %define m15 m3
7299*c0909341SAndroid Build Coastguard Worker %define dstq r0
7300*c0909341SAndroid Build Coastguard Worker %if isprep
7301*c0909341SAndroid Build Coastguard Worker  %define ssq r3
7302*c0909341SAndroid Build Coastguard Worker %endif
7303*c0909341SAndroid Build Coastguard Worker    movzx                r4, byte [esp+0x1f0]
7304*c0909341SAndroid Build Coastguard Worker    dec                srcq
7305*c0909341SAndroid Build Coastguard Worker    movd                m15, r4
7306*c0909341SAndroid Build Coastguard Worker%endif
7307*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul]
7308*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7309*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pd_0x4000]
7310*c0909341SAndroid Build Coastguard Worker%endif
7311*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000
7312*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
7313*c0909341SAndroid Build Coastguard Worker    pand                 m8, m14, m10
7314*c0909341SAndroid Build Coastguard Worker    psrld                m8, 6
7315*c0909341SAndroid Build Coastguard Worker    paddd               m15, m8
7316*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m15, 8
7317*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7318*c0909341SAndroid Build Coastguard Worker    movd                r4d, m15
7319*c0909341SAndroid Build Coastguard Worker    movd               r11d, m7
7320*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
7321*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
7322*c0909341SAndroid Build Coastguard Worker    movd                r6d, m15
7323*c0909341SAndroid Build Coastguard Worker    movd               r13d, m7
7324*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+ r4*8+2]
7325*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r11*8+2]
7326*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+ r6*8+2]
7327*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r13*8+2]
7328*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+subpel_s_shuf2]
7329*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7330*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24
7331*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]
7332*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8]
7333*c0909341SAndroid Build Coastguard Worker%else
7334*c0909341SAndroid Build Coastguard Worker    movd                 r1, m15
7335*c0909341SAndroid Build Coastguard Worker    movd                 r3, m7
7336*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
7337*c0909341SAndroid Build Coastguard Worker    psrldq               m7, 4
7338*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
7339*c0909341SAndroid Build Coastguard Worker    movd                 r5, m7
7340*c0909341SAndroid Build Coastguard Worker %define m15 m5
7341*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
7342*c0909341SAndroid Build Coastguard Worker    movd                m15, [base+subpel_filters+r1*8+2]
7343*c0909341SAndroid Build Coastguard Worker    movd                 m2, [base+subpel_filters+r3*8+2]
7344*c0909341SAndroid Build Coastguard Worker    movd                 m3, [base+subpel_filters+r4*8+2]
7345*c0909341SAndroid Build Coastguard Worker    movd                 m4, [base+subpel_filters+r5*8+2]
7346*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+subpel_s_shuf2]
7347*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7348*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]
7349*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5
7350*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6
7351*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]
7352*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24
7353*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0]
7354*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4]
7355*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7356*c0909341SAndroid Build Coastguard Worker %if isprep
7357*c0909341SAndroid Build Coastguard Worker    lea                ss3q, [ssq*3]
7358*c0909341SAndroid Build Coastguard Worker %endif
7359*c0909341SAndroid Build Coastguard Worker%endif
7360*c0909341SAndroid Build Coastguard Worker    punpckldq           m15, m3
7361*c0909341SAndroid Build Coastguard Worker    punpckldq            m2, m4
7362*c0909341SAndroid Build Coastguard Worker    punpcklqdq          m15, m2
7363*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7364*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m9
7365*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7366*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
7367*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2]
7368*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*1]
7369*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ss3q ]
7370*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4]
7371*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
7372*c0909341SAndroid Build Coastguard Worker    pshufb              m14, [base+bdct_lb_dw]
7373*c0909341SAndroid Build Coastguard Worker    movu                 m4, [srcq+ssq*0]
7374*c0909341SAndroid Build Coastguard Worker    movu                 m5, [srcq+ssq*1]
7375*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*2]
7376*c0909341SAndroid Build Coastguard Worker    pand                m11, m8
7377*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
7378*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
7379*c0909341SAndroid Build Coastguard Worker    por                 m15, m11
7380*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
7381*c0909341SAndroid Build Coastguard Worker    movq                m11, r4q
7382*c0909341SAndroid Build Coastguard Worker    punpcklbw           m11, m11
7383*c0909341SAndroid Build Coastguard Worker    psraw               m11, 8
7384*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
7385*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
7386*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
7387*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
7388*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m14
7389*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m14
7390*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
7391*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
7392*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
7393*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
7394*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m15
7395*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m15
7396*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m2
7397*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m3
7398*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
7399*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m12    ; 0 2
7400*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12    ; 1 3
7401*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m12    ; 4 5
7402*c0909341SAndroid Build Coastguard Worker    pshufd               m8, m11, q0000
7403*c0909341SAndroid Build Coastguard Worker    pshufd               m9, m11, q1111
7404*c0909341SAndroid Build Coastguard Worker    pshufd              m10, m11, q2222
7405*c0909341SAndroid Build Coastguard Worker    pshufd              m11, m11, q3333
7406*c0909341SAndroid Build Coastguard Worker%else
7407*c0909341SAndroid Build Coastguard Worker    pxor                 m3, m3
7408*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m8, m3
7409*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7410*c0909341SAndroid Build Coastguard Worker    pshufb              m14, [base+bdct_lb_dw]
7411*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ssq*0]
7412*c0909341SAndroid Build Coastguard Worker    movu                 m2, [srcq+ssq*2]
7413*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*1]
7414*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
7415*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m6, m6
7416*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
7417*c0909341SAndroid Build Coastguard Worker    pand                 m7, m11, m8
7418*c0909341SAndroid Build Coastguard Worker    pandn                m8, m15
7419*c0909341SAndroid Build Coastguard Worker    SWAP                m15, m8
7420*c0909341SAndroid Build Coastguard Worker    por                 m15, m7
7421*c0909341SAndroid Build Coastguard Worker    paddb               m14, m6
7422*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+ssq*0]
7423*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
7424*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*2]
7425*c0909341SAndroid Build Coastguard Worker    add                srcq, ss3q
7426*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
7427*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m14
7428*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
7429*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m14
7430*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
7431*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14
7432*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
7433*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m2, m15
7434*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
7435*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x00], m14
7436*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x10], m15
7437*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m15
7438*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
7439*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15
7440*c0909341SAndroid Build Coastguard Worker %define m14 [esp+0x00]
7441*c0909341SAndroid Build Coastguard Worker %define m15 [esp+0x10]
7442*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m2
7443*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m0
7444*c0909341SAndroid Build Coastguard Worker    phaddw               m7, m6
7445*c0909341SAndroid Build Coastguard Worker %ifidn %1, put
7446*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
7447*c0909341SAndroid Build Coastguard Worker  %define dstq r5
7448*c0909341SAndroid Build Coastguard Worker %else
7449*c0909341SAndroid Build Coastguard Worker  %define tmpq r5
7450*c0909341SAndroid Build Coastguard Worker %endif
7451*c0909341SAndroid Build Coastguard Worker    movd                 m6, r4
7452*c0909341SAndroid Build Coastguard Worker    movd                 m0, r5
7453*c0909341SAndroid Build Coastguard Worker    punpckldq            m6, m0
7454*c0909341SAndroid Build Coastguard Worker    punpcklbw            m6, m6
7455*c0909341SAndroid Build Coastguard Worker    psraw                m6, 8
7456*c0909341SAndroid Build Coastguard Worker    mov                  r5, r0m
7457*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m12 ; 0 2
7458*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12 ; 1 3
7459*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m7, m12 ; 4 5
7460*c0909341SAndroid Build Coastguard Worker    SWAP                 m0, m1, m3
7461*c0909341SAndroid Build Coastguard Worker    SWAP                 m4, m7
7462*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m6, q0000
7463*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m6, q1111
7464*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m6, q2222
7465*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3333
7466*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m2
7467*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m3
7468*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m7
7469*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x60], m6
7470*c0909341SAndroid Build Coastguard Worker %define m8  [esp+0x30]
7471*c0909341SAndroid Build Coastguard Worker %define m9  [esp+0x40]
7472*c0909341SAndroid Build Coastguard Worker %define m10 [esp+0x50]
7473*c0909341SAndroid Build Coastguard Worker %define m11 [esp+0x60]
7474*c0909341SAndroid Build Coastguard Worker%endif
7475*c0909341SAndroid Build Coastguard Worker    psrldq               m5, m4, 8  ; 5 _
7476*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m0, m1 ; 23
7477*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1     ; 01
7478*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5     ; 45
; .dy2_w4_loop: vertical 8-tap pass for the 4-pixel-wide dy2 path.
; Each iteration emits two output rows ("a" and "b") and consumes four new
; source rows, i.e. the 8-row window advances two source rows per output row.
; State on entry / carried between iterations:
;   m0 = rows 0/1 interleaved, m2 = rows 2/3, m4 = rows 4/5 (already
;        horizontally filtered, 16-bit)
;   m8..m11 = vertical tap pairs 0-1, 2-3, 4-5, 6-7 (registers on x86-64,
;        stack slots on x86-32)
;   m14 = pshufb control, m15 = horizontal coefficients for pmaddubsw
;   m12 / m13 are set up before this view; presumably the horizontal
;        rounding multiplier and the vertical rounding bias - TODO confirm
7479*c0909341SAndroid Build Coastguard Worker.dy2_w4_loop:
7480*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m8         ; a0: rows 0/1 * taps 0-1
7481*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m2, m8     ; b0: rows 2/3 * taps 0-1 (row b starts two rows lower)
7482*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, m9         ; a1: rows 2/3 * taps 2-3
7483*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m4, m9     ; b1: rows 4/5 * taps 2-3
7484*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, m4, m10    ; a2: rows 4/5 * taps 4-5
7485*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13        ; a += m13
7486*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13        ; b += m13
7487*c0909341SAndroid Build Coastguard Worker    paddd                m0, m2         ; a += a1
7488*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7         ; b += b1
7489*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3         ; a += a2
; Fetch the next four source rows (6..9) and run the horizontal filter on them.
7490*c0909341SAndroid Build Coastguard Worker    movu                 m6, [srcq+ssq*0]
7491*c0909341SAndroid Build Coastguard Worker    movu                 m7, [srcq+ssq*1]
7492*c0909341SAndroid Build Coastguard Worker    movu                 m3, [srcq+ssq*2]
7493*c0909341SAndroid Build Coastguard Worker    movu                 m1, [srcq+ss3q ]
7494*c0909341SAndroid Build Coastguard Worker    lea                srcq, [srcq+ssq*4] ; advance source by the four rows just read
7495*c0909341SAndroid Build Coastguard Worker    pshufb               m6, m14        ; arrange source bytes per column (m14 = shuffle control)
7496*c0909341SAndroid Build Coastguard Worker    pshufb               m7, m14
7497*c0909341SAndroid Build Coastguard Worker    pshufb               m3, m14
7498*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m14
7499*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m6, m15        ; pixels * horizontal coefficients, adjacent pairs summed
7500*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m7, m15
7501*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m3, m15
7502*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m15
7503*c0909341SAndroid Build Coastguard Worker    phaddw               m6, m7         ; finish the horizontal sums: row 6 low half, row 7 high half
7504*c0909341SAndroid Build Coastguard Worker    phaddw               m3, m1         ; row 8 low half, row 9 high half
7505*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m6, m12    ; 6 7
7506*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m3, m12    ; 8 9
7507*c0909341SAndroid Build Coastguard Worker    psrldq               m7, m6, 8      ; row 7 moved to the low half
7508*c0909341SAndroid Build Coastguard Worker    psrldq               m1, m3, 8      ; row 9 moved to the low half
7509*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7     ; 67
7510*c0909341SAndroid Build Coastguard Worker    punpcklwd            m3, m1     ; 89
7511*c0909341SAndroid Build Coastguard Worker    mova                 m2, m6         ; keep 67: it is the second row pair of the next iteration
7512*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m6, m10    ; b2: rows 6/7 * taps 4-5
7513*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m11        ; a3: rows 6/7 * taps 6-7
7514*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m11    ; b3: rows 8/9 * taps 6-7
7515*c0909341SAndroid Build Coastguard Worker    paddd                m5, m1         ; b += b2
7516*c0909341SAndroid Build Coastguard Worker    paddd                m0, m6         ; a += a3
7517*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7         ; b += b3
7518*c0909341SAndroid Build Coastguard Worker    psrad                m0, rndshift   ; scale the 32-bit sums down (rndshift is defined outside this view)
7519*c0909341SAndroid Build Coastguard Worker    psrad                m5, rndshift
7520*c0909341SAndroid Build Coastguard Worker    packssdw             m0, m5         ; 4 words of row a | 4 words of row b
7521*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
; put: saturate to unsigned bytes and store four pixels to each of two dst rows.
7522*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
7523*c0909341SAndroid Build Coastguard Worker    psrldq               m1, m0, 4      ; row b bytes moved to the low dword
7524*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0
7525*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
7526*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
7527*c0909341SAndroid Build Coastguard Worker%else
; otherwise: store both rows as eight 16-bit values (16 bytes) back to back.
7528*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m0
7529*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
7530*c0909341SAndroid Build Coastguard Worker%endif
; Slide the row window: 45 -> first pair, (67 already in m2), 89 -> third pair.
7531*c0909341SAndroid Build Coastguard Worker    mova                 m0, m4
7532*c0909341SAndroid Build Coastguard Worker    mova                 m4, m3
7533*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2          ; two output rows done
7534*c0909341SAndroid Build Coastguard Worker    jg .dy2_w4_loop
7535*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET ; macro defined outside this view; presumably the function epilogue - TODO confirm
; Entry stubs for the dy2 path at widths 8..128.
; Each stub writes width/8 to [rsp+0x90]; .dy2_hloop_prep decrements that slot
; once per pass, moves the output pointer on by 8 pixels, and returns when it
; reaches zero, so the slot counts the remaining 8-pixel-wide column strips.
; movifprep is a macro defined outside this view; the value passed is 2*width,
; presumably the prep output row stride in bytes - TODO confirm.
; NOTE(review): INIT_XMM here presumably resets the x86inc register-name
; mapping after the SWAPs used in the w4 path - confirm against x86inc.asm.
7536*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
7537*c0909341SAndroid Build Coastguard Worker.dy2_w8:
7538*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 1  ; 1 strip
7539*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 16
7540*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7541*c0909341SAndroid Build Coastguard Worker.dy2_w16:
7542*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 2  ; 2 strips
7543*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 32
7544*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7545*c0909341SAndroid Build Coastguard Worker.dy2_w32:
7546*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 4  ; 4 strips
7547*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 64
7548*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7549*c0909341SAndroid Build Coastguard Worker.dy2_w64:
7550*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 8  ; 8 strips
7551*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 128
7552*c0909341SAndroid Build Coastguard Worker    jmp .dy2_w_start
7553*c0909341SAndroid Build Coastguard Worker.dy2_w128:
7554*c0909341SAndroid Build Coastguard Worker    mov    dword [rsp+0x90], 16 ; 16 strips
7555*c0909341SAndroid Build Coastguard Worker    movifprep   tmp_stridem, 256 ; no jmp: falls through into .dy2_w_start
; .dy2_w_start: one-time setup shared by all widths >= 8 of the dy2 path.
;  - selects the vertical 8-tap filter, expands it to words and spills the
;    four tap pairs to [rsp+0x140..0x170]
;  - spills dx*4, the broadcast m15 value, srcq and the output pointer so
;    that .dy2_hloop_prep can reload them for every further column strip
;  - leaves m14 = mx+dx*[0-3] and jumps to .dy2_hloop for the first strip
; t0, t1, m8 (dx) and m14 (mx) are set up before this view.
7556*c0909341SAndroid Build Coastguard Worker.dy2_w_start:
7557*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
7558*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
7559*c0909341SAndroid Build Coastguard Worker    movifnidn           dsm, dsq
7560*c0909341SAndroid Build Coastguard Worker%endif
7561*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7562*c0909341SAndroid Build Coastguard Worker    shr                 t0d, 16
7563*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3         ; start three bytes left of the current position
7564*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6         ; ZF is set here when my>>6 == 0 and consumed by cmovnz below
7565*c0909341SAndroid Build Coastguard Worker    mov                 r4d, 64 << 24  ; default filter: only tap 3 is non-zero (64); mov keeps flags
7566*c0909341SAndroid Build Coastguard Worker    lea                 myd, [t1+myq]  ; filter table index = t1 + (my>>6); lea keeps flags
7567*c0909341SAndroid Build Coastguard Worker    cmovnz              r4q, [base+subpel_filters+myq*8] ; non-zero my>>6: take the 8 taps from the table
7568*c0909341SAndroid Build Coastguard Worker    movd                m15, t0d       ; m15 = t0 >> 16
7569*c0909341SAndroid Build Coastguard Worker%else
; x86-32: m10/m11 become memory operands and m8/m9/m14/m15 alias low registers.
7570*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
7571*c0909341SAndroid Build Coastguard Worker %define m11 [base+pd_0x4000]
7572*c0909341SAndroid Build Coastguard Worker %define m8   m0
7573*c0909341SAndroid Build Coastguard Worker %define m9   m1
7574*c0909341SAndroid Build Coastguard Worker %xdefine m14 m4
7575*c0909341SAndroid Build Coastguard Worker %xdefine m15 m3
7576*c0909341SAndroid Build Coastguard Worker %if isprep
7577*c0909341SAndroid Build Coastguard Worker  %define tmpq r0
7578*c0909341SAndroid Build Coastguard Worker  %define ssq ssm
7579*c0909341SAndroid Build Coastguard Worker %else
7580*c0909341SAndroid Build Coastguard Worker  %define dstq r0
7581*c0909341SAndroid Build Coastguard Worker %endif
; [esp+0x1f0] / [esp+0x1f4] are used here the way t0 / t1 are on x86-64.
7582*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x1f0]
7583*c0909341SAndroid Build Coastguard Worker    mov                  r3, [esp+0x1f4]
7584*c0909341SAndroid Build Coastguard Worker    shr                  r5, 16
7585*c0909341SAndroid Build Coastguard Worker    sub                srcq, 3         ; start three bytes left of the current position
7586*c0909341SAndroid Build Coastguard Worker    movd                m15, r5
7587*c0909341SAndroid Build Coastguard Worker    xor                  r5, r5        ; default high four taps = 0 (done before shr so its flags survive)
7588*c0909341SAndroid Build Coastguard Worker    shr                 myd, 6         ; ZF is set here when my>>6 == 0 and consumed by both cmovnz below
7589*c0909341SAndroid Build Coastguard Worker    lea                  r3, [r3+myd]  ; filter table index; lea keeps flags
7590*c0909341SAndroid Build Coastguard Worker    mov                  r4, 64 << 24  ; default low four taps: only tap 3 is non-zero (64)
7591*c0909341SAndroid Build Coastguard Worker    cmovnz               r4, [base+subpel_filters+r3*8+0] ; taps 0-3 from the table
7592*c0909341SAndroid Build Coastguard Worker    cmovnz               r5, [base+subpel_filters+r3*8+4] ; taps 4-7 from the table
7593*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
7594*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7595*c0909341SAndroid Build Coastguard Worker%endif
7596*c0909341SAndroid Build Coastguard Worker    pslld                m7, m8, 2 ; dx*4
7597*c0909341SAndroid Build Coastguard Worker    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
7598*c0909341SAndroid Build Coastguard Worker    pshufd              m15, m15, q0000 ; broadcast the low dword to all four lanes
7599*c0909341SAndroid Build Coastguard Worker    paddd               m14, m8 ; mx+dx*[0-3]
; Expand the eight signed 8-bit vertical taps to 16-bit words in m3.
7600*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7601*c0909341SAndroid Build Coastguard Worker    movq                 m3, r4q
7602*c0909341SAndroid Build Coastguard Worker    punpcklbw            m3, m3        ; duplicate each byte into both halves of a word
7603*c0909341SAndroid Build Coastguard Worker    psraw                m3, 8         ; arithmetic shift leaves the sign-extended byte
7604*c0909341SAndroid Build Coastguard Worker%else
7605*c0909341SAndroid Build Coastguard Worker    movd                 m5, r4
7606*c0909341SAndroid Build Coastguard Worker    movd                 m6, r5
7607*c0909341SAndroid Build Coastguard Worker    punpckldq            m5, m6        ; join taps 0-3 and 4-7 into one qword
7608*c0909341SAndroid Build Coastguard Worker    punpcklbw            m5, m5
7609*c0909341SAndroid Build Coastguard Worker    psraw                m5, 8
7610*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m5        ; from here on the name m3 refers to the filter words
7611*c0909341SAndroid Build Coastguard Worker%endif
; Spill the per-strip state; .dy2_hloop_prep reloads these slots.
7612*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x100], m7        ; dx*4
7613*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x120], m15       ; broadcast m15 value
7614*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x098], srcq      ; source pointer of the column strips
7615*c0909341SAndroid Build Coastguard Worker    mov         [rsp+0x130], r0q ; dstq / tmpq
; Broadcast each pair of vertical taps and keep them at [rsp+0x140..0x170].
7616*c0909341SAndroid Build Coastguard Worker    pshufd               m0, m3, q0000 ; taps 0-1
7617*c0909341SAndroid Build Coastguard Worker    pshufd               m1, m3, q1111 ; taps 2-3
7618*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m3, q2222 ; taps 4-5
7619*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3333 ; taps 6-7
7620*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x140], m0
7621*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x150], m1
7622*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x160], m2
7623*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x170], m3
; Keep the row count where .dy2_hloop_prep can reload it for each strip.
; NOTE(review): nothing is emitted for x86-64 targets other than UNIX64;
; presumably hm already holds h in memory there - confirm.
7624*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64 && UNIX64
7625*c0909341SAndroid Build Coastguard Worker    mov                  hm, hd
7626*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_32
7627*c0909341SAndroid Build Coastguard Worker    SWAP                  m5, m3       ; swap the m3/m5 names again (mirrors the SWAP above)
7628*c0909341SAndroid Build Coastguard Worker    mov                   r5, hm
7629*c0909341SAndroid Build Coastguard Worker    mov          [esp+0x134], r5       ; copy of h, reloaded in .dy2_hloop_prep
7630*c0909341SAndroid Build Coastguard Worker%endif
7631*c0909341SAndroid Build Coastguard Worker    jmp .dy2_hloop                     ; first strip: skip the per-strip restore
; .dy2_hloop_prep: advance to the next 8-pixel-wide column strip.
; Decrements the strip counter at [rsp+0x90] and returns when none are left.
; Otherwise it moves the saved output pointer on by 8 pixels, reloads the
; state spilled by .dy2_w_start / .dy2_hloop, and falls through into
; .dy2_hloop with m14 advanced by dx*4.
7632*c0909341SAndroid Build Coastguard Worker.dy2_hloop_prep:
7633*c0909341SAndroid Build Coastguard Worker    dec   dword [rsp+0x090]            ; one strip fewer to do
7634*c0909341SAndroid Build Coastguard Worker    jz .ret                            ; all strips done
7635*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7636*c0909341SAndroid Build Coastguard Worker    add   qword [rsp+0x130], 8*(isprep+1) ; output pointer += 8*(isprep+1) bytes
7637*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm        ; reload the row count
7638*c0909341SAndroid Build Coastguard Worker%else
7639*c0909341SAndroid Build Coastguard Worker    add   dword [rsp+0x130], 8*(isprep+1) ; output pointer += 8*(isprep+1) bytes
7640*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+0x134] ; saved row count
7641*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+0x130] ; updated output pointer
7642*c0909341SAndroid Build Coastguard Worker%endif
7643*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x100] ; dx*4
7644*c0909341SAndroid Build Coastguard Worker    mova                m14, [rsp+0x110] ; x positions saved by .dy2_hloop (mx+dx*[4-7] of the previous strip)
7645*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7646*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pd_0x3ff]
7647*c0909341SAndroid Build Coastguard Worker%else
7648*c0909341SAndroid Build Coastguard Worker %define m10 [base+pd_0x3ff]
7649*c0909341SAndroid Build Coastguard Worker%endif
7650*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x120] ; broadcast m15 value saved by .dy2_w_start
7651*c0909341SAndroid Build Coastguard Worker    mov                srcq, [rsp+0x098] ; source pointer saved by .dy2_w_start
7652*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7653*c0909341SAndroid Build Coastguard Worker    mov                 r0q, [rsp+0x130] ; dstq / tmpq
7654*c0909341SAndroid Build Coastguard Worker%else
7655*c0909341SAndroid Build Coastguard Worker    mov                  hm, r5        ; write the row count back to hm
7656*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0        ; write the output pointer back to r0m
7657*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
7658*c0909341SAndroid Build Coastguard Worker%endif
7659*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7        ; x positions of the first four columns of this strip
7660*c0909341SAndroid Build Coastguard Worker.dy2_hloop:
7661*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
7662*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7663*c0909341SAndroid Build Coastguard Worker    mova                m11, [base+pq_0x40000000]
7664*c0909341SAndroid Build Coastguard Worker%else
7665*c0909341SAndroid Build Coastguard Worker %define m11 [base+pq_0x40000000]
7666*c0909341SAndroid Build Coastguard Worker%endif
7667*c0909341SAndroid Build Coastguard Worker    psrld                m2, m14, 10
7668*c0909341SAndroid Build Coastguard Worker    mova              [rsp], m2
7669*c0909341SAndroid Build Coastguard Worker    pand                 m6, m14, m10
7670*c0909341SAndroid Build Coastguard Worker    psrld                m6, 6
7671*c0909341SAndroid Build Coastguard Worker    paddd                m5, m15, m6
7672*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m6, m9
7673*c0909341SAndroid Build Coastguard Worker    psrldq               m2, m5, 8
7674*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7675*c0909341SAndroid Build Coastguard Worker    movd                r4d, m5
7676*c0909341SAndroid Build Coastguard Worker    movd                r6d, m2
7677*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
7678*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
7679*c0909341SAndroid Build Coastguard Worker    movd                r7d, m5
7680*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
7681*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r4*8]
7682*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+r6*8]
7683*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r7*8]
7684*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r9*8]
7685*c0909341SAndroid Build Coastguard Worker%else
7686*c0909341SAndroid Build Coastguard Worker    movd                 r0, m5
7687*c0909341SAndroid Build Coastguard Worker    movd                 rX, m2
7688*c0909341SAndroid Build Coastguard Worker    psrldq               m5, 4
7689*c0909341SAndroid Build Coastguard Worker    psrldq               m2, 4
7690*c0909341SAndroid Build Coastguard Worker    movd                 r4, m5
7691*c0909341SAndroid Build Coastguard Worker    movd                 r5, m2
7692*c0909341SAndroid Build Coastguard Worker    movq                 m0, [base+subpel_filters+r0*8]
7693*c0909341SAndroid Build Coastguard Worker    movq                 m1, [base+subpel_filters+rX*8]
7694*c0909341SAndroid Build Coastguard Worker    movhps               m0, [base+subpel_filters+r4*8]
7695*c0909341SAndroid Build Coastguard Worker    movhps               m1, [base+subpel_filters+r5*8]
7696*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
7697*c0909341SAndroid Build Coastguard Worker %define m9 m2
7698*c0909341SAndroid Build Coastguard Worker%endif
7699*c0909341SAndroid Build Coastguard Worker    paddd               m14, m7 ; mx+dx*[4-7]
7700*c0909341SAndroid Build Coastguard Worker    pand                 m5, m14, m10
7701*c0909341SAndroid Build Coastguard Worker    psrld                m5, 6
7702*c0909341SAndroid Build Coastguard Worker    paddd               m15, m5
7703*c0909341SAndroid Build Coastguard Worker    pcmpeqd              m5, m9
7704*c0909341SAndroid Build Coastguard Worker    mova        [rsp+0x110], m14
7705*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m15, 8
7706*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7707*c0909341SAndroid Build Coastguard Worker    movd               r10d, m15
7708*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
7709*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
7710*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
7711*c0909341SAndroid Build Coastguard Worker    movd               r13d, m15
7712*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
7713*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r10*8]
7714*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+r11*8]
7715*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r13*8]
7716*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+ rX*8]
7717*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7718*c0909341SAndroid Build Coastguard Worker    psrldq               m4, m14, 8
7719*c0909341SAndroid Build Coastguard Worker    movd               r10d, m14
7720*c0909341SAndroid Build Coastguard Worker    movd               r11d, m4
7721*c0909341SAndroid Build Coastguard Worker    psrldq              m14, 4
7722*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
7723*c0909341SAndroid Build Coastguard Worker    movd               r13d, m14
7724*c0909341SAndroid Build Coastguard Worker    movd                rXd, m4
7725*c0909341SAndroid Build Coastguard Worker    mov                 r4d, [rsp+ 0]
7726*c0909341SAndroid Build Coastguard Worker    mov                 r6d, [rsp+ 8]
7727*c0909341SAndroid Build Coastguard Worker    mov                 r7d, [rsp+ 4]
7728*c0909341SAndroid Build Coastguard Worker    mov                 r9d, [rsp+12]
7729*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
7730*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
7731*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
7732*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
7733*c0909341SAndroid Build Coastguard Worker    pand                 m8, m11, m4
7734*c0909341SAndroid Build Coastguard Worker    pand                 m9, m11, m6
7735*c0909341SAndroid Build Coastguard Worker    pand                m15, m11, m7
7736*c0909341SAndroid Build Coastguard Worker    pand                m11, m11, m5
7737*c0909341SAndroid Build Coastguard Worker    pandn                m4, m0
7738*c0909341SAndroid Build Coastguard Worker    pandn                m6, m1
7739*c0909341SAndroid Build Coastguard Worker    pandn                m7, m2
7740*c0909341SAndroid Build Coastguard Worker    pandn                m5, m3
7741*c0909341SAndroid Build Coastguard Worker    por                  m8, m4
7742*c0909341SAndroid Build Coastguard Worker    por                  m9, m6
7743*c0909341SAndroid Build Coastguard Worker    por                 m15, m7
7744*c0909341SAndroid Build Coastguard Worker    por                 m11, m5
7745*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x10], m8
7746*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x20], m9
7747*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x30], m15
7748*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x40], m11
7749*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1
7750*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m1
7751*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m2
7752*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3
7753*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m3
7754*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m4
7755*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5
7756*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7
7757*c0909341SAndroid Build Coastguard Worker    SWAP                 m7, m0
7758*c0909341SAndroid Build Coastguard Worker    SWAP                 m8, m14
7759*c0909341SAndroid Build Coastguard Worker    mova                 m1, [rsp+0x50]
7760*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x60]
7761*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x70]
7762*c0909341SAndroid Build Coastguard Worker    mova                m15, [rsp+0x80]
7763*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6 ; 45a
7764*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6     ; 45b
7765*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m8 ; 67a
7766*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m8     ; 67b
7767*c0909341SAndroid Build Coastguard Worker    SWAP                m14, m8
7768*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x140]
7769*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+0x150]
7770*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x160]
7771*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x170]
7772*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2 ; 01a
7773*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2     ; 01b
7774*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m15; 23a
7775*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m15    ; 23b
7776*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m4
7777*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m5
7778*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m6
7779*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m7
7780*c0909341SAndroid Build Coastguard Worker%else
7781*c0909341SAndroid Build Coastguard Worker    movd                 r0, m15
7782*c0909341SAndroid Build Coastguard Worker    movd                 rX, m4
7783*c0909341SAndroid Build Coastguard Worker    psrldq              m15, 4
7784*c0909341SAndroid Build Coastguard Worker    psrldq               m4, 4
7785*c0909341SAndroid Build Coastguard Worker    movd                 r4, m15
7786*c0909341SAndroid Build Coastguard Worker    movd                 r5, m4
7787*c0909341SAndroid Build Coastguard Worker    mova                m14, [esp+0x110]
7788*c0909341SAndroid Build Coastguard Worker    movq                 m2, [base+subpel_filters+r0*8]
7789*c0909341SAndroid Build Coastguard Worker    movq                 m3, [base+subpel_filters+rX*8]
7790*c0909341SAndroid Build Coastguard Worker    movhps               m2, [base+subpel_filters+r4*8]
7791*c0909341SAndroid Build Coastguard Worker    movhps               m3, [base+subpel_filters+r5*8]
7792*c0909341SAndroid Build Coastguard Worker    psrld               m14, 10
7793*c0909341SAndroid Build Coastguard Worker    mova           [esp+16], m14
7794*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+ 0]
7795*c0909341SAndroid Build Coastguard Worker    mov                  rX, [esp+ 8]
7796*c0909341SAndroid Build Coastguard Worker    mov                  r4, [esp+ 4]
7797*c0909341SAndroid Build Coastguard Worker    mov                  r5, [esp+12]
7798*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x20], m0
7799*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x30], m1
7800*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x40], m2
7801*c0909341SAndroid Build Coastguard Worker    mova         [esp+0x50], m3
7802*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m6, q1100
7803*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q3322
7804*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m5, q1100
7805*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q3322
7806*c0909341SAndroid Build Coastguard Worker    pand                 m0, m11, m4
7807*c0909341SAndroid Build Coastguard Worker    pand                 m1, m11, m6
7808*c0909341SAndroid Build Coastguard Worker    pand                 m2, m11, m7
7809*c0909341SAndroid Build Coastguard Worker    pand                 m3, m11, m5
7810*c0909341SAndroid Build Coastguard Worker    pandn                m4, [esp+0x20]
7811*c0909341SAndroid Build Coastguard Worker    pandn                m6, [esp+0x30]
7812*c0909341SAndroid Build Coastguard Worker    pandn                m7, [esp+0x40]
7813*c0909341SAndroid Build Coastguard Worker    pandn                m5, [esp+0x50]
7814*c0909341SAndroid Build Coastguard Worker    por                  m0, m4
7815*c0909341SAndroid Build Coastguard Worker    por                  m1, m6
7816*c0909341SAndroid Build Coastguard Worker    por                  m2, m7
7817*c0909341SAndroid Build Coastguard Worker    por                  m3, m5
7818*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x20], m0
7819*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x30], m1
7820*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x40], m2
7821*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x50], m3
7822*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x60, 0 ; 0-1
7823*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x180   ; 2-3
7824*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1a0   ; 4-5
7825*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0x1c0   ; 6-7
7826*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1a0]
7827*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+0x1b0]
7828*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+0x1c0]
7829*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x1d0]
7830*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m5, m6      ; 45a
7831*c0909341SAndroid Build Coastguard Worker    punpckhwd            m5, m6          ; 45b
7832*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m7, m0      ; 67a
7833*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0          ; 67b
7834*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m4
7835*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m5
7836*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m6
7837*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m7
7838*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x060]
7839*c0909341SAndroid Build Coastguard Worker    mova                 m2, [esp+0x070]
7840*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x180]
7841*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x190]
7842*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m1, m2      ; 01a
7843*c0909341SAndroid Build Coastguard Worker    punpckhwd            m1, m2          ; 01b
7844*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3, m4      ; 23a
7845*c0909341SAndroid Build Coastguard Worker    punpckhwd            m3, m4          ; 23b
7846*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m2
7847*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m3
7848*c0909341SAndroid Build Coastguard Worker %define m8  [esp+0x140]
7849*c0909341SAndroid Build Coastguard Worker %define m9  [esp+0x150]
7850*c0909341SAndroid Build Coastguard Worker %define m10 [esp+0x160]
7851*c0909341SAndroid Build Coastguard Worker %define m11 [esp+0x170]
7852*c0909341SAndroid Build Coastguard Worker%endif
7853*c0909341SAndroid Build Coastguard Worker.dy2_vloop:
7854*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7855*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
7856*c0909341SAndroid Build Coastguard Worker%endif
7857*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m0, m8
7858*c0909341SAndroid Build Coastguard Worker    pmaddwd              m5, m1, m8
7859*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, m2, m9
7860*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, m3, m9
7861*c0909341SAndroid Build Coastguard Worker    paddd                m4, m13
7862*c0909341SAndroid Build Coastguard Worker    paddd                m5, m13
7863*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7864*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
7865*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7866*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x50], m10
7867*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x60], m10
7868*c0909341SAndroid Build Coastguard Worker%else
7869*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [esp+0x1a0], m10
7870*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [esp+0x1b0], m10
7871*c0909341SAndroid Build Coastguard Worker%endif
7872*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7873*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
7874*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7875*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [rsp+0x70], m11
7876*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [rsp+0x80], m11
7877*c0909341SAndroid Build Coastguard Worker%else
7878*c0909341SAndroid Build Coastguard Worker    pmaddwd              m6, [esp+0x1c0], m11
7879*c0909341SAndroid Build Coastguard Worker    pmaddwd              m7, [esp+0x1d0], m11
7880*c0909341SAndroid Build Coastguard Worker%endif
7881*c0909341SAndroid Build Coastguard Worker    paddd                m4, m6
7882*c0909341SAndroid Build Coastguard Worker    paddd                m5, m7
7883*c0909341SAndroid Build Coastguard Worker    psrad                m4, rndshift
7884*c0909341SAndroid Build Coastguard Worker    psrad                m5, rndshift
7885*c0909341SAndroid Build Coastguard Worker    packssdw             m4, m5
7886*c0909341SAndroid Build Coastguard Worker%ifidn %1, put
7887*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
7888*c0909341SAndroid Build Coastguard Worker    movq             [dstq], m4
7889*c0909341SAndroid Build Coastguard Worker    add                dstq, dsm
7890*c0909341SAndroid Build Coastguard Worker%else
7891*c0909341SAndroid Build Coastguard Worker    mova             [tmpq], m4
7892*c0909341SAndroid Build Coastguard Worker    add                tmpq, tmp_stridem
7893*c0909341SAndroid Build Coastguard Worker%endif
7894*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
7895*c0909341SAndroid Build Coastguard Worker    mov                 r0m, r0
7896*c0909341SAndroid Build Coastguard Worker%endif
7897*c0909341SAndroid Build Coastguard Worker    dec                  hd
7898*c0909341SAndroid Build Coastguard Worker    jz .dy2_hloop_prep
7899*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
7900*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x10]
7901*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+0x20]
7902*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x30]
7903*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x40]
7904*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2             ; 01a
7905*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3             ; 01b
7906*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11
7907*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+0x50] ; 23a
7908*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+0x60] ; 23b
7909*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+0x70] ; 45a
7910*c0909341SAndroid Build Coastguard Worker    mova                 m7, [rsp+0x80] ; 45b
7911*c0909341SAndroid Build Coastguard Worker    mova                 m8, [rsp+0x140]
7912*c0909341SAndroid Build Coastguard Worker    mova                 m9, [rsp+0x150]
7913*c0909341SAndroid Build Coastguard Worker    mova                m10, [rsp+0x160]
7914*c0909341SAndroid Build Coastguard Worker    mova                m11, [rsp+0x170]
7915*c0909341SAndroid Build Coastguard Worker    punpcklwd           m14, m2, m6     ; 67a
7916*c0909341SAndroid Build Coastguard Worker    punpckhwd            m2, m6         ; 67b
7917*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x50], m5
7918*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x60], m7
7919*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x70], m14
7920*c0909341SAndroid Build Coastguard Worker    mova         [rsp+0x80], m2
7921*c0909341SAndroid Build Coastguard Worker    mova                 m2, m3
7922*c0909341SAndroid Build Coastguard Worker    mova                 m3, m4
7923*c0909341SAndroid Build Coastguard Worker%else
7924*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_H   0x20, 0
7925*c0909341SAndroid Build Coastguard Worker    punpcklwd            m6, m0, m4
7926*c0909341SAndroid Build Coastguard Worker    punpckhwd            m7, m0, m4
7927*c0909341SAndroid Build Coastguard Worker    mova                 m0, [esp+0x180] ; 01a
7928*c0909341SAndroid Build Coastguard Worker    mova                 m1, [esp+0x190] ; 01b
7929*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+0x1a0]  ; 23a
7930*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+0x1b0]  ; 23b
7931*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+0x1c0]  ; 45a
7932*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+0x1d0]  ; 45b
7933*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x180], m2
7934*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x190], m3
7935*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1a0], m4
7936*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1b0], m5
7937*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1c0], m6          ; 67a
7938*c0909341SAndroid Build Coastguard Worker    mova        [esp+0x1d0], m7          ; 67b
7939*c0909341SAndroid Build Coastguard Worker%endif
7940*c0909341SAndroid Build Coastguard Worker    jmp .dy2_vloop
7941*c0909341SAndroid Build Coastguard Worker.ret:
7942*c0909341SAndroid Build Coastguard Worker    MC_8TAP_SCALED_RET 0
7943*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
7944*c0909341SAndroid Build Coastguard Worker %define r0m [rstk+stack_offset+ 4]
7945*c0909341SAndroid Build Coastguard Worker %define r1m [rstk+stack_offset+ 8]
7946*c0909341SAndroid Build Coastguard Worker %define r2m [rstk+stack_offset+12]
7947*c0909341SAndroid Build Coastguard Worker %define r3m [rstk+stack_offset+16]
7948*c0909341SAndroid Build Coastguard Worker%endif
7949*c0909341SAndroid Build Coastguard Worker%undef isprep
7950*c0909341SAndroid Build Coastguard Worker%endmacro
7951*c0909341SAndroid Build Coastguard Worker
; BILIN_SCALED_FN <name>
; Emits the <name>_bilin_scaled_8bpc entry point (instantiated in this file
; with "put" and "prep"). The entry point loads the same packed constant,
; 5*15 in both the low and the high 16 bits, into t0d and t1d and then
; tail-jumps to the matching <name>_8tap_scaled_8bpc function.
; NOTE(review): 5*15 is presumably the filter-table offset that selects the
; bilinear coefficients, mirroring what the FN macro loads for the 8-tap
; filter types -- confirm against the FN macro and the subpel_filters table.
7952*c0909341SAndroid Build Coastguard Worker%macro BILIN_SCALED_FN 1
7953*c0909341SAndroid Build Coastguard Workercglobal %1_bilin_scaled_8bpc
7954*c0909341SAndroid Build Coastguard Worker    mov                 t0d, (5*15 << 16) | 5*15
7955*c0909341SAndroid Build Coastguard Worker    mov                 t1d, (5*15 << 16) | 5*15
    ; tail-jump (not a call): control does not come back to this stub
7956*c0909341SAndroid Build Coastguard Worker    jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX)
7957*c0909341SAndroid Build Coastguard Worker%endmacro
7958*c0909341SAndroid Build Coastguard Worker
; Instantiation of the "put" scaled motion-compensation functions.
; DECLARE_REG_TMP (defined elsewhere) is given a different register pair for
; WIN64, other x86-64 targets and x86-32; presumably it binds the t0/t1
; temporaries used by BILIN_SCALED_FN -- confirm against its definition.
7959*c0909341SAndroid Build Coastguard Worker%if WIN64
7960*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 5
7961*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
7962*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 8
7963*c0909341SAndroid Build Coastguard Worker%else
7964*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2
7965*c0909341SAndroid Build Coastguard Worker%endif
; PUT_8TAP_SCALED_FN forwards to FN with "put_8tap_scaled" as the leading
; argument; the trailing comma is intentional so that the arguments written
; at each use site are appended to the list.
7966*c0909341SAndroid Build Coastguard Worker%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
7967*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN put
; One entry per filter-type combination. Every entry except the last names
; put_8tap_scaled_8bpc as an extra argument.
7968*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   put_8tap_scaled_8bpc
7969*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_scaled_8bpc
7970*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_scaled_8bpc
7971*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  put_8tap_scaled_8bpc
7972*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, put_8tap_scaled_8bpc
7973*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   put_8tap_scaled_8bpc
7974*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, put_8tap_scaled_8bpc
7975*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  put_8tap_scaled_8bpc
; NOTE(review): the "regular" entry passes no jump target; presumably it is
; meant to fall through into the body emitted by MC_8TAP_SCALED directly
; below, so these two lines must stay adjacent -- confirm against FN.
7976*c0909341SAndroid Build Coastguard WorkerPUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
7977*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED put
7978*c0909341SAndroid Build Coastguard Worker
; Instantiation of the "prep" scaled motion-compensation functions.
; DECLARE_REG_TMP (defined elsewhere) is given a different register pair for
; WIN64, other x86-64 targets and x86-32; note that the 64-bit pairs differ
; from the ones used for the "put" instantiation earlier in the file.
7979*c0909341SAndroid Build Coastguard Worker%if WIN64
7980*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5, 4
7981*c0909341SAndroid Build Coastguard Worker%elif ARCH_X86_64
7982*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
7983*c0909341SAndroid Build Coastguard Worker%else
7984*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 1, 2
7985*c0909341SAndroid Build Coastguard Worker%endif
; PREP_8TAP_SCALED_FN forwards to FN with "prep_8tap_scaled" as the leading
; argument; the trailing comma is intentional so that the arguments written
; at each use site are appended to the list.
7986*c0909341SAndroid Build Coastguard Worker%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
7987*c0909341SAndroid Build Coastguard WorkerBILIN_SCALED_FN prep
; One entry per filter-type combination. Every entry except the last names
; prep_8tap_scaled_8bpc as an extra argument.
7988*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP,   prep_8tap_scaled_8bpc
7989*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_scaled_8bpc
7990*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_scaled_8bpc
7991*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH,  prep_8tap_scaled_8bpc
7992*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_scaled_8bpc
7993*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_scaled_8bpc
7994*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR, prep_8tap_scaled_8bpc
7995*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH,  prep_8tap_scaled_8bpc
; NOTE(review): the "regular" entry passes no jump target; presumably it is
; meant to fall through into the body emitted by MC_8TAP_SCALED directly
; below, so these two lines must stay adjacent -- confirm against FN.
7996*c0909341SAndroid Build Coastguard WorkerPREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
7997*c0909341SAndroid Build Coastguard WorkerMC_8TAP_SCALED prep
7998*c0909341SAndroid Build Coastguard Worker
; Helpers for the warp code that follows.
; x86-32 branch: macros that move alpha/beta/delta/gamma/mx/my between their
; register names (..d) and their memory slots (..m), plus PIC_sym() for
; addressing constants relative to PIC_reg.
; Other targets: the SAVE_* macros expand to nothing and PIC_sym(sym) is just
; sym. LOAD_ALPHA_BETA_MX and LOAD_DELTA_GAMMA_MY are NOT defined in that
; branch, so they may only be used under ARCH_X86_32.
; NOTE(review): presumably several of these names share a register on x86-32,
; which is why values are spilled before others are reloaded -- confirm
; against the register declarations of the warp functions.
7999*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
 ; store alpha and beta from registers to their memory slots
8000*c0909341SAndroid Build Coastguard Worker %macro SAVE_ALPHA_BETA 0
8001*c0909341SAndroid Build Coastguard Worker    mov              alpham, alphad
8002*c0909341SAndroid Build Coastguard Worker    mov               betam, betad
8003*c0909341SAndroid Build Coastguard Worker %endmacro
8004*c0909341SAndroid Build Coastguard Worker
 ; store delta and gamma from registers to their memory slots
8005*c0909341SAndroid Build Coastguard Worker %macro SAVE_DELTA_GAMMA 0
8006*c0909341SAndroid Build Coastguard Worker    mov              deltam, deltad
8007*c0909341SAndroid Build Coastguard Worker    mov              gammam, gammad
8008*c0909341SAndroid Build Coastguard Worker %endmacro
8009*c0909341SAndroid Build Coastguard Worker
 ; spill my to memory first, then reload alpha, beta and mx from memory
8010*c0909341SAndroid Build Coastguard Worker %macro LOAD_ALPHA_BETA_MX 0
8011*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
8012*c0909341SAndroid Build Coastguard Worker    mov              alphad, alpham
8013*c0909341SAndroid Build Coastguard Worker    mov               betad, betam
8014*c0909341SAndroid Build Coastguard Worker    mov                 mxd, mxm
8015*c0909341SAndroid Build Coastguard Worker %endmacro
8016*c0909341SAndroid Build Coastguard Worker
 ; spill mx to memory first, then reload delta, gamma and my from memory
8017*c0909341SAndroid Build Coastguard Worker %macro LOAD_DELTA_GAMMA_MY 0
8018*c0909341SAndroid Build Coastguard Worker    mov                 mxm, mxd
8019*c0909341SAndroid Build Coastguard Worker    mov              deltad, deltam
8020*c0909341SAndroid Build Coastguard Worker    mov              gammad, gammam
8021*c0909341SAndroid Build Coastguard Worker    mov                 myd, mym
8022*c0909341SAndroid Build Coastguard Worker %endmacro
8023*c0909341SAndroid Build Coastguard Worker
 ; PIC_sym(sym) = PIC_reg + (sym - $$), where $$ is NASM's start-of-section
 ; token; PIC_reg (r2) is expected to hold the run-time address matching $$.
 ; NOTE(review): the code that loads PIC_reg is not in this part of the file.
8024*c0909341SAndroid Build Coastguard Worker %define PIC_reg r2
8025*c0909341SAndroid Build Coastguard Worker %define PIC_base_offset $$
8026*c0909341SAndroid Build Coastguard Worker %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
8027*c0909341SAndroid Build Coastguard Worker%else
 ; non-x86-32: nothing to spill, symbols are referenced directly
8028*c0909341SAndroid Build Coastguard Worker %define SAVE_ALPHA_BETA
8029*c0909341SAndroid Build Coastguard Worker %define SAVE_DELTA_GAMMA
8030*c0909341SAndroid Build Coastguard Worker %define PIC_sym(sym) sym
8031*c0909341SAndroid Build Coastguard Worker%endif
8032*c0909341SAndroid Build Coastguard Worker
; copy_args (x86-32 only): 8*4 when the platform's STACK_ALIGNMENT is smaller
; than required_stack_alignment, otherwise 0. It gates RELOC_ARGS, is
; subtracted from the stack size requested by the warp cglobal declaration
; and selects how counterd is addressed.
; NOTE(review): 8*4 presumably reserves eight 4-byte slots for relocated
; arguments -- confirm against the ..m slot definitions in the warp functions.
; This block does not assign copy_args on x86-64.
8033*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8034*c0909341SAndroid Build Coastguard Worker %if STACK_ALIGNMENT < required_stack_alignment
8035*c0909341SAndroid Build Coastguard Worker  %assign copy_args 8*4
8036*c0909341SAndroid Build Coastguard Worker %else
8037*c0909341SAndroid Build Coastguard Worker  %assign copy_args 0
8038*c0909341SAndroid Build Coastguard Worker %endif
8039*c0909341SAndroid Build Coastguard Worker%endif
8040*c0909341SAndroid Build Coastguard Worker
; RELOC_ARGS
; Expands to code only when copy_args is non-zero. It reads the incoming
; argument slots r0m-r3m, r5m and r6m and stores them to the dstm, dsm, srcm,
; ssm, mxm and mym slots. r4m is not copied here.
; Register state afterwards: r1, r2, r3 and r5 hold the values of r1m, r2m,
; r3m and r5m; r0 is reused as scratch and ends up holding the value of r6m.
; NOTE(review): dstm/dsm/srcm/ssm/mxm/mym are defined elsewhere; presumably
; they point into the function's own re-aligned stack frame -- confirm.
8041*c0909341SAndroid Build Coastguard Worker%macro RELOC_ARGS 0
8042*c0909341SAndroid Build Coastguard Worker %if copy_args
    ; load the arguments into registers ...
8043*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
8044*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
8045*c0909341SAndroid Build Coastguard Worker    mov                  r2, r2m
8046*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
8047*c0909341SAndroid Build Coastguard Worker    mov                  r5, r5m
    ; ... and write them to their new slots
8048*c0909341SAndroid Build Coastguard Worker    mov                dstm, r0
8049*c0909341SAndroid Build Coastguard Worker    mov                 dsm, r1
8050*c0909341SAndroid Build Coastguard Worker    mov                srcm, r2
8051*c0909341SAndroid Build Coastguard Worker    mov                 ssm, r3
8052*c0909341SAndroid Build Coastguard Worker    mov                 mxm, r5
    ; r6m goes through r0, whose previous value has already been stored
8053*c0909341SAndroid Build Coastguard Worker    mov                  r0, r6m
8054*c0909341SAndroid Build Coastguard Worker    mov                 mym, r0
8055*c0909341SAndroid Build Coastguard Worker %endif
8056*c0909341SAndroid Build Coastguard Worker%endmacro
8057*c0909341SAndroid Build Coastguard Worker
; BLENDHWDW dst, src
; Puts the high word of every dword of src into dst.
;  - SSE4 path: pblendw with mask 0xAA replaces words 1, 3, 5 and 7 of dst
;    with the corresponding words of src; src is left unchanged.
;  - Fallback path: src is ANDed with m10 and ORed into dst, so src is
;    clobbered. NOTE(review): this assumes m10 holds a mask that keeps only
;    the high word of each dword and that those words of dst are already
;    zero -- confirm at the call sites.
8058*c0909341SAndroid Build Coastguard Worker%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
8059*c0909341SAndroid Build Coastguard Worker %if cpuflag(sse4)
8060*c0909341SAndroid Build Coastguard Worker    pblendw              %1, %2, 0xAA
8061*c0909341SAndroid Build Coastguard Worker %else
8062*c0909341SAndroid Build Coastguard Worker    pand                 %2, m10
8063*c0909341SAndroid Build Coastguard Worker    por                  %1, %2
8064*c0909341SAndroid Build Coastguard Worker %endif
8065*c0909341SAndroid Build Coastguard Worker%endmacro
8066*c0909341SAndroid Build Coastguard Worker
; WARP_V dst0, dst1, src0, src2, src4, src6, src1, src3, src5, src7
; Vertical warp filter step. Eight 8-byte filter rows (labelled a..h below)
; are loaded from filterq at the indices (my + k*delta) >> 10 for k = 0..7.
; Rows a-d are multiplied (pmaddwd) with arguments 3, 5, 7 and 9 and summed
; into dst0; rows e-h are multiplied with arguments 4, 6, 8 and 10 and summed
; into dst1. The coefficient bytes are unpacked behind a zero byte, so the
; products are scaled by 256 (the "<< 8" in the lane comments).
; On exit: myd = (my on entry) + 3*delta + gamma.
; Clobbers: m0-m3, m8, m9, m14, m15, tmp1, tmp2 (on x86-32 the names m8, m9,
; m14 and m15 alias m4, m5, m6 and m7).
; NOTE(review): on x86-64 m11 is read as an all-zero register but is not
; cleared here; presumably the caller zeroes it -- confirm.
8067*c0909341SAndroid Build Coastguard Worker%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
8068*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
  ; only eight vector registers: map the high names onto m4-m7.
  ; m15 and m11 both map to m7, so the zero register is overwritten by the
  ; final unpack of the e-h half and is re-zeroed on every expansion.
8069*c0909341SAndroid Build Coastguard Worker  %define m8  m4
8070*c0909341SAndroid Build Coastguard Worker  %define m9  m5
8071*c0909341SAndroid Build Coastguard Worker  %define m14 m6
8072*c0909341SAndroid Build Coastguard Worker  %define m15 m7
8073*c0909341SAndroid Build Coastguard Worker  %define m11 m7
8074*c0909341SAndroid Build Coastguard Worker %endif
8075*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
8076*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11
8077*c0909341SAndroid Build Coastguard Worker %endif
    ; rows a (my) and e (my + 4*delta)
8078*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]
8079*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]
8080*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
8081*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8082*c0909341SAndroid Build Coastguard Worker    movq                 m2, [filterq+myq  *8] ; a
8083*c0909341SAndroid Build Coastguard Worker    movq                 m8, [filterq+tmp1q*8] ; e
    ; rows b (my + delta) and f (my + 5*delta); my advances to my + 2*delta
8084*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]
8085*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+deltaq*1]
8086*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
8087*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8088*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmp2q*8] ; b
8089*c0909341SAndroid Build Coastguard Worker    movq                 m0, [filterq+tmp1q*8] ; f
    ; interleave the words of a with b, and of e with f
8090*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m3
8091*c0909341SAndroid Build Coastguard Worker    punpcklwd            m8, m0
    ; rows c (my + 2*delta) and g (my + 6*delta)
8092*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [myq+deltaq*4]
8093*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [myq+deltaq*1]
8094*c0909341SAndroid Build Coastguard Worker    shr                 myd, 10
8095*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8096*c0909341SAndroid Build Coastguard Worker    movq                 m0, [filterq+myq  *8] ; c
8097*c0909341SAndroid Build Coastguard Worker    movq                 m9, [filterq+tmp1q*8] ; g
    ; rows d (my + 3*delta) and h (my + 7*delta)
8098*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+deltaq*4]
8099*c0909341SAndroid Build Coastguard Worker    lea                 myd, [tmp2q+gammaq]       ; my = entry my + 3*delta + gamma ("my += gamma" presumably relies on gamma being pre-biased by the caller; TODO confirm)
8100*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
8101*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8102*c0909341SAndroid Build Coastguard Worker    movq                 m3, [filterq+tmp2q*8] ; d
8103*c0909341SAndroid Build Coastguard Worker    movq                 m1, [filterq+tmp1q*8] ; h
    ; interleave the words of c with d, and of g with h
8104*c0909341SAndroid Build Coastguard Worker    punpcklwd            m0, m3
8105*c0909341SAndroid Build Coastguard Worker    punpcklwd            m9, m1
    ; first half: rows a-d, accumulated into the first destination
8106*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2, m0
8107*c0909341SAndroid Build Coastguard Worker    punpckhdq            m2, m0
8108*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
8109*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
8110*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
8111*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
8112*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, %3
8113*c0909341SAndroid Build Coastguard Worker    pmaddwd              m3, %5
8114*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, %7
8115*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, %9
8116*c0909341SAndroid Build Coastguard Worker    paddd                m0, m3
8117*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
8118*c0909341SAndroid Build Coastguard Worker    paddd                m0, m1
8119*c0909341SAndroid Build Coastguard Worker    mova                 %1, m0
    ; x86-64 only: exchange the names m3/m14 (x86inc SWAP renames registers at
    ; assembly time) for the second half; the exchange is undone at the end
8120*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
8121*c0909341SAndroid Build Coastguard Worker    SWAP                 m3, m14
8122*c0909341SAndroid Build Coastguard Worker %endif
    ; second half: rows e-h, accumulated into the second destination
8123*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m8, m9
8124*c0909341SAndroid Build Coastguard Worker    punpckhdq            m8, m9
8125*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
8126*c0909341SAndroid Build Coastguard Worker    punpckhbw           m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
8127*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
8128*c0909341SAndroid Build Coastguard Worker    punpckhbw           m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
8129*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, %4
8130*c0909341SAndroid Build Coastguard Worker    pmaddwd             m14, %6
8131*c0909341SAndroid Build Coastguard Worker    pmaddwd              m2, %8
8132*c0909341SAndroid Build Coastguard Worker    pmaddwd             m15, %10
8133*c0909341SAndroid Build Coastguard Worker    paddd                m1, m14
8134*c0909341SAndroid Build Coastguard Worker    paddd                m2, m15
8135*c0909341SAndroid Build Coastguard Worker    paddd                m1, m2
8136*c0909341SAndroid Build Coastguard Worker    mova                 %2, m1
8137*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
8138*c0909341SAndroid Build Coastguard Worker    SWAP                m14, m3
8139*c0909341SAndroid Build Coastguard Worker %endif
8140*c0909341SAndroid Build Coastguard Worker%endmacro
8141*c0909341SAndroid Build Coastguard Worker
; counterd: the loop counter operand decremented by the warp loops below.
;  - x86-64: the register r4d.
;  - x86-32 with copy_args == 0: the dword argument slot r4m.
;  - x86-32 with copy_args != 0: a dword slot at esp+stack_size-4*7.
; The explicit "dword" gives the memory forms an operand size for "dec".
8142*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8143*c0909341SAndroid Build Coastguard Worker %define counterd r4d
8144*c0909341SAndroid Build Coastguard Worker%else
8145*c0909341SAndroid Build Coastguard Worker %if copy_args == 0
8146*c0909341SAndroid Build Coastguard Worker  %define counterd dword r4m
8147*c0909341SAndroid Build Coastguard Worker %else
8148*c0909341SAndroid Build Coastguard Worker  %define counterd dword [esp+stack_size-4*7]
8149*c0909341SAndroid Build Coastguard Worker %endif
8150*c0909341SAndroid Build Coastguard Worker%endif
8151*c0909341SAndroid Build Coastguard Worker
; WARP_AFFINE_8X8 (no parameters): emits both 8-bit 8x8 warp entry points for
; the instruction set currently selected in cpuname:
;   warp_affine_8x8t_8bpc  stores rows of eight 16-bit values through tmp
;   warp_affine_8x8_8bpc   stores rows of eight 8-bit pixels through dst
; The .main/.main2/.h subroutines are assembled once, under
; warp_affine_8x8_8bpc, and this 8x8t variant reaches them through mangle().
8152*c0909341SAndroid Build Coastguard Worker%macro WARP_AFFINE_8X8 0
; --- warp_affine_8x8t_8bpc ---
; Only the first two arguments are named here: tmp (output pointer) and ts
; (output stride). The register/stack parameters of this cglobal are the same
; as those of warp_affine_8x8_8bpc, whose subroutines and RET are shared.
8153*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8154*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts
8155*c0909341SAndroid Build Coastguard Worker%else
8156*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts
8157*c0909341SAndroid Build Coastguard Worker %if copy_args
8158*c0909341SAndroid Build Coastguard Worker  %define tmpm [esp+stack_size-4*1]
8159*c0909341SAndroid Build Coastguard Worker  %define tsm  [esp+stack_size-4*2]
8160*c0909341SAndroid Build Coastguard Worker %endif
8161*c0909341SAndroid Build Coastguard Worker%endif
; .main has no ret of its own and falls through into .main2, so when this
; call returns the first two output rows are available as dwords:
; m12/m13 and m14/m15 on x86-64, the stack slots loaded below on x86-32.
8162*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main
8163*c0909341SAndroid Build Coastguard Worker.loop:
8164*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; x86-32 only has xmm0-7: alias the high register names onto m4-m7 and fetch
; the two rows from the stack.
8165*c0909341SAndroid Build Coastguard Worker %define m12 m4
8166*c0909341SAndroid Build Coastguard Worker %define m13 m5
8167*c0909341SAndroid Build Coastguard Worker %define m14 m6
8168*c0909341SAndroid Build Coastguard Worker %define m15 m7
8169*c0909341SAndroid Build Coastguard Worker    mova                m12, [esp+0xC0]
8170*c0909341SAndroid Build Coastguard Worker    mova                m13, [esp+0xD0]
8171*c0909341SAndroid Build Coastguard Worker    mova                m14, [esp+0xE0]
8172*c0909341SAndroid Build Coastguard Worker    mova                m15, [esp+0xF0]
8173*c0909341SAndroid Build Coastguard Worker%endif
; Narrow each row from 2x4 dwords to 8 words: arithmetic shift right by 13,
; signed-saturating pack, then a rounding multiply by the pw_8192 constant.
8174*c0909341SAndroid Build Coastguard Worker    psrad               m12, 13
8175*c0909341SAndroid Build Coastguard Worker    psrad               m13, 13
8176*c0909341SAndroid Build Coastguard Worker    psrad               m14, 13
8177*c0909341SAndroid Build Coastguard Worker    psrad               m15, 13
8178*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
8179*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
8180*c0909341SAndroid Build Coastguard Worker    mova                m13, [PIC_sym(pw_8192)]
8181*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m13 ; (x + (1 << 6)) >> 7
8182*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m13
; Store the row pair; the two rows are tsq*2 bytes apart.
; NOTE(review): this suggests ts counts 16-bit elements - confirm with the
; caller.
8183*c0909341SAndroid Build Coastguard Worker    mova       [tmpq+tsq*0], m12
8184*c0909341SAndroid Build Coastguard Worker    mova       [tmpq+tsq*2], m14
8185*c0909341SAndroid Build Coastguard Worker    dec            counterd
; Counter reached zero: leave through the RET at .end of warp_affine_8x8_8bpc.
8186*c0909341SAndroid Build Coastguard Worker    jz   mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end
8187*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Spill the output pointer, then reload r0/r1 from the same two stack slots
; that warp_affine_8x8_8bpc's .loop reloads into alphad/betad before .main2.
8188*c0909341SAndroid Build Coastguard Worker    mov                tmpm, tmpd
8189*c0909341SAndroid Build Coastguard Worker    mov                  r0, [esp+0x100]
8190*c0909341SAndroid Build Coastguard Worker    mov                  r1, [esp+0x104]
8191*c0909341SAndroid Build Coastguard Worker%endif
; Produce the next two rows, then advance tmp by tsq*4 bytes (two rows at the
; spacing used by the stores above).
8192*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2
8193*c0909341SAndroid Build Coastguard Worker    lea                tmpq, [tmpq+tsq*4]
8194*c0909341SAndroid Build Coastguard Worker    jmp .loop
8195*c0909341SAndroid Build Coastguard Worker
; --- warp_affine_8x8_8bpc ---
; Named operands, in order: dst, ds, src, ss, abcd, mx, tmp2, alpha, beta,
; filter, tmp1, delta, my, gamma. On x86-64 the first six are loaded as
; arguments (cglobal "6, 14"); the my value is fetched from r6m inside .main.
; Each loop iteration stores two rows of 8 pixels; counterd (set to 4 in
; .main) counts the iterations.
8196*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8197*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \
8198*c0909341SAndroid Build Coastguard Worker                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
8199*c0909341SAndroid Build Coastguard Worker                              filter, tmp1, delta, my, gamma
8200*c0909341SAndroid Build Coastguard Worker%else
8201*c0909341SAndroid Build Coastguard Workercglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \
8202*c0909341SAndroid Build Coastguard Worker                              dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
8203*c0909341SAndroid Build Coastguard Worker                              filter, tmp1, delta, my, gamma
; x86-32: only 7 GPRs are declared, so the names past the seventh are
; re-pointed at low registers and given stack homes. alpha/delta share r0 and
; beta/gamma share r1 (dst and ds are r0/r1 as well, being the first two
; names). The [esp+gprsize+...] homes are written for use inside .main/.main2,
; where a return address sits on top of the stack; the loop below addresses
; the same slots without the gprsize term.
8204*c0909341SAndroid Build Coastguard Worker %define alphaq     r0
8205*c0909341SAndroid Build Coastguard Worker %define alphad     r0
8206*c0909341SAndroid Build Coastguard Worker %define alpham     [esp+gprsize+0x100]
8207*c0909341SAndroid Build Coastguard Worker %define betaq      r1
8208*c0909341SAndroid Build Coastguard Worker %define betad      r1
8209*c0909341SAndroid Build Coastguard Worker %define betam      [esp+gprsize+0x104]
8210*c0909341SAndroid Build Coastguard Worker %define deltaq     r0
8211*c0909341SAndroid Build Coastguard Worker %define deltad     r0
8212*c0909341SAndroid Build Coastguard Worker %define deltam     [esp+gprsize+0x108]
8213*c0909341SAndroid Build Coastguard Worker %define gammaq     r1
8214*c0909341SAndroid Build Coastguard Worker %define gammad     r1
8215*c0909341SAndroid Build Coastguard Worker %define gammam     [esp+gprsize+0x10C]
8216*c0909341SAndroid Build Coastguard Worker %define filterq    r3
8217*c0909341SAndroid Build Coastguard Worker %define tmp1q      r4
8218*c0909341SAndroid Build Coastguard Worker %define tmp1d      r4
8219*c0909341SAndroid Build Coastguard Worker %define tmp1m      [esp+gprsize+0x110]
8220*c0909341SAndroid Build Coastguard Worker %define myq        r5
8221*c0909341SAndroid Build Coastguard Worker %define myd        r5
8222*c0909341SAndroid Build Coastguard Worker %define mym        r6m
; With copy_args the argument homes are private stack slots instead; this
; also overrides the mym definition just above.
8223*c0909341SAndroid Build Coastguard Worker %if copy_args
8224*c0909341SAndroid Build Coastguard Worker  %define dstm [esp+stack_size-4*1]
8225*c0909341SAndroid Build Coastguard Worker  %define dsm  [esp+stack_size-4*2]
8226*c0909341SAndroid Build Coastguard Worker  %define srcm [esp+stack_size-4*3]
8227*c0909341SAndroid Build Coastguard Worker  %define ssm  [esp+stack_size-4*4]
8228*c0909341SAndroid Build Coastguard Worker  %define mxm  [esp+stack_size-4*5]
8229*c0909341SAndroid Build Coastguard Worker  %define mym  [esp+stack_size-4*6]
8230*c0909341SAndroid Build Coastguard Worker %endif
8231*c0909341SAndroid Build Coastguard Worker%endif
; .main falls through into .main2 and so already yields the first row pair;
; jump straight to the store code and only call .main2 on later iterations.
8232*c0909341SAndroid Build Coastguard Worker    call .main
8233*c0909341SAndroid Build Coastguard Worker    jmp .start
8234*c0909341SAndroid Build Coastguard Worker.loop:
8235*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; r0/r1 are about to be reused for alpha/beta: spill dst first.
8236*c0909341SAndroid Build Coastguard Worker    mov                dstm, dstd
8237*c0909341SAndroid Build Coastguard Worker    mov              alphad, [esp+0x100]
8238*c0909341SAndroid Build Coastguard Worker    mov               betad, [esp+0x104]
8239*c0909341SAndroid Build Coastguard Worker%endif
8240*c0909341SAndroid Build Coastguard Worker    call .main2
8241*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
8242*c0909341SAndroid Build Coastguard Worker.start:
; The non-SSE4 path rounds with pmulhrsw and therefore needs pw_8192 as m10:
; a register on x86-64, a memory operand on x86-32.
8243*c0909341SAndroid Build Coastguard Worker%if notcpuflag(sse4)
8244*c0909341SAndroid Build Coastguard Worker  %define roundval pw_8192
8245*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
8246*c0909341SAndroid Build Coastguard Worker    mova                m10, [PIC_sym(roundval)]
8247*c0909341SAndroid Build Coastguard Worker %else
8248*c0909341SAndroid Build Coastguard Worker  %define m10 [PIC_sym(roundval)]
8249*c0909341SAndroid Build Coastguard Worker %endif
8250*c0909341SAndroid Build Coastguard Worker%endif
; First output row (two vectors of 4 dwords). On x86-32 it is read back from
; the stack slots that .main2 passes to WARP_V as its first two operands.
8251*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8252*c0909341SAndroid Build Coastguard Worker %define m12 m5
8253*c0909341SAndroid Build Coastguard Worker %define m13 m6
8254*c0909341SAndroid Build Coastguard Worker    mova                m12, [esp+0xC0]
8255*c0909341SAndroid Build Coastguard Worker    mova                m13, [esp+0xD0]
8256*c0909341SAndroid Build Coastguard Worker%endif
; SSE4 path: shift by 18, pack with unsigned saturation, then pavgw against
; the zero register m11, i.e. (x + 1) >> 1. m11 is cleared right here on
; x86-32 and in .main on x86-64.
; Other path: shift by 17, signed pack, then pmulhrsw by m10 (pw_8192).
8257*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
8258*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
8259*c0909341SAndroid Build Coastguard Worker  %define m11 m4
8260*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11
8261*c0909341SAndroid Build Coastguard Worker %endif
8262*c0909341SAndroid Build Coastguard Worker    psrad               m12, 18
8263*c0909341SAndroid Build Coastguard Worker    psrad               m13, 18
8264*c0909341SAndroid Build Coastguard Worker    packusdw            m12, m13
8265*c0909341SAndroid Build Coastguard Worker    pavgw               m12, m11 ; (x + (1 << 10)) >> 11
8266*c0909341SAndroid Build Coastguard Worker%else
8267*c0909341SAndroid Build Coastguard Worker    psrad               m12, 17
8268*c0909341SAndroid Build Coastguard Worker    psrad               m13, 17
8269*c0909341SAndroid Build Coastguard Worker    packssdw            m12, m13
8270*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m12, m10
8271*c0909341SAndroid Build Coastguard Worker%endif
; Second output row, same treatment. On x86-32 m14 reuses m6, which was
; m13's alias and is free again after the pack above.
8272*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8273*c0909341SAndroid Build Coastguard Worker %define m14 m6
8274*c0909341SAndroid Build Coastguard Worker %define m15 m7
8275*c0909341SAndroid Build Coastguard Worker    mova                m14, [esp+0xE0]
8276*c0909341SAndroid Build Coastguard Worker    mova                m15, [esp+0xF0]
8277*c0909341SAndroid Build Coastguard Worker%endif
8278*c0909341SAndroid Build Coastguard Worker%if cpuflag(sse4)
8279*c0909341SAndroid Build Coastguard Worker    psrad               m14, 18
8280*c0909341SAndroid Build Coastguard Worker    psrad               m15, 18
8281*c0909341SAndroid Build Coastguard Worker    packusdw            m14, m15
8282*c0909341SAndroid Build Coastguard Worker    pavgw               m14, m11 ; (x + (1 << 10)) >> 11
8283*c0909341SAndroid Build Coastguard Worker%else
8284*c0909341SAndroid Build Coastguard Worker    psrad               m14, 17
8285*c0909341SAndroid Build Coastguard Worker    psrad               m15, 17
8286*c0909341SAndroid Build Coastguard Worker    packssdw            m14, m15
8287*c0909341SAndroid Build Coastguard Worker    pmulhrsw            m14, m10
8288*c0909341SAndroid Build Coastguard Worker%endif
; Pack both rows to bytes with unsigned saturation and store 8 pixels to
; each of two consecutive destination rows.
8289*c0909341SAndroid Build Coastguard Worker    packuswb            m12, m14
8290*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m12
8291*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m12
8292*c0909341SAndroid Build Coastguard Worker    dec            counterd
8293*c0909341SAndroid Build Coastguard Worker    jg .loop
; .end is also the exit target of warp_affine_8x8t_8bpc.
8294*c0909341SAndroid Build Coastguard Worker.end:
8295*c0909341SAndroid Build Coastguard Worker    RET
; .main:  one-time setup, then horizontal filtering (.h) of the first seven
;         source rows; it has no ret and falls through into .main2.
; .main2: filters two more source rows and invokes WARP_V twice, yielding two
;         output rows (m12/m13 and m14/m15 on x86-64; stack slots 0xC0-0xF0
;         on x86-32), then moves the saved row pairs down by two rows so the
;         next call continues where this one stopped.
;
; Row-pair bookkeeping (my reading of the code; BLENDHWDW and WARP_V are
; defined outside this chunk - confirm against their definitions):
; .h returns a row in m0/m1 with the value in the upper word of each dword.
; psrld by 16 moves an older row into the lower words, and BLENDHWDW with the
; 0xFFFF0000 mask then merges in a newer row, so a register presumably holds
; row n in the low words and row n+2 in the high words. Under that reading
; the slots relative to [rsp+gprsize] hold, when .main2 is entered:
;   0x00/0x10 rows 0/2    0x20/0x30 rows 1/3
;   0x40/0x50 rows 2/4    0x60/0x70 rows 3/5
;   rows 4/6 in m4/m5 (x86-64) or 0x80/0x90 (x86-32)
; All addresses carry +gprsize because the caller's return address is on top
; of the stack while this code runs.
8296*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8297*c0909341SAndroid Build Coastguard Worker.main:
; Assemble-time bookkeeping only (no instructions emitted): account for the
; return address pushed by the call into .main.
8298*c0909341SAndroid Build Coastguard Worker%assign stack_offset stack_offset+gprsize
8299*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8300*c0909341SAndroid Build Coastguard Worker %assign stack_size stack_size+4
8301*c0909341SAndroid Build Coastguard Worker %if copy_args
8302*c0909341SAndroid Build Coastguard Worker  %assign stack_offset stack_offset-4
8303*c0909341SAndroid Build Coastguard Worker %endif
; RELOC_ARGS is defined outside this chunk. PIC_reg is saved to PIC_mem before
; src is loaded and restored after src is stored back below - presumably
; because the two share a register on x86-32 (TODO confirm).
8304*c0909341SAndroid Build Coastguard Worker    RELOC_ARGS
8305*c0909341SAndroid Build Coastguard Worker    LEA             PIC_reg, $$
8306*c0909341SAndroid Build Coastguard Worker %define PIC_mem [esp+gprsize+0x114]
8307*c0909341SAndroid Build Coastguard Worker    mov               abcdd, abcdm
8308*c0909341SAndroid Build Coastguard Worker %if copy_args == 0
8309*c0909341SAndroid Build Coastguard Worker    mov                 ssd, ssm
8310*c0909341SAndroid Build Coastguard Worker    mov                 mxd, mxm
8311*c0909341SAndroid Build Coastguard Worker %endif
8312*c0909341SAndroid Build Coastguard Worker    mov             PIC_mem, PIC_reg
8313*c0909341SAndroid Build Coastguard Worker    mov                srcd, srcm
8314*c0909341SAndroid Build Coastguard Worker%endif
; abcd points at four signed 16-bit values: [0]=alpha [1]=beta [2]=delta
; [3]=gamma.
8315*c0909341SAndroid Build Coastguard Worker    movsx            deltad, word [abcdq+2*2]
8316*c0909341SAndroid Build Coastguard Worker    movsx            gammad, word [abcdq+2*3]
8317*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [deltaq*3]
8318*c0909341SAndroid Build Coastguard Worker    sub              gammad, tmp1d    ; gamma -= delta*3
8319*c0909341SAndroid Build Coastguard Worker    SAVE_DELTA_GAMMA
8320*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; tmp1 is r4 on x86-32, the same register as abcd, so abcd must be reloaded.
8321*c0909341SAndroid Build Coastguard Worker    mov               abcdd, abcdm
8322*c0909341SAndroid Build Coastguard Worker%endif
8323*c0909341SAndroid Build Coastguard Worker    movsx            alphad, word [abcdq+2*0]
8324*c0909341SAndroid Build Coastguard Worker    movsx             betad, word [abcdq+2*1]
8325*c0909341SAndroid Build Coastguard Worker    lea               tmp1q, [ssq*3+3]
; Bias mx once so that the plain "shr 10" in .h becomes a rounded shift
; (the +512) that also adds 64 to the resulting filter index (the 64<<10).
8326*c0909341SAndroid Build Coastguard Worker    add                 mxd, 512+(64<<10)
8327*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [alphaq*3]
8328*c0909341SAndroid Build Coastguard Worker    sub                srcq, tmp1q    ; src -= src_stride*3 + 3
8329*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8330*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcd
8331*c0909341SAndroid Build Coastguard Worker    mov             PIC_reg, PIC_mem
8332*c0909341SAndroid Build Coastguard Worker%endif
; .h leaves mx at mx + alpha*3 before adding beta, hence the pre-subtraction.
8333*c0909341SAndroid Build Coastguard Worker    sub               betad, tmp2d    ; beta -= alpha*3
8334*c0909341SAndroid Build Coastguard Worker    lea             filterq, [PIC_sym(mc_warp_filter2)]
8335*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
; my is read from r6m; m11 is kept as an all-zero register.
8336*c0909341SAndroid Build Coastguard Worker    mov                 myd, r6m
8337*c0909341SAndroid Build Coastguard Worker    pxor                m11, m11
8338*c0909341SAndroid Build Coastguard Worker%endif
; Source row 0 -> low words of m2/m3.
8339*c0909341SAndroid Build Coastguard Worker    call .h
8340*c0909341SAndroid Build Coastguard Worker    psrld                m2, m0, 16
8341*c0909341SAndroid Build Coastguard Worker    psrld                m3, m1, 16
8342*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; On x86-32 .h uses m3-m7 as scratch (its m8/m9/m10/m14/m15 aliases), so any
; of those that must survive a call is parked on the stack.
8343*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x10], m3
8344*c0909341SAndroid Build Coastguard Worker%endif
; Source row 1 -> low words of m4/m5.
8345*c0909341SAndroid Build Coastguard Worker    call .h
8346*c0909341SAndroid Build Coastguard Worker    psrld                m4, m0, 16
8347*c0909341SAndroid Build Coastguard Worker    psrld                m5, m1, 16
8348*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8349*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x20], m4
8350*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x30], m5
8351*c0909341SAndroid Build Coastguard Worker%endif
; Source row 2: merge with row 0 and save at 0x00/0x10.
8352*c0909341SAndroid Build Coastguard Worker    call .h
8353*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8354*c0909341SAndroid Build Coastguard Worker %define blendmask [rsp+gprsize+0x80]
8355*c0909341SAndroid Build Coastguard Worker%else
8356*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+gprsize+0x10]
8357*c0909341SAndroid Build Coastguard Worker %define blendmask [esp+gprsize+0x120]
8358*c0909341SAndroid Build Coastguard Worker %define m10 m7
8359*c0909341SAndroid Build Coastguard Worker%endif
; Build the per-dword mask 0xFFFF0000 and keep a copy in memory; m10 is
; overwritten by every .h call, so it is reloaded from blendmask afterwards.
8360*c0909341SAndroid Build Coastguard Worker    pcmpeqd             m10, m10
8361*c0909341SAndroid Build Coastguard Worker    pslld               m10, 16
8362*c0909341SAndroid Build Coastguard Worker    mova          blendmask, m10
8363*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m2, m0 ; 0
8364*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m3, m1 ; 2
8365*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x00], m2
8366*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x10], m3
; Source row 3: merge with row 1 and save at 0x20/0x30.
8367*c0909341SAndroid Build Coastguard Worker    call .h
8368*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8369*c0909341SAndroid Build Coastguard Worker    mova                 m4, [esp+gprsize+0x20]
8370*c0909341SAndroid Build Coastguard Worker    mova                 m5, [esp+gprsize+0x30]
8371*c0909341SAndroid Build Coastguard Worker%endif
8372*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8373*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m4, m0 ; 1
8374*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m5, m1 ; 3
8375*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x20], m4
8376*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x30], m5
; Source row 4: shift the 0x00/0x10 pair down, merge and save at 0x40/0x50.
8377*c0909341SAndroid Build Coastguard Worker    call .h
8378*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8379*c0909341SAndroid Build Coastguard Worker    mova                 m3, [esp+gprsize+0x10]
8380*c0909341SAndroid Build Coastguard Worker %define m10 m5
8381*c0909341SAndroid Build Coastguard Worker%endif
8382*c0909341SAndroid Build Coastguard Worker    psrld                m6, m2, 16
8383*c0909341SAndroid Build Coastguard Worker    psrld                m7, m3, 16
8384*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8385*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m6, m0 ; 2
8386*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m7, m1 ; 4
8387*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x40], m6
8388*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x50], m7
; Source row 5: shift the 0x20/0x30 pair down, merge and save at 0x60/0x70.
8389*c0909341SAndroid Build Coastguard Worker    call .h
8390*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8391*c0909341SAndroid Build Coastguard Worker    mova                m4, [esp+gprsize+0x20]
8392*c0909341SAndroid Build Coastguard Worker    mova                m5, [esp+gprsize+0x30]
8393*c0909341SAndroid Build Coastguard Worker%endif
8394*c0909341SAndroid Build Coastguard Worker    psrld               m2, m4, 16
8395*c0909341SAndroid Build Coastguard Worker    psrld               m3, m5, 16
8396*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8397*c0909341SAndroid Build Coastguard Worker    BLENDHWDW           m2, m0 ; 3
8398*c0909341SAndroid Build Coastguard Worker    BLENDHWDW           m3, m1 ; 5
8399*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x60], m2
8400*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x70], m3
; Source row 6: shift the 0x40/0x50 pair down and merge into m4/m5.
8401*c0909341SAndroid Build Coastguard Worker    call .h
8402*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8403*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+gprsize+0x40]
8404*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+gprsize+0x50]
8405*c0909341SAndroid Build Coastguard Worker %define m10 m7
8406*c0909341SAndroid Build Coastguard Worker%endif
8407*c0909341SAndroid Build Coastguard Worker    psrld                m4, m6, 16
8408*c0909341SAndroid Build Coastguard Worker    psrld                m5, m7, 16
8409*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8410*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m4, m0 ; 4
8411*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m5, m1 ; 6
; Bias my exactly as mx was biased above. x86-64 keeps the 0x60/0x70 pair in
; m6/m7 as well; x86-32 parks m4/m5 at 0x80/0x90 and updates my in memory.
8412*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8413*c0909341SAndroid Build Coastguard Worker    add                 myd, 512+(64<<10)
8414*c0909341SAndroid Build Coastguard Worker    mova                 m6, m2
8415*c0909341SAndroid Build Coastguard Worker    mova                 m7, m3
8416*c0909341SAndroid Build Coastguard Worker%else
8417*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x80], m4
8418*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x90], m5
8419*c0909341SAndroid Build Coastguard Worker    add           dword mym, 512+(64<<10)
8420*c0909341SAndroid Build Coastguard Worker%endif
; Four loop iterations follow, two output rows each. SAVE_ALPHA_BETA is
; defined outside this chunk.
8421*c0909341SAndroid Build Coastguard Worker    mov            counterd, 4
8422*c0909341SAndroid Build Coastguard Worker    SAVE_ALPHA_BETA
8423*c0909341SAndroid Build Coastguard Worker.main2:
; New source row: shift the 0x60/0x70 pair (in m6/m7) down and merge.
8424*c0909341SAndroid Build Coastguard Worker    call .h
8425*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8426*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+gprsize+0x60]
8427*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+gprsize+0x70]
8428*c0909341SAndroid Build Coastguard Worker %define m10 m5
8429*c0909341SAndroid Build Coastguard Worker%endif
8430*c0909341SAndroid Build Coastguard Worker    psrld                m6, 16
8431*c0909341SAndroid Build Coastguard Worker    psrld                m7, 16
8432*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8433*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m6, m0 ; 5
8434*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m7, m1 ; 7
; First WARP_V: its first two operands receive the output row; the remaining
; eight are the four row pairs it consumes. On x86-32 every operand is a
; stack slot and the GPR state is swapped around the macro by the
; LOAD_* helpers (defined outside this chunk).
8435*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8436*c0909341SAndroid Build Coastguard Worker    WARP_V              m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
8437*c0909341SAndroid Build Coastguard Worker                                  m4, m5, \
8438*c0909341SAndroid Build Coastguard Worker                                  [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
8439*c0909341SAndroid Build Coastguard Worker                                  m6, m7
8440*c0909341SAndroid Build Coastguard Worker%else
8441*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0xA0], m6
8442*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0xB0], m7
8443*c0909341SAndroid Build Coastguard Worker    LOAD_DELTA_GAMMA_MY
8444*c0909341SAndroid Build Coastguard Worker    WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
8445*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x00], [esp+gprsize+0x10], \
8446*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x80], [esp+gprsize+0x90], \
8447*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x20], [esp+gprsize+0x30], \
8448*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0xA0], [esp+gprsize+0xB0]
8449*c0909341SAndroid Build Coastguard Worker    LOAD_ALPHA_BETA_MX
8450*c0909341SAndroid Build Coastguard Worker%endif
; Next source row. Rotate while merging: the 0x40/0x50 pair moves to
; 0x00/0x10, m4/m5 moves to 0x40/0x50, and m4/m5 is shifted down and merged
; with the new row.
8451*c0909341SAndroid Build Coastguard Worker    call .h
8452*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+0x40]
8453*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+0x50]
8454*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8455*c0909341SAndroid Build Coastguard Worker    mova                 m4, [rsp+gprsize+0x80]
8456*c0909341SAndroid Build Coastguard Worker    mova                 m5, [rsp+gprsize+0x90]
8457*c0909341SAndroid Build Coastguard Worker %define m10 m7
8458*c0909341SAndroid Build Coastguard Worker%endif
8459*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x00], m2
8460*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x10], m3
8461*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x40], m4
8462*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x50], m5
8463*c0909341SAndroid Build Coastguard Worker    psrld                m4, 16
8464*c0909341SAndroid Build Coastguard Worker    psrld                m5, 16
8465*c0909341SAndroid Build Coastguard Worker    mova                m10, blendmask
8466*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m4, m0 ; 6
8467*c0909341SAndroid Build Coastguard Worker    BLENDHWDW            m5, m1 ; 8
; Second WARP_V: same operand pattern, one source row further down.
8468*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8469*c0909341SAndroid Build Coastguard Worker    WARP_V              m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
8470*c0909341SAndroid Build Coastguard Worker                                  m6, m7, \
8471*c0909341SAndroid Build Coastguard Worker                                  [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
8472*c0909341SAndroid Build Coastguard Worker                                  m4, m5
8473*c0909341SAndroid Build Coastguard Worker%else
8474*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x80], m4
8475*c0909341SAndroid Build Coastguard Worker    mova [esp+gprsize+0x90], m5
8476*c0909341SAndroid Build Coastguard Worker    LOAD_DELTA_GAMMA_MY
8477*c0909341SAndroid Build Coastguard Worker    WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
8478*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x20], [esp+gprsize+0x30], \
8479*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0xA0], [esp+gprsize+0xB0], \
8480*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x00], [esp+gprsize+0x10], \
8481*c0909341SAndroid Build Coastguard Worker           [esp+gprsize+0x80], [esp+gprsize+0x90]
; x86-32: store my back and reload dst/ds/mx for the caller's store code
; and the next .h call.
8482*c0909341SAndroid Build Coastguard Worker    mov                 mym, myd
8483*c0909341SAndroid Build Coastguard Worker    mov                dstd, dstm
8484*c0909341SAndroid Build Coastguard Worker    mov                 dsd, dsm
8485*c0909341SAndroid Build Coastguard Worker    mov                 mxd, mxm
8486*c0909341SAndroid Build Coastguard Worker%endif
; Finish the rotation: the 0x60/0x70 pair moves to 0x20/0x30 and m6/m7
; becomes the new 0x60/0x70 pair.
8487*c0909341SAndroid Build Coastguard Worker    mova                 m2, [rsp+gprsize+0x60]
8488*c0909341SAndroid Build Coastguard Worker    mova                 m3, [rsp+gprsize+0x70]
8489*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8490*c0909341SAndroid Build Coastguard Worker    mova                 m6, [esp+gprsize+0xA0]
8491*c0909341SAndroid Build Coastguard Worker    mova                 m7, [esp+gprsize+0xB0]
8492*c0909341SAndroid Build Coastguard Worker%endif
8493*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x20], m2
8494*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x30], m3
8495*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x60], m6
8496*c0909341SAndroid Build Coastguard Worker    mova [rsp+gprsize+0x70], m7
8497*c0909341SAndroid Build Coastguard Worker    ret
; .h: horizontal filter for one source row.
; In:   src (register on x86-64, srcm on x86-32), ss, mxd (already biased in
;       .main), alphaq, betaq (already reduced by alpha*3), filterq.
; Out:  m0 = results for filters 0-3, m1 = results for filters 4-7, one dword
;       each, with the value in the upper 16 bits (see the closing comments).
;       src is advanced by one row (ss) and mx ends up as mx + original beta.
; Uses: tmp1, tmp2, m8, m9, m10, m14, m15 as scratch; on x86-32 those vector
;       names are aliases of m3-m7.
; Filter k is the 8-byte entry at filterq + ((mx + k*alpha) >> 10) * 8.
8498*c0909341SAndroid Build Coastguard WorkerALIGN function_align
8499*c0909341SAndroid Build Coastguard Worker.h:
8500*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8501*c0909341SAndroid Build Coastguard Worker %define m8  m3
8502*c0909341SAndroid Build Coastguard Worker %define m9  m4
8503*c0909341SAndroid Build Coastguard Worker %define m10 m5
8504*c0909341SAndroid Build Coastguard Worker %define m14 m6
8505*c0909341SAndroid Build Coastguard Worker %define m15 m7
8506*c0909341SAndroid Build Coastguard Worker%endif
; tmp1 = mx + 4*alpha (filter 4), tmp2 = mx + alpha (filter 1).
8507*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]
8508*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]
8509*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
; Assemble-time bookkeeping for the extra return address of this nested call
; (hence gprsize*2 in PIC_mem). PIC_reg is saved while src is loaded and
; restored once src has been stored back.
8510*c0909341SAndroid Build Coastguard Worker %assign stack_offset stack_offset+4
8511*c0909341SAndroid Build Coastguard Worker %assign stack_size stack_size+4
8512*c0909341SAndroid Build Coastguard Worker %define PIC_mem [esp+gprsize*2+0x114]
8513*c0909341SAndroid Build Coastguard Worker    mov             PIC_mem, PIC_reg
8514*c0909341SAndroid Build Coastguard Worker    mov                srcd, srcm
8515*c0909341SAndroid Build Coastguard Worker%endif
; Unaligned 16-byte load of the source row, then step src to the next row.
8516*c0909341SAndroid Build Coastguard Worker    movu                m10, [srcq]
8517*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
8518*c0909341SAndroid Build Coastguard Worker    add                srcd, ssm
8519*c0909341SAndroid Build Coastguard Worker    mov                srcm, srcd
8520*c0909341SAndroid Build Coastguard Worker    mov             PIC_reg, PIC_mem
8521*c0909341SAndroid Build Coastguard Worker%else
8522*c0909341SAndroid Build Coastguard Worker    add                srcq, ssq
8523*c0909341SAndroid Build Coastguard Worker%endif
; Gather the eight filters two at a time. mx and tmp2 alternate as the running
; position, while tmp1 always sits 4*alpha ahead of the one being used.
8524*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8525*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8526*c0909341SAndroid Build Coastguard Worker    movq                 m1, [filterq+mxq  *8]  ; 0 X
8527*c0909341SAndroid Build Coastguard Worker    movq                 m8, [filterq+tmp1q*8]  ; 4 X
8528*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]
8529*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+alphaq*1]
8530*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
8531*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8532*c0909341SAndroid Build Coastguard Worker    movhps               m1, [filterq+tmp2q*8]  ; 0 1
8533*c0909341SAndroid Build Coastguard Worker    movhps               m8, [filterq+tmp1q*8]  ; 4 5
8534*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [mxq+alphaq*4]
8535*c0909341SAndroid Build Coastguard Worker    lea               tmp2d, [mxq+alphaq*1]
8536*c0909341SAndroid Build Coastguard Worker    shr                 mxd, 10
8537*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8538*c0909341SAndroid Build Coastguard Worker    movq                m14, [filterq+mxq  *8]  ; 2 X
8539*c0909341SAndroid Build Coastguard Worker    movq                 m9, [filterq+tmp1q*8]  ; 6 X
8540*c0909341SAndroid Build Coastguard Worker    lea               tmp1d, [tmp2q+alphaq*4]
; tmp2 is mx + 3*alpha at this point; adding the pre-reduced beta leaves mx at
; its entry value plus the original beta, ready for the next row.
8541*c0909341SAndroid Build Coastguard Worker    lea                 mxd, [tmp2q+betaq]  ; mx += beta
8542*c0909341SAndroid Build Coastguard Worker    shr               tmp2d, 10
8543*c0909341SAndroid Build Coastguard Worker    shr               tmp1d, 10
8544*c0909341SAndroid Build Coastguard Worker    movhps              m14, [filterq+tmp2q*8]  ; 2 3
8545*c0909341SAndroid Build Coastguard Worker    movhps               m9, [filterq+tmp1q*8]  ; 6 7
; Arrange the source bytes with the warp_8x8_shufA-D tables (defined outside
; this chunk), multiply-accumulate against the matching filter pair, then
; reduce with horizontal adds.
8546*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m10, [PIC_sym(warp_8x8_shufA)]
8547*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m1
8548*c0909341SAndroid Build Coastguard Worker    pshufb               m1, m10, [PIC_sym(warp_8x8_shufB)]
8549*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m8
8550*c0909341SAndroid Build Coastguard Worker    pshufb              m15, m10, [PIC_sym(warp_8x8_shufC)]
8551*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m15, m14
8552*c0909341SAndroid Build Coastguard Worker    pshufb              m10, m10, [PIC_sym(warp_8x8_shufD)]
8553*c0909341SAndroid Build Coastguard Worker    pmaddubsw           m10, m9
8554*c0909341SAndroid Build Coastguard Worker    phaddw               m0, m15
8555*c0909341SAndroid Build Coastguard Worker    phaddw               m1, m10
; The final pair-sum is done by pmaddwd against pw_8192, which widens to
; dwords and scales in the same step; pd_32768 then supplies the rounding.
8556*c0909341SAndroid Build Coastguard Worker    mova                m14, [PIC_sym(pw_8192)]
8557*c0909341SAndroid Build Coastguard Worker    mova                 m9, [PIC_sym(pd_32768)]
8558*c0909341SAndroid Build Coastguard Worker    pmaddwd              m0, m14 ; 17-bit intermediate, upshifted by 13
8559*c0909341SAndroid Build Coastguard Worker    pmaddwd              m1, m14
8560*c0909341SAndroid Build Coastguard Worker    paddd                m0, m9  ; rounded 14-bit result in upper 16 bits of dword
8561*c0909341SAndroid Build Coastguard Worker    paddd                m1, m9
8562*c0909341SAndroid Build Coastguard Worker    ret
8563*c0909341SAndroid Build Coastguard Worker%endmacro
8564*c0909341SAndroid Build Coastguard Worker
; Pick the register numbers handed to DECLARE_REG_TMP: 6 and 4 when
; assembling for WIN64, 6 and 7 otherwise.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; creates the t0/t1 temporary aliases used by the code that follows - confirm.
8565*c0909341SAndroid Build Coastguard Worker%if WIN64
8566*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 4
8567*c0909341SAndroid Build Coastguard Worker%else
8568*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 6, 7
8569*c0909341SAndroid Build Coastguard Worker%endif
8570*c0909341SAndroid Build Coastguard Worker
; BIDIR_FN op: shared row-store loops for the two-source compound functions.
;   op         - name of a macro that takes a source offset and leaves 16
;                output bytes in m0 (AVG, W_AVG and MASK are used below)
;   op_INC_PTR - companion macro that advances the source pointers
; On entry wq must hold the address of the width-specific label (.w4 ... .w128)
; and hd the row count. The first op result is computed before the jump; each
; loop label then advances the sources, recomputes m0 and steps dstq before
; falling into the store code. Every width variant ends with its own RET.
8571*c0909341SAndroid Build Coastguard Worker%macro BIDIR_FN 1 ; op
8572*c0909341SAndroid Build Coastguard Worker    %1                    0
8573*c0909341SAndroid Build Coastguard Worker    lea            stride3q, [strideq*3] ; stride*3, used for the 4th row in .w4
8574*c0909341SAndroid Build Coastguard Worker    jmp                  wq ; dispatch to the width-specific store code
8575*c0909341SAndroid Build Coastguard Worker.w4_loop:
8576*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
8577*c0909341SAndroid Build Coastguard Worker    %1                    0
8578*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*4] ; next group of 4 rows
8579*c0909341SAndroid Build Coastguard Worker.w4: ; width 4: m0 holds 4 rows of 4 bytes
8580*c0909341SAndroid Build Coastguard Worker    movd   [dstq          ], m0      ; row 0 = dw[0]
8581*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032 ; swap dw[1] and dw[0]
8582*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1      ; row 1 = dw[1]
8583*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0      ; duplicate dw[3,2] into the low qword
8584*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*2], m0      ; row 2 = dw[2]
8585*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32      ; move dw[3] down to the low dword
8586*c0909341SAndroid Build Coastguard Worker    movd   [dstq+stride3q ], m0      ; row 3 = dw[3]
8587*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
8588*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
8589*c0909341SAndroid Build Coastguard Worker    RET
8590*c0909341SAndroid Build Coastguard Worker.w8_loop:
8591*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
8592*c0909341SAndroid Build Coastguard Worker    %1                    0
8593*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2] ; next pair of rows
8594*c0909341SAndroid Build Coastguard Worker.w8: ; width 8: m0 holds 2 rows of 8 bytes
8595*c0909341SAndroid Build Coastguard Worker    movq   [dstq          ], m0 ; row 0 = low 8 bytes
8596*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0 ; row 1 = high 8 bytes
8597*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8598*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
8599*c0909341SAndroid Build Coastguard Worker    RET
8600*c0909341SAndroid Build Coastguard Worker.w16_loop:
8601*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            2
8602*c0909341SAndroid Build Coastguard Worker    %1                    0
8603*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq]
8604*c0909341SAndroid Build Coastguard Worker.w16: ; width 16: m0 is one full row
8605*c0909341SAndroid Build Coastguard Worker    mova   [dstq          ], m0
8606*c0909341SAndroid Build Coastguard Worker    dec                  hd
8607*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
8608*c0909341SAndroid Build Coastguard Worker    RET
8609*c0909341SAndroid Build Coastguard Worker.w32_loop:
8610*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            4
8611*c0909341SAndroid Build Coastguard Worker    %1                    0
8612*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq]
8613*c0909341SAndroid Build Coastguard Worker.w32: ; width 32: one row = two op results (source offsets 0 and 2)
8614*c0909341SAndroid Build Coastguard Worker    mova   [dstq          ], m0
8615*c0909341SAndroid Build Coastguard Worker    %1                    2
8616*c0909341SAndroid Build Coastguard Worker    mova   [dstq + 16     ], m0
8617*c0909341SAndroid Build Coastguard Worker    dec                  hd
8618*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
8619*c0909341SAndroid Build Coastguard Worker    RET
8620*c0909341SAndroid Build Coastguard Worker.w64_loop:
8621*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            8
8622*c0909341SAndroid Build Coastguard Worker    %1                    0
8623*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8624*c0909341SAndroid Build Coastguard Worker.w64: ; width 64: one row = four op results (source offsets 0, 2, 4, 6)
8625*c0909341SAndroid Build Coastguard Worker    %assign i 0
8626*c0909341SAndroid Build Coastguard Worker    %rep 4
8627*c0909341SAndroid Build Coastguard Worker    mova   [dstq + i*16   ], m0
8628*c0909341SAndroid Build Coastguard Worker    %assign i i+1
    ; the result for offset 0 was already computed before entering .w64, so
    ; only the three remaining columns are computed inside the unrolled body
8629*c0909341SAndroid Build Coastguard Worker    %if i < 4
8630*c0909341SAndroid Build Coastguard Worker    %1                    2*i
8631*c0909341SAndroid Build Coastguard Worker    %endif
8632*c0909341SAndroid Build Coastguard Worker    %endrep
8633*c0909341SAndroid Build Coastguard Worker    dec                  hd
8634*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
8635*c0909341SAndroid Build Coastguard Worker    RET
8636*c0909341SAndroid Build Coastguard Worker.w128_loop:
8637*c0909341SAndroid Build Coastguard Worker    %1_INC_PTR            16
8638*c0909341SAndroid Build Coastguard Worker    %1                    0
8639*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
8640*c0909341SAndroid Build Coastguard Worker.w128: ; width 128: one row = eight op results (source offsets 0, 2, ... 14)
8641*c0909341SAndroid Build Coastguard Worker    %assign i 0
8642*c0909341SAndroid Build Coastguard Worker    %rep 8
8643*c0909341SAndroid Build Coastguard Worker    mova   [dstq + i*16   ], m0
8644*c0909341SAndroid Build Coastguard Worker    %assign i i+1
8645*c0909341SAndroid Build Coastguard Worker    %if i < 8
8646*c0909341SAndroid Build Coastguard Worker    %1                    2*i
8647*c0909341SAndroid Build Coastguard Worker    %endif
8648*c0909341SAndroid Build Coastguard Worker    %endrep
8649*c0909341SAndroid Build Coastguard Worker    dec                  hd
8650*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
8651*c0909341SAndroid Build Coastguard Worker    RET
8652*c0909341SAndroid Build Coastguard Worker%endmacro
8653*c0909341SAndroid Build Coastguard Worker
; AVG src_offset: average 16 word coefficients from tmp1 and tmp2 into 16
; output bytes in m0. src_offset is counted in units of mmsize bytes.
; Expects m2 to hold the pmulhrsw multiplier (avg_8bpc loads pw_1024 into it).
; Clobbers m1.
8654*c0909341SAndroid Build Coastguard Worker%macro AVG 1 ; src_offset
8655*c0909341SAndroid Build Coastguard Worker    ; sums the 16-bit tmp1 and tmp2 coefficients, rounds, and packs to 8-bit pixels
8656*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q+(%1+0)*mmsize] ; load 8 coefficients (2 bytes each) from tmp1
8657*c0909341SAndroid Build Coastguard Worker    paddw                m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 coefficients from tmp2
8658*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q+(%1+1)*mmsize] ; same for the next 8 coefficients
8659*c0909341SAndroid Build Coastguard Worker    paddw                m1, [tmp2q+(%1+1)*mmsize]
8660*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m2 ; rounding multiply by m2 (scales the sum down)
8661*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m2
8662*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; pack words from m0 & m1 to bytes with unsigned saturation
8663*c0909341SAndroid Build Coastguard Worker%endmacro
8664*c0909341SAndroid Build Coastguard Worker
; AVG_INC_PTR n: advance both intermediate-buffer pointers by n*mmsize bytes.
8665*c0909341SAndroid Build Coastguard Worker%macro AVG_INC_PTR 1
8666*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*mmsize
8667*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*mmsize
8668*c0909341SAndroid Build Coastguard Worker%endmacro
8669*c0909341SAndroid Build Coastguard Worker
; avg_8bpc(dst, stride, tmp1, tmp2, w, h)
; Stores the rounded average of the tmp1 and tmp2 word buffers to dst as bytes
; (see AVG) using the BIDIR_FN store loops. The loop for the block width is
; selected through a jump table indexed by tzcnt(w); stride3 is scratch.
; NOTE(review): the numeric cglobal fields (4, 7, 3) presumably follow the
; x86inc convention of loaded args / GPRs / vector registers - confirm.
8670*c0909341SAndroid Build Coastguard Workercglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
8671*c0909341SAndroid Build Coastguard Worker    LEA                  r6, avg_ssse3_table
8672*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing zero count of w, used as the table index
8673*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm ; move h(stack) to h(register) if not already that register
8674*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4] ; sign-extend the 32-bit table entry for this width into wq
8675*c0909341SAndroid Build Coastguard Worker    mova                 m2, [pw_1024+r6-avg_ssse3_table] ; m2 = pw_1024, the pmulhrsw multiplier used by AVG
8676*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; add the table base to the entry: wq = jump target for BIDIR_FN
8677*c0909341SAndroid Build Coastguard Worker    BIDIR_FN            AVG
8678*c0909341SAndroid Build Coastguard Worker
; W_AVG src_offset: weighted average of 16 word coefficients from tmp1 (a) and
; tmp2 (b) into 16 output bytes in m0. src_offset is in units of mmsize bytes.
; Expects m4 = per-word weight multiplier and m5 = pmulhrsw rounding
; multiplier, both set up by w_avg_8bpc. Clobbers m1, m2, m3.
8679*c0909341SAndroid Build Coastguard Worker%macro W_AVG 1 ; src_offset
8680*c0909341SAndroid Build Coastguard Worker    ; (a * weight + b * (16 - weight) + 128) >> 8
8681*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * weight + (b << 4) + 128) >> 8
8682*c0909341SAndroid Build Coastguard Worker    ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
8683*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-weight     << 12)) >> 16) + b + 8) >> 4
    ; (w_avg_8bpc swaps tmp1q/tmp2q and negates m4 to select the last form)
8684*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmp1q+(%1+0)*mmsize] ; a, first 8 coefficients
8685*c0909341SAndroid Build Coastguard Worker    mova                 m0, m2                    ; keep a copy of a for the final add
8686*c0909341SAndroid Build Coastguard Worker    psubw                m2, [tmp2q+(%1+0)*mmsize] ; a - b
8687*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+(%1+1)*mmsize] ; a, next 8 coefficients
8688*c0909341SAndroid Build Coastguard Worker    mova                 m1, m3
8689*c0909341SAndroid Build Coastguard Worker    psubw                m3, [tmp2q+(%1+1)*mmsize] ; a - b
8690*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m4 ; ((a - b) * m4) >> 16 (signed high half)
8691*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m4
8692*c0909341SAndroid Build Coastguard Worker    paddw                m0, m2 ; + a
8693*c0909341SAndroid Build Coastguard Worker    paddw                m1, m3
8694*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5 ; rounding multiply by m5: the (+ 8) >> 4 step above
8695*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
8696*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; pack words to bytes with unsigned saturation
8697*c0909341SAndroid Build Coastguard Worker%endmacro
8698*c0909341SAndroid Build Coastguard Worker
; W_AVG reads the same two buffers as AVG, so BIDIR_FN can reuse AVG_INC_PTR.
8699*c0909341SAndroid Build Coastguard Worker%define W_AVG_INC_PTR AVG_INC_PTR
8700*c0909341SAndroid Build Coastguard Worker
; w_avg_8bpc(dst, stride, tmp1, tmp2, w, h, weight)
; Weighted average of the tmp1 and tmp2 word buffers stored to dst as bytes
; (see W_AVG), using the BIDIR_FN store loops. weight is read from the seventh
; argument slot (r6m). For weight <= 7 the two source pointers are swapped and
; the multiplier negated, which selects the second form of the W_AVG formula.
8701*c0909341SAndroid Build Coastguard Workercglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
8702*c0909341SAndroid Build Coastguard Worker    LEA                  r6, w_avg_ssse3_table
8703*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing zero count of w, used as the table index
8704*c0909341SAndroid Build Coastguard Worker    movd                 m4, r6m ; weight argument
8705*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
8706*c0909341SAndroid Build Coastguard Worker    pxor                 m0, m0 ; zero: pshufb control and minuend for the negation below
8707*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4] ; table entry for this width
8708*c0909341SAndroid Build Coastguard Worker    mova                 m5, [pw_2048+r6-w_avg_ssse3_table] ; pmulhrsw rounding multiplier for W_AVG
8709*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m0 ; broadcast byte 0 of weight to every byte
8710*c0909341SAndroid Build Coastguard Worker    psllw                m4, 12 ; (weight-16) << 12 when interpreted as signed
8711*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; wq = jump target for BIDIR_FN
8712*c0909341SAndroid Build Coastguard Worker    cmp           dword r6m, 7
8713*c0909341SAndroid Build Coastguard Worker    jg .weight_gt7 ; weight > 7: keep the operand order and m4 as is
    ; weight <= 7: swap tmp1q/tmp2q (r6 is free, its last use as table base
    ; was the add above) and replace m4 with its negation
8714*c0909341SAndroid Build Coastguard Worker    mov                  r6, tmp1q
8715*c0909341SAndroid Build Coastguard Worker    psubw                m0, m4 ; 0 - (weight << 12)
8716*c0909341SAndroid Build Coastguard Worker    mov               tmp1q, tmp2q
8717*c0909341SAndroid Build Coastguard Worker    mova                 m4, m0 ; m4 = -weight << 12
8718*c0909341SAndroid Build Coastguard Worker    mov               tmp2q, r6
8719*c0909341SAndroid Build Coastguard Worker.weight_gt7:
8720*c0909341SAndroid Build Coastguard Worker    BIDIR_FN          W_AVG
8721*c0909341SAndroid Build Coastguard Worker
; MASK src_offset: blend 16 word coefficients from tmp1 (a) and tmp2 (b) with
; 16 per-pixel mask bytes m read from maskq, leaving 16 output bytes in m0.
; src_offset is in units of mmsize bytes for tmp1/tmp2 and mmsize/2 bytes for
; the mask. Expects m4 = 0 and m5 = pmulhrsw rounding multiplier, both set up
; by mask_8bpc. Clobbers m1, m2, m3, m6.
8722*c0909341SAndroid Build Coastguard Worker%macro MASK 1 ; src_offset
8723*c0909341SAndroid Build Coastguard Worker    ; (a * m + b * (64 - m) + 512) >> 10
8724*c0909341SAndroid Build Coastguard Worker    ; = ((a - b) * m + (b << 6) + 512) >> 10
8725*c0909341SAndroid Build Coastguard Worker    ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
8726*c0909341SAndroid Build Coastguard Worker    mova                 m3,     [maskq+(%1+0)*(mmsize/2)] ; 16 mask bytes
8727*c0909341SAndroid Build Coastguard Worker    mova                 m0,     [tmp2q+(%1+0)*mmsize] ; b
8728*c0909341SAndroid Build Coastguard Worker    psubw                m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
8729*c0909341SAndroid Build Coastguard Worker    mova                 m6, m3      ; m
8730*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m6  ; -m
8731*c0909341SAndroid Build Coastguard Worker    paddw                m1, m1     ; (b - a) << 1
8732*c0909341SAndroid Build Coastguard Worker    paddb                m3, m3     ; -m << 1
8733*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m4, m3 ; low 8 masks: (-m << 1) in the high byte of each word = -m << 9
8734*c0909341SAndroid Build Coastguard Worker    pmulhw               m1, m2     ; ((b - a) * (-m << 10)) >> 16
8735*c0909341SAndroid Build Coastguard Worker    paddw                m0, m1     ; + b
8736*c0909341SAndroid Build Coastguard Worker    mova                 m1,     [tmp2q+(%1+1)*mmsize] ; b
8737*c0909341SAndroid Build Coastguard Worker    psubw                m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
8738*c0909341SAndroid Build Coastguard Worker    paddw                m2, m2  ; (b - a) << 1
8739*c0909341SAndroid Build Coastguard Worker    mova                 m6, m3  ; (-m << 1)
8740*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m4, m6 ; high 8 masks: (-m << 9)
8741*c0909341SAndroid Build Coastguard Worker    pmulhw               m2, m3 ; ((b - a) * (-m << 10)) >> 16
8742*c0909341SAndroid Build Coastguard Worker    paddw                m1, m2 ; + b
8743*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5 ; round
8744*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5 ; round
8745*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; pack 16 -> 8 bits with unsigned saturation
8746*c0909341SAndroid Build Coastguard Worker%endmacro
8747*c0909341SAndroid Build Coastguard Worker
; MASK_INC_PTR n: advance the mask pointer by n*mmsize/2 bytes (one byte per
; pixel) and both intermediate-buffer pointers by n*mmsize bytes.
8748*c0909341SAndroid Build Coastguard Worker%macro MASK_INC_PTR 1
8749*c0909341SAndroid Build Coastguard Worker    add               maskq, %1*mmsize/2
8750*c0909341SAndroid Build Coastguard Worker    add               tmp1q, %1*mmsize
8751*c0909341SAndroid Build Coastguard Worker    add               tmp2q, %1*mmsize
8752*c0909341SAndroid Build Coastguard Worker%endmacro
8753*c0909341SAndroid Build Coastguard Worker
; mask_8bpc(dst, stride, tmp1, tmp2, w, h, mask)
; Blends the tmp1 and tmp2 word buffers with a per-pixel byte mask and stores
; bytes to dst (see MASK), using the BIDIR_FN store loops.
; The x86-64 build declares 8 GPRs and keeps h in a register. The other build
; declares 7 GPRs with no named h register and redefines hd as the stack slot
; r5m, so the BIDIR_FN loop counters operate directly on memory there.
8754*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8755*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
8756*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
8757*c0909341SAndroid Build Coastguard Worker%else
8758*c0909341SAndroid Build Coastguard Workercglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
8759*c0909341SAndroid Build Coastguard Worker%define hd dword r5m
8760*c0909341SAndroid Build Coastguard Worker%endif
8761*c0909341SAndroid Build Coastguard Worker%define base r6-mask_ssse3_table
8762*c0909341SAndroid Build Coastguard Worker    LEA                  r6, mask_ssse3_table
8763*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing zero count of w, used as the table index
8764*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4] ; table entry for this width
8765*c0909341SAndroid Build Coastguard Worker    pxor                 m4, m4 ; m4 = 0, required by MASK
8766*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_2048] ; pmulhrsw rounding multiplier for MASK
8767*c0909341SAndroid Build Coastguard Worker    add                  wq, r6 ; wq = jump target for BIDIR_FN
8768*c0909341SAndroid Build Coastguard Worker    mov               maskq, r6m ; mask pointer, loaded after the last use of r6 as table base
8769*c0909341SAndroid Build Coastguard Worker    BIDIR_FN           MASK
8770*c0909341SAndroid Build Coastguard Worker%undef hd
8771*c0909341SAndroid Build Coastguard Worker
; W_MASK_420_END idx...: second-row tail of the w_mask_420 loops for widths
; 32 and up. Expanded once per argument; each expansion covers 32 pixels of
; the second row and writes the 16 final mask bytes for column group idx.
; On entry, for each idx:
;   [maskq+16*idx]                 = row-0 pair sums of (64 - m), left 16 px
;   [dstq+strideq*1+16*(2*idx+1)]  = row-0 pair sums, right 16 px (dst row 1
;                                    serves as scratch until overwritten here)
; Expects m7 = the wm_420_sign constant loaded by w_mask_420_8bpc.
8772*c0909341SAndroid Build Coastguard Worker%macro W_MASK_420_END 1-*
8773*c0909341SAndroid Build Coastguard Worker%rep %0
8774*c0909341SAndroid Build Coastguard Worker    call .main
8775*c0909341SAndroid Build Coastguard Worker    paddw                m2, [maskq+16*%1] ; row-1 sums + row-0 sums, left 16 px
8776*c0909341SAndroid Build Coastguard Worker    mova      [maskq+16*%1], m2 ; park the word sums; replaced by final bytes below
8777*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*(2*%1+0)], m0 ; row 1 pixels, left 16
8778*c0909341SAndroid Build Coastguard Worker    call .main
8779*c0909341SAndroid Build Coastguard Worker    psubw                m3, m7, m2 ; m7 - row-1 sums, right 16 px
8780*c0909341SAndroid Build Coastguard Worker    psubw                m1, m7, [maskq+16*%1] ; m7 - (row 0 + row 1 sums), left 16 px
8781*c0909341SAndroid Build Coastguard Worker    psubw                m3, [dstq+strideq*1+16*(2*%1+1)] ; - row-0 sums, right 16 px
8782*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2 ; divide the 2x2 total by 4
8783*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 2
8784*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m3 ; 16 subsampled mask bytes
8785*c0909341SAndroid Build Coastguard Worker    mova      [maskq+16*%1], m1
8786*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*(2*%1+1)], m0 ; row 1 pixels, right 16 (overwrites the scratch)
8787*c0909341SAndroid Build Coastguard Worker    %rotate 1
8788*c0909341SAndroid Build Coastguard Worker%endrep
8789*c0909341SAndroid Build Coastguard Worker%endmacro
8790*c0909341SAndroid Build Coastguard Worker
; Temporary-register selection for the w_mask functions below, which use t0
; as the table base: UNIX64 builds pass 7, every other target passes 5.
; NOTE(review): DECLARE_REG_TMP is defined outside this chunk; presumably it
; binds t0 to the listed rN register - confirm against x86inc.
8791*c0909341SAndroid Build Coastguard Worker%if UNIX64
8792*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 7
8793*c0909341SAndroid Build Coastguard Worker%else
8794*c0909341SAndroid Build Coastguard WorkerDECLARE_REG_TMP 5
8795*c0909341SAndroid Build Coastguard Worker%endif
8796*c0909341SAndroid Build Coastguard Worker
; w_mask_420_8bpc(dst, stride, tmp1, tmp2, w, h, mask, sign)
; Blends the tmp1 and tmp2 word buffers using a mask derived from their
; absolute difference (see .main), stores the blended bytes to dst, and writes
; a mask subsampled 2x2: one byte per 2x2 pixel group, computed as
; (m7 - sum of the four (64 - m) values) >> 2.
; sign is read from r7m and selects the constant loaded into m7.
; Register roles after setup:
;   t0    = w_mask_420_ssse3_table base (addressing base for constants)
;   tmp2q = tmp2 - tmp1, so [tmp1q+tmp2q] reads tmp2 while only tmp1q advances
;   m6    = pw_2048 (pmulhrsw rounding multiplier)
;   m7    = wm_420_sign entry selected by sign
;   m8    = pw_6903 (a register on x86-64, a memory operand otherwise)
; The first .main result is computed before the width dispatch; each loop
; label calls .main again before falling into the store code.
8797*c0909341SAndroid Build Coastguard Workercglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
8798*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_420_ssse3_table
8799*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_420_ssse3_table
8800*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; trailing zero count of w, used as the table index
8801*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
8802*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, tmp1q ; tmp2q becomes the tmp2 - tmp1 offset
8803*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4] ; table entry for this width
8804*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pw_2048]
8805*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+wm_420_sign+r6*8] ; 258 - sign
8806*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; wq = jump target
8807*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8808*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
8809*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
8810*c0909341SAndroid Build Coastguard Worker%else
    ; non-x86-64 build: m8 is used straight from memory and h stays on the stack
8811*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pw_6903]
8812*c0909341SAndroid Build Coastguard Worker    %define              hd  dword hm
8813*c0909341SAndroid Build Coastguard Worker%endif
8814*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
8815*c0909341SAndroid Build Coastguard Worker    call .main
8816*c0909341SAndroid Build Coastguard Worker    jmp                  wq
8817*c0909341SAndroid Build Coastguard Worker.w4_loop:
8818*c0909341SAndroid Build Coastguard Worker    call .main
8819*c0909341SAndroid Build Coastguard Worker    add               maskq, 4
8820*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8821*c0909341SAndroid Build Coastguard Worker.w4: ; width 4: one .main = 4 rows of 4 px; m2 dword n = pair sums of row n
8822*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m2, q2020 ; pair sums of rows 0 and 2
8823*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q3131 ; pair sums of rows 1 and 3
8824*c0909341SAndroid Build Coastguard Worker    psubw                m1, m7, m3
8825*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; m7 - 2x2 sum of (64 - m)
8826*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
8827*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
8828*c0909341SAndroid Build Coastguard Worker    movd            [maskq], m1 ; 4 mask bytes for these 4 rows
8829*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0 ; row 0
8830*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
8831*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1 ; row 1
8832*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0 ; bring rows 2 and 3 into the low qword
8833*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8834*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0 ; row 2
8835*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
8836*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1 ; row 3
8837*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
8838*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
8839*c0909341SAndroid Build Coastguard Worker    RET
8840*c0909341SAndroid Build Coastguard Worker.w8_loop:
8841*c0909341SAndroid Build Coastguard Worker    call .main
8842*c0909341SAndroid Build Coastguard Worker    add               maskq, 4
8843*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8844*c0909341SAndroid Build Coastguard Worker.w8: ; width 8: one .main = 2 rows of 8 px; m2 low/high qword = row 0/1 pair sums
8845*c0909341SAndroid Build Coastguard Worker    movhlps              m3, m2 ; row-1 pair sums into the low qword of m3
8846*c0909341SAndroid Build Coastguard Worker    psubw                m1, m7, m2
8847*c0909341SAndroid Build Coastguard Worker    psubw                m1, m3 ; low 4 words: m7 - 2x2 sum of (64 - m)
8848*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
8849*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
8850*c0909341SAndroid Build Coastguard Worker    movd            [maskq], m1 ; 4 mask bytes for these 2 rows
8851*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
8852*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
8853*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8854*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
8855*c0909341SAndroid Build Coastguard Worker    RET
8856*c0909341SAndroid Build Coastguard Worker.w16_loop:
8857*c0909341SAndroid Build Coastguard Worker    call .main
8858*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
8859*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8860*c0909341SAndroid Build Coastguard Worker.w16: ; width 16: one .main = one row
8861*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m2 ; park row-0 pair sums in dst row 1 (scratch)
8862*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0 ; row 0 pixels
8863*c0909341SAndroid Build Coastguard Worker    call .main
8864*c0909341SAndroid Build Coastguard Worker    psubw                m1, m7, [dstq+strideq*1] ; m7 - row-0 sums
8865*c0909341SAndroid Build Coastguard Worker    psubw                m1, m2 ; - row-1 sums
8866*c0909341SAndroid Build Coastguard Worker    psrlw                m1, 2
8867*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
8868*c0909341SAndroid Build Coastguard Worker    movq            [maskq], m1 ; 8 mask bytes for these 2 rows
8869*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0 ; row 1 pixels (overwrites the scratch)
8870*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8871*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
8872*c0909341SAndroid Build Coastguard Worker    RET
8873*c0909341SAndroid Build Coastguard Worker.w32_loop:
8874*c0909341SAndroid Build Coastguard Worker    call .main
8875*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
8876*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8877*c0909341SAndroid Build Coastguard Worker.w32: ; width 32: row 0 here, row 1 and the final mask in W_MASK_420_END
8878*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m2 ; park row-0 pair sums (left 16 px) in the mask buffer
8879*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8880*c0909341SAndroid Build Coastguard Worker    call .main
8881*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m2 ; park row-0 pair sums (right 16 px) in dst row 1
8882*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m0
8883*c0909341SAndroid Build Coastguard Worker    W_MASK_420_END        0
8884*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8885*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
8886*c0909341SAndroid Build Coastguard Worker    RET
8887*c0909341SAndroid Build Coastguard Worker.w64_loop:
8888*c0909341SAndroid Build Coastguard Worker    call .main
8889*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
8890*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8891*c0909341SAndroid Build Coastguard Worker.w64: ; width 64: same scheme as .w32 with two 32-pixel column groups
8892*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*0], m2
8893*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8894*c0909341SAndroid Build Coastguard Worker    call .main
8895*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m2
8896*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m0
8897*c0909341SAndroid Build Coastguard Worker    call .main
8898*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*1], m2
8899*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m0
8900*c0909341SAndroid Build Coastguard Worker    call .main
8901*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m2
8902*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m0
8903*c0909341SAndroid Build Coastguard Worker    W_MASK_420_END        0, 1
8904*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8905*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
8906*c0909341SAndroid Build Coastguard Worker    RET
8907*c0909341SAndroid Build Coastguard Worker.w128_loop:
8908*c0909341SAndroid Build Coastguard Worker    call .main
8909*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*4
8910*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
8911*c0909341SAndroid Build Coastguard Worker.w128: ; width 128: same scheme as .w32 with four 32-pixel column groups
8912*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*0], m2
8913*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*0], m0
8914*c0909341SAndroid Build Coastguard Worker    call .main
8915*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*1], m2
8916*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*1], m0
8917*c0909341SAndroid Build Coastguard Worker    call .main
8918*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*1], m2
8919*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*2], m0
8920*c0909341SAndroid Build Coastguard Worker    call .main
8921*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*3], m2
8922*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*3], m0
8923*c0909341SAndroid Build Coastguard Worker    call .main
8924*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*2], m2
8925*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*4], m0
8926*c0909341SAndroid Build Coastguard Worker    call .main
8927*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*5], m2
8928*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*5], m0
8929*c0909341SAndroid Build Coastguard Worker    call .main
8930*c0909341SAndroid Build Coastguard Worker    mova       [maskq+16*3], m2
8931*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*6], m0
8932*c0909341SAndroid Build Coastguard Worker    call .main
8933*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*1+16*7], m2
8934*c0909341SAndroid Build Coastguard Worker    mova [dstq+strideq*0+16*7], m0
8935*c0909341SAndroid Build Coastguard Worker    W_MASK_420_END        0, 1, 2, 3
8936*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
8937*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
8938*c0909341SAndroid Build Coastguard Worker    RET
8939*c0909341SAndroid Build Coastguard WorkerALIGN function_align
; .main: process the next 16 pixels (t1 = tmp1 words, t2 = tmp2 words).
;   In:  tmp1q, tmp2q (= tmp2 - tmp1 offset), m6, m8 as set up above
;   Out: m0 = 16 blended bytes
;        m2 = 8 words, each the sum of (64 - m) over two horizontally
;             adjacent pixels (low 4 words: pixels 0-7, high 4: pixels 8-15)
;        tmp1q advanced by 32 bytes
;   Clobbers m1, m3, m4, m5. Also called by w_mask_422_8bpc.
8940*c0909341SAndroid Build Coastguard Worker.main:
8941*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q      +16*0] ; t1, pixels 0-7
8942*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+tmp2q+16*0] ; t2, pixels 0-7
8943*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q      +16*1] ; t1, pixels 8-15
8944*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+tmp2q+16*1] ; t2, pixels 8-15
8945*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
8946*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0 ; t2 - t1
8947*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1
8948*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m3 ; |t2 - t1|
8949*c0909341SAndroid Build Coastguard Worker    psubusw              m2, m8, m5 ; pw_6903 - |t2 - t1|, saturating at 0
8950*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 8 ; 64 - m
8951*c0909341SAndroid Build Coastguard Worker    psllw                m5, m2, 10 ; (64 - m) << 10
8952*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m5 ; ((t2 - t1) * ((64 - m) << 10)) >> 16
8953*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m4 ; same steps for pixels 8-15
8954*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3 ; t1 + weighted difference
8955*c0909341SAndroid Build Coastguard Worker    psubusw              m3, m8, m5
8956*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 8 ; 64 - m
8957*c0909341SAndroid Build Coastguard Worker    phaddw               m2, m3 ; sums of horizontally adjacent (64 - m) pairs
8958*c0909341SAndroid Build Coastguard Worker    psllw                m3, 10
8959*c0909341SAndroid Build Coastguard Worker    pmulhw               m4, m3
8960*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
8961*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m6 ; round with the pw_2048 multiplier
8962*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m6
8963*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1 ; 16 output bytes, unsigned-saturated
8964*c0909341SAndroid Build Coastguard Worker    ret
8965*c0909341SAndroid Build Coastguard Worker
; W_MASK_422_BACKUP mask_offset: preserve the m2 result of a .main call across
; the next .main call. x86-64 keeps it in m10; other builds spill it to the
; mask buffer slot at maskq+16*mask_offset, which W_MASK_422_END reloads.
8966*c0909341SAndroid Build Coastguard Worker%macro W_MASK_422_BACKUP 1 ; mask_offset
8967*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8968*c0909341SAndroid Build Coastguard Worker    mova                m10, m2
8969*c0909341SAndroid Build Coastguard Worker%else
8970*c0909341SAndroid Build Coastguard Worker    mova      [maskq+16*%1], m2
8971*c0909341SAndroid Build Coastguard Worker%endif
8972*c0909341SAndroid Build Coastguard Worker%endmacro
8973*c0909341SAndroid Build Coastguard Worker
; W_MASK_422_END mask_offset: combine the value saved by W_MASK_422_BACKUP
; with the current m2 into 16 final mask bytes and store them at
; maskq+16*mask_offset. Each byte is pavgb(m7 - pair sum, 0).
; x86-64 uses m10 (saved sums) and m9 (zeroed by w_mask_422_8bpc); other
; builds reload the saved sums into m3 and zero m2 locally.
; Expects m7 = the wm_422_sign constant loaded by w_mask_422_8bpc.
8974*c0909341SAndroid Build Coastguard Worker%macro W_MASK_422_END 1 ; mask_offset
8975*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
8976*c0909341SAndroid Build Coastguard Worker    packuswb            m10, m2 ; saved + current pair sums as 16 bytes
8977*c0909341SAndroid Build Coastguard Worker    psubb                m1, m7, m10 ; m7 - pair sum, per byte
8978*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m9 ; rounding average with zero
8979*c0909341SAndroid Build Coastguard Worker%else
8980*c0909341SAndroid Build Coastguard Worker    mova                 m3, [maskq+16*%1] ; reload the sums spilled by W_MASK_422_BACKUP
8981*c0909341SAndroid Build Coastguard Worker    packuswb             m3, m2
8982*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2 ; no spare zero register in this build
8983*c0909341SAndroid Build Coastguard Worker    psubb                m1, m7, m3
8984*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m2
8985*c0909341SAndroid Build Coastguard Worker%endif
8986*c0909341SAndroid Build Coastguard Worker    mova      [maskq+16*%1], m1
8987*c0909341SAndroid Build Coastguard Worker%endmacro
8988*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; w_mask_422_8bpc (SSSE3 entry point; x86inc cglobal)
; Named arguments: dst, stride, tmp1, tmp2, w, h, mask. An 8th argument
; is read from r7m and is labelled "sign" below.
;
; The per-16-pixel work is done by the .main routine of
; w_mask_420_8bpc_ssse3, which is not visible in this section. From the
; way its results are consumed here, every .main call leaves 16 output
; bytes in m0 (they are stored straight to dst) and mask-related words
; in m2. NOTE(review): confirm the exact contract against that .main.
;
; W_MASK_422_BACKUP / W_MASK_422_END are macros defined before this
; section. Presumably they combine the m2 results of two consecutive
; .main calls into one 16-byte mask store at slot <idx> relative to
; maskq -- TODO confirm against the macro definitions.
;-----------------------------------------------------------------------
8989*c0909341SAndroid Build Coastguard Workercglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask
8990*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_422_ssse3_table
8991*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_422_ssse3_table
    ; Width dispatch: the table holds 32-bit offsets relative to its own
    ; start and is indexed by the trailing-zero count of w (log2 w for a
    ; power-of-two width). wq ends up holding the absolute target address.
8992*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
8993*c0909341SAndroid Build Coastguard Worker    mov                 r6d, r7m ; sign
    ; tmp2q now holds the byte distance (tmp2 - tmp1) instead of a pointer.
8994*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, tmp1q
8995*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
8996*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pw_2048]
    ; The 8-byte table entry selected by sign is duplicated into both
    ; halves of m7.
8997*c0909341SAndroid Build Coastguard Worker    movddup              m7, [base+wm_422_sign+r6*8] ; 128 - sign
8998*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
    ; x86-64: keep pw_6903 in m8 and an all-zero register in m9, and load
    ; h into a register if it is not already there.
    ; x86-32: m8/m9 do not exist. t0 is moved to w_mask_420_ssse3_table
    ; (presumably because the shared .main addresses its constants
    ; relative to that table -- confirm), and hd aliases the in-memory
    ; argument so the row counter is decremented in place.
8999*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9000*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pw_6903]
9001*c0909341SAndroid Build Coastguard Worker    pxor                 m9, m9
9002*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9003*c0909341SAndroid Build Coastguard Worker%else
9004*c0909341SAndroid Build Coastguard Worker    add                  t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table
9005*c0909341SAndroid Build Coastguard Worker    %define              hd  dword hm
9006*c0909341SAndroid Build Coastguard Worker%endif
9007*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
    ; Compute the first 16 pixels, then jump to the width-specific code.
    ; Each *_loop label below is the re-entry point for later iterations:
    ; it calls .main again and advances maskq/dstq before falling into
    ; the same store sequence.
9008*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9009*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: one .main call covers 4 rows; 8 mask bytes per pass ----
9010*c0909341SAndroid Build Coastguard Worker.w4_loop:
9011*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9012*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
9013*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9014*c0909341SAndroid Build Coastguard Worker.w4:
    ; Narrow the m2 words to bytes, subtract them from m7, then halve with
    ; rounding: pavgb against zero yields (x + 1) >> 1. x86-32 has no
    ; zero register reserved, so m2 is cleared and used for that purpose.
9015*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m2
9016*c0909341SAndroid Build Coastguard Worker    psubb                m1, m7, m2
9017*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9018*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m9
9019*c0909341SAndroid Build Coastguard Worker%else
9020*c0909341SAndroid Build Coastguard Worker    pxor                 m2, m2
9021*c0909341SAndroid Build Coastguard Worker    pavgb                m1, m2
9022*c0909341SAndroid Build Coastguard Worker%endif
9023*c0909341SAndroid Build Coastguard Worker    movq            [maskq], m1
    ; Scatter the four dwords of m0 to four consecutive rows. pshuflw
    ; q1032 brings dword 1 into the low dword; punpckhqdq brings the high
    ; qword down for rows 2 and 3.
9024*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
9025*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
9026*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
9027*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
9028*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9029*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
9030*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
9031*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
9032*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
9033*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
9034*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: two .main calls cover 4 rows; maskq += 16 per pass ----
9035*c0909341SAndroid Build Coastguard Worker.w8_loop:
9036*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9037*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9038*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9039*c0909341SAndroid Build Coastguard Worker.w8:
9040*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     0
    ; Low and high qwords of m0 go to two consecutive 8-pixel rows.
9041*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9042*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9043*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9044*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9045*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        0
9046*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9047*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9048*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
9049*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
9050*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: two .main calls cover 2 rows; maskq += 16 per pass ----
9051*c0909341SAndroid Build Coastguard Worker.w16_loop:
9052*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9053*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9054*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9055*c0909341SAndroid Build Coastguard Worker.w16:
9056*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     0
9057*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
9058*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9059*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        0
9060*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
9061*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9062*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9063*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: two .main calls cover 1 row; maskq += 16 per row ----
9064*c0909341SAndroid Build Coastguard Worker.w32_loop:
9065*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9066*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9067*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9068*c0909341SAndroid Build Coastguard Worker.w32:
9069*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     0
9070*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9071*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9072*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        0
9073*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9074*c0909341SAndroid Build Coastguard Worker    dec                  hd
9075*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
9076*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 64: four .main calls cover 1 row; maskq += 32 per row ----
9077*c0909341SAndroid Build Coastguard Worker.w64_loop:
9078*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9079*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*2
9080*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9081*c0909341SAndroid Build Coastguard Worker.w64:
9082*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     0
9083*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9084*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9085*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        0
9086*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9087*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9088*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     1
9089*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9090*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9091*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        1
9092*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
9093*c0909341SAndroid Build Coastguard Worker    dec                  hd
9094*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
9095*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 128: eight .main calls cover 1 row; maskq += 64 per row ----
9096*c0909341SAndroid Build Coastguard Worker.w128_loop:
9097*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9098*c0909341SAndroid Build Coastguard Worker    add               maskq, 16*4
9099*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9100*c0909341SAndroid Build Coastguard Worker.w128:
9101*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     0
9102*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9103*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9104*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        0
9105*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9106*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9107*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     1
9108*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9109*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9110*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        1
9111*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
9112*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9113*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     2
9114*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
9115*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9116*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        2
9117*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m0
9118*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9119*c0909341SAndroid Build Coastguard Worker    W_MASK_422_BACKUP     3
9120*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
9121*c0909341SAndroid Build Coastguard Worker    call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
9122*c0909341SAndroid Build Coastguard Worker    W_MASK_422_END        3
9123*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m0
9124*c0909341SAndroid Build Coastguard Worker    dec                  hd
9125*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
9126*c0909341SAndroid Build Coastguard Worker    RET
9127*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; w_mask_444_8bpc (SSSE3 entry point; x86inc cglobal)
; Named arguments: dst, stride, tmp1, tmp2, w, h, mask.
;
; Blends two buffers of 16-bit intermediates (tmp1, tmp2) into 8-bit
; pixels at dst and writes one mask byte per pixel to mask. All of the
; arithmetic lives in the local .main routine at the end of this
; function, which handles 16 pixels per call; the width-specific code
; only decides where each 16-byte result in m0 is stored.
;
; Register roles set up here and relied on by .main:
;   m6 = pw_6903, m7 = pw_2048, m8 = pb_64 (a memory operand on x86-32)
;   tmp2q = byte distance (tmp2 - tmp1), so tmp2 data is [tmp1q+tmp2q]
;-----------------------------------------------------------------------
9128*c0909341SAndroid Build Coastguard Workercglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
9129*c0909341SAndroid Build Coastguard Worker%define base t0-w_mask_444_ssse3_table
9130*c0909341SAndroid Build Coastguard Worker    LEA                  t0, w_mask_444_ssse3_table
    ; Width dispatch: 32-bit table entries relative to the table start,
    ; indexed by the trailing-zero count of w.
9131*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
9132*c0909341SAndroid Build Coastguard Worker    mov               maskq, maskmp
9133*c0909341SAndroid Build Coastguard Worker    sub               tmp2q, tmp1q
9134*c0909341SAndroid Build Coastguard Worker    movsxd               wq, [t0+wq*4]
9135*c0909341SAndroid Build Coastguard Worker    mova                 m6, [base+pw_6903]
9136*c0909341SAndroid Build Coastguard Worker    mova                 m7, [base+pw_2048]
9137*c0909341SAndroid Build Coastguard Worker    add                  wq, t0
    ; x86-64 keeps pb_64 in a register. On x86-32 there is no m8, so the
    ; name is aliased to the constant in memory, and hd aliases the
    ; in-memory argument so the row counter is decremented in place.
9138*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9139*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pb_64]
9140*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9141*c0909341SAndroid Build Coastguard Worker%else
9142*c0909341SAndroid Build Coastguard Worker    %define              m8  [base+pb_64]
9143*c0909341SAndroid Build Coastguard Worker    %define              hd  dword hm
9144*c0909341SAndroid Build Coastguard Worker%endif
    ; Compute the first 16 pixels, then jump to the width-specific code.
    ; Each *_loop label is the re-entry point for later iterations.
9145*c0909341SAndroid Build Coastguard Worker    call .main
9146*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: one .main call covers 4 rows ----
9147*c0909341SAndroid Build Coastguard Worker.w4_loop:
9148*c0909341SAndroid Build Coastguard Worker    call .main
9149*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9150*c0909341SAndroid Build Coastguard Worker.w4:
    ; Scatter the four dwords of m0 to four rows (pshuflw q1032 exposes
    ; dword 1, punpckhqdq brings the high qword down).
9151*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
9152*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
9153*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
9154*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m0, m0
9155*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9156*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*0], m0
9157*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m0, q1032
9158*c0909341SAndroid Build Coastguard Worker    movd   [dstq+strideq*1], m1
9159*c0909341SAndroid Build Coastguard Worker    sub                  hd, 4
9160*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
9161*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: one .main call covers 2 rows ----
9162*c0909341SAndroid Build Coastguard Worker.w8_loop:
9163*c0909341SAndroid Build Coastguard Worker    call .main
9164*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9165*c0909341SAndroid Build Coastguard Worker.w8:
9166*c0909341SAndroid Build Coastguard Worker    movq   [dstq+strideq*0], m0
9167*c0909341SAndroid Build Coastguard Worker    movhps [dstq+strideq*1], m0
9168*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9169*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
9170*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: two .main calls cover 2 rows ----
9171*c0909341SAndroid Build Coastguard Worker.w16_loop:
9172*c0909341SAndroid Build Coastguard Worker    call .main
9173*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+strideq*2]
9174*c0909341SAndroid Build Coastguard Worker.w16:
9175*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*0], m0
9176*c0909341SAndroid Build Coastguard Worker    call .main
9177*c0909341SAndroid Build Coastguard Worker    mova   [dstq+strideq*1], m0
9178*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9179*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9180*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: two .main calls cover 1 row ----
9181*c0909341SAndroid Build Coastguard Worker.w32_loop:
9182*c0909341SAndroid Build Coastguard Worker    call .main
9183*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9184*c0909341SAndroid Build Coastguard Worker.w32:
9185*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9186*c0909341SAndroid Build Coastguard Worker    call .main
9187*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9188*c0909341SAndroid Build Coastguard Worker    dec                  hd
9189*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
9190*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 64: four .main calls cover 1 row ----
9191*c0909341SAndroid Build Coastguard Worker.w64_loop:
9192*c0909341SAndroid Build Coastguard Worker    call .main
9193*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9194*c0909341SAndroid Build Coastguard Worker.w64:
9195*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9196*c0909341SAndroid Build Coastguard Worker    call .main
9197*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9198*c0909341SAndroid Build Coastguard Worker    call .main
9199*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9200*c0909341SAndroid Build Coastguard Worker    call .main
9201*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
9202*c0909341SAndroid Build Coastguard Worker    dec                  hd
9203*c0909341SAndroid Build Coastguard Worker    jg .w64_loop
9204*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 128: eight .main calls cover 1 row ----
9205*c0909341SAndroid Build Coastguard Worker.w128_loop:
9206*c0909341SAndroid Build Coastguard Worker    call .main
9207*c0909341SAndroid Build Coastguard Worker    add                dstq, strideq
9208*c0909341SAndroid Build Coastguard Worker.w128:
9209*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9210*c0909341SAndroid Build Coastguard Worker    call .main
9211*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*1], m0
9212*c0909341SAndroid Build Coastguard Worker    call .main
9213*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*2], m0
9214*c0909341SAndroid Build Coastguard Worker    call .main
9215*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*3], m0
9216*c0909341SAndroid Build Coastguard Worker    call .main
9217*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*4], m0
9218*c0909341SAndroid Build Coastguard Worker    call .main
9219*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*5], m0
9220*c0909341SAndroid Build Coastguard Worker    call .main
9221*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*6], m0
9222*c0909341SAndroid Build Coastguard Worker    call .main
9223*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*7], m0
9224*c0909341SAndroid Build Coastguard Worker    dec                  hd
9225*c0909341SAndroid Build Coastguard Worker    jg .w128_loop
9226*c0909341SAndroid Build Coastguard Worker    RET
;-----------------------------------------------------------------------
; .main -- blend 16 pixels and emit their 16 mask bytes.
; In:    tmp1q -> 16 words of the first intermediate buffer
;        tmp2q =  byte distance to the matching words of the second one
;        maskq -> destination for 16 mask bytes
;        m6 = pw_6903, m7 = pw_2048, m8 = pb_64
; Out:   m0 = 16 blended pixels (u8); tmp1q += 32; maskq += 16
; Clobb: m1-m5
; Per word, with d = tmp2 - tmp1:
;   n   = sat_u16(m6 - |d|) >> 8          (labelled "64 - m" below)
;   sum = tmp1 + ((d * (n << 10)) >> 16)  (i.e. tmp1 + d*n/64, via pmulhw)
;   px  = pmulhrsw(sum, m7)               ((sum + 8) >> 4 if m7 is 2048)
;   mask byte = m8 - n                    (64 - n if m8 is bytes of 64)
; NOTE(review): the constant values are inferred from their names; their
; definitions are outside this section.
;-----------------------------------------------------------------------
9227*c0909341SAndroid Build Coastguard WorkerALIGN function_align
9228*c0909341SAndroid Build Coastguard Worker.main:
9229*c0909341SAndroid Build Coastguard Worker    mova                 m0, [tmp1q      +16*0]
9230*c0909341SAndroid Build Coastguard Worker    mova                 m3, [tmp1q+tmp2q+16*0]
9231*c0909341SAndroid Build Coastguard Worker    mova                 m1, [tmp1q      +16*1]
9232*c0909341SAndroid Build Coastguard Worker    mova                 m4, [tmp1q+tmp2q+16*1]
9233*c0909341SAndroid Build Coastguard Worker    add               tmp1q, 16*2
    ; m3/m4 = d = tmp2 - tmp1 for pixels 0-7 / 8-15.
9234*c0909341SAndroid Build Coastguard Worker    psubw                m3, m0
9235*c0909341SAndroid Build Coastguard Worker    psubw                m4, m1
    ; Pixels 0-7: derive the weight from |d| (unsigned saturating
    ; subtract clamps it at 0), then scale d by it.
9236*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m3
9237*c0909341SAndroid Build Coastguard Worker    psubusw              m2, m6, m5
9238*c0909341SAndroid Build Coastguard Worker    psrlw                m2, 8 ; 64 - m
9239*c0909341SAndroid Build Coastguard Worker    psllw                m5, m2, 10
9240*c0909341SAndroid Build Coastguard Worker    pmulhw               m3, m5
    ; Pixels 8-15: same computation, interleaved with finishing the
    ; first half (m0 += scaled d).
9241*c0909341SAndroid Build Coastguard Worker    pabsw                m5, m4
9242*c0909341SAndroid Build Coastguard Worker    paddw                m0, m3
9243*c0909341SAndroid Build Coastguard Worker    psubusw              m3, m6, m5
9244*c0909341SAndroid Build Coastguard Worker    psrlw                m3, 8
    ; Pack both halves' weights to bytes before m3 is shifted for the
    ; multiply; m2 then holds 16 "64 - m" bytes.
9245*c0909341SAndroid Build Coastguard Worker    packuswb             m2, m3
9246*c0909341SAndroid Build Coastguard Worker    psllw                m3, 10
9247*c0909341SAndroid Build Coastguard Worker    pmulhw               m4, m3
    ; m3 = m8 - m2: the mask bytes that get stored.
9248*c0909341SAndroid Build Coastguard Worker    psubb                m3, m8, m2
9249*c0909341SAndroid Build Coastguard Worker    paddw                m1, m4
    ; Round and scale both halves down to pixel range, then pack to u8
    ; with unsigned saturation.
9250*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m7
9251*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m7
9252*c0909341SAndroid Build Coastguard Worker    mova            [maskq], m3
9253*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9254*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1
9255*c0909341SAndroid Build Coastguard Worker    ret
9256*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; BLEND_64M a, b, mask1, mask2
; Blends 16 pixels of %1 (a) with 16 pixels of %2 (b).
;   %3 = interleaved byte weights {(64-m); m} for pixels 0-7
;   %4 = interleaved byte weights {(64-m); m} for pixels 8-15
; Requires m5 to hold the pmulhrsw multiplier; the callers visible in
; this file section load pw_512 into it, which makes the scaling step
; (x + 32) >> 6 (assuming pw_512 is words of 512).
; Out:   m0 = 16 blended pixels (u8)
; Clobb: %1 (overwritten with intermediate results)
;-----------------------------------------------------------------------
9257*c0909341SAndroid Build Coastguard Worker%macro BLEND_64M 4; a, b, mask1, mask2
9258*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, %1, %2; {b;a}[7..0]
9259*c0909341SAndroid Build Coastguard Worker    punpckhbw            %1, %2    ; {b;a}[15..8]
9260*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
9261*c0909341SAndroid Build Coastguard Worker    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
9262*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16 (m5 = pw_512)
9263*c0909341SAndroid Build Coastguard Worker    pmulhrsw             %1, m5    ; {(b*m[1] + (64-m[1])*a + 32) >> 6}[15..8] u16 (m5 = pw_512)
9264*c0909341SAndroid Build Coastguard Worker    packuswb             m0, %1    ; {blendpx}[15..0] u8
9265*c0909341SAndroid Build Coastguard Worker%endmacro
9266*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; BLEND a, b
; Blends 16 pixels of %1 (a) with 16 pixels of %2 (b) using 16 per-pixel
; mask bytes.
; In:    m0 = mask bytes m, m4 = the constant m is subtracted from
;        (commented as 64 below; callers load pb_64), m5 = multiplier
;        used by BLEND_64M
; Out:   m0 = 16 blended pixels (u8) -- the mask in m0 is overwritten
; Clobb: m2, m3, %1
;-----------------------------------------------------------------------
9267*c0909341SAndroid Build Coastguard Worker%macro BLEND 2; a, b
9268*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m0 ; m3 = (64 - m)
9269*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
9270*c0909341SAndroid Build Coastguard Worker    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
9271*c0909341SAndroid Build Coastguard Worker    BLEND_64M            %1, %2, m2, m3
9272*c0909341SAndroid Build Coastguard Worker%endmacro
9273*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; blend_8bpc (SSSE3 entry point; x86inc cglobal)
; Named arguments: dst, ds (dst stride), tmp, w, h, mask.
;
; For every pixel: dst = (dst*(64-m) + tmp*m + 32) >> 6, where m is the
; matching byte from mask. tmp and mask are read linearly (their
; pointers advance by the number of bytes consumed per pass); only dst
; uses the stride.
; NOTE(review): the formula assumes pb_64 is bytes of 64 and pw_512 is
; words of 512; both definitions are outside this section.
;
; Constants: m4 = pb_64, m5 = pw_512.
;-----------------------------------------------------------------------
9274*c0909341SAndroid Build Coastguard Workercglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
9275*c0909341SAndroid Build Coastguard Worker%define base r6-blend_ssse3_table
9276*c0909341SAndroid Build Coastguard Worker    LEA                  r6, blend_ssse3_table
    ; Width dispatch: 32-bit table entries relative to the table start,
    ; indexed by the trailing-zero count of w.
9277*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
9278*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9279*c0909341SAndroid Build Coastguard Worker    movifnidn         maskq, maskmp
9280*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r6+wq*4]
9281*c0909341SAndroid Build Coastguard Worker    mova                 m4, [base+pb_64]
9282*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_512]
9283*c0909341SAndroid Build Coastguard Worker    add                  wq, r6
    ; r6 is reloaded with ds*3 once the table base is no longer needed.
    ; NOTE(review): none of the width paths visible in this block reads
    ; r6 afterwards.
9284*c0909341SAndroid Build Coastguard Worker    lea                  r6, [dsq*3]
9285*c0909341SAndroid Build Coastguard Worker    jmp                  wq
    ; ---- w == 4: 2 rows per pass (8 mask bytes, 8 tmp bytes) ----
9286*c0909341SAndroid Build Coastguard Worker.w4:
9287*c0909341SAndroid Build Coastguard Worker    movq                 m0, [maskq]; m
9288*c0909341SAndroid Build Coastguard Worker    movd                 m1, [dstq+dsq*0] ; a
9289*c0909341SAndroid Build Coastguard Worker    movd                 m6, [dstq+dsq*1]
9290*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m6
9291*c0909341SAndroid Build Coastguard Worker    movq                 m6, [tmpq] ; b
9292*c0909341SAndroid Build Coastguard Worker    psubb                m3, m4, m0 ; m3 = (64 - m)
9293*c0909341SAndroid Build Coastguard Worker    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
9294*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m6    ; {b;a}[7..0]
9295*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m2    ; {b*m[0] + (64-m[0])*a}[7..0] u16
9296*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5    ; {(b*m[0] + (64-m[0])*a + 32) >> 6}[7..0] u16
9297*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m0    ; low 8 bytes = {blendpx}[7..0] u8; high 8 bytes (packed from m0) are never stored
9298*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m1
9299*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32
9300*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
9301*c0909341SAndroid Build Coastguard Worker    add               maskq, 8
9302*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8
9303*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
9304*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9305*c0909341SAndroid Build Coastguard Worker    jg .w4
9306*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 8: 2 rows per pass (16 mask bytes, 16 tmp bytes) ----
9307*c0909341SAndroid Build Coastguard Worker.w8:
9308*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq]; m
9309*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+dsq*0] ; a
9310*c0909341SAndroid Build Coastguard Worker    movhps               m1, [dstq+dsq*1]
9311*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmpq] ; b
9312*c0909341SAndroid Build Coastguard Worker    BLEND                m1, m6
9313*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
9314*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
9315*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9316*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
9317*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
9318*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9319*c0909341SAndroid Build Coastguard Worker    jg .w8
9320*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 16: 1 row per pass; dst is accessed with aligned moves ----
9321*c0909341SAndroid Build Coastguard Worker.w16:
9322*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq]; m
9323*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq] ; a
9324*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmpq] ; b
9325*c0909341SAndroid Build Coastguard Worker    BLEND                m1, m6
9326*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
9327*c0909341SAndroid Build Coastguard Worker    add               maskq, 16
9328*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
9329*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst_stride
9330*c0909341SAndroid Build Coastguard Worker    dec                  hd
9331*c0909341SAndroid Build Coastguard Worker    jg .w16
9332*c0909341SAndroid Build Coastguard Worker    RET
    ; ---- w == 32: 1 row per pass, assembled as two unrolled 16-byte
    ; halves (%rep 2 with i = 0, 1) ----
9333*c0909341SAndroid Build Coastguard Worker.w32:
9334*c0909341SAndroid Build Coastguard Worker    %assign i 0
9335*c0909341SAndroid Build Coastguard Worker    %rep 2
9336*c0909341SAndroid Build Coastguard Worker    mova                 m0, [maskq+16*i]; m
9337*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*i] ; a
9338*c0909341SAndroid Build Coastguard Worker    mova                 m6, [tmpq+16*i] ; b
9339*c0909341SAndroid Build Coastguard Worker    BLEND                m1, m6
9340*c0909341SAndroid Build Coastguard Worker    mova        [dstq+i*16], m0
9341*c0909341SAndroid Build Coastguard Worker    %assign i i+1
9342*c0909341SAndroid Build Coastguard Worker    %endrep
9343*c0909341SAndroid Build Coastguard Worker    add               maskq, 32
9344*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
9345*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dst_stride
9346*c0909341SAndroid Build Coastguard Worker    dec                  hd
9347*c0909341SAndroid Build Coastguard Worker    jg .w32
9348*c0909341SAndroid Build Coastguard Worker    RET
9349*c0909341SAndroid Build Coastguard Worker
9350*c0909341SAndroid Build Coastguard Workercglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask
9351*c0909341SAndroid Build Coastguard Worker%define base r5-blend_v_ssse3_table
9352*c0909341SAndroid Build Coastguard Worker    LEA                  r5, blend_v_ssse3_table
9353*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm
9354*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9355*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [r5+wq*4]
9356*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_512]
9357*c0909341SAndroid Build Coastguard Worker    add                  wq, r5
9358*c0909341SAndroid Build Coastguard Worker    add               maskq, obmc_masks-blend_v_ssse3_table
9359*c0909341SAndroid Build Coastguard Worker    jmp                  wq
9360*c0909341SAndroid Build Coastguard Worker.w2:
9361*c0909341SAndroid Build Coastguard Worker    movd                 m3, [maskq+4]
9362*c0909341SAndroid Build Coastguard Worker    punpckldq            m3, m3
9363*c0909341SAndroid Build Coastguard Worker    ; 2 mask blend is provided for 4 pixels / 2 lines
9364*c0909341SAndroid Build Coastguard Worker.w2_loop:
9365*c0909341SAndroid Build Coastguard Worker    movd                 m1, [dstq+dsq*0] ; a {..;a;a}
9366*c0909341SAndroid Build Coastguard Worker    pinsrw               m1, [dstq+dsq*1], 1
9367*c0909341SAndroid Build Coastguard Worker    movd                 m2, [tmpq] ; b
9368*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1, m2; {b;a}[7..0]
9369*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[7..0] u16
9370*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
9371*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m1    ; {blendpx}[8..0] u8
9372*c0909341SAndroid Build Coastguard Worker    movd                r3d, m0
9373*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r3w
9374*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 16
9375*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r3w
9376*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
9377*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq + dsq * 2]
9378*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9379*c0909341SAndroid Build Coastguard Worker    jg .w2_loop
9380*c0909341SAndroid Build Coastguard Worker    RET
9381*c0909341SAndroid Build Coastguard Worker.w4:
9382*c0909341SAndroid Build Coastguard Worker    movddup              m3, [maskq+8]
9383*c0909341SAndroid Build Coastguard Worker    ; 4 mask blend is provided for 8 pixels / 2 lines
9384*c0909341SAndroid Build Coastguard Worker.w4_loop:
9385*c0909341SAndroid Build Coastguard Worker    movd                 m1, [dstq+dsq*0] ; a
9386*c0909341SAndroid Build Coastguard Worker    movd                 m2, [dstq+dsq*1] ;
9387*c0909341SAndroid Build Coastguard Worker    punpckldq            m1, m2
9388*c0909341SAndroid Build Coastguard Worker    movq                 m2, [tmpq] ; b
9389*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, m2    ; {b;a}[7..0]
9390*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
9391*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5    ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
9392*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1    ; {blendpx}[8..0] u8
9393*c0909341SAndroid Build Coastguard Worker    movd             [dstq], m1
9394*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32
9395*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m1
9396*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*4
9397*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9398*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9399*c0909341SAndroid Build Coastguard Worker    jg .w4_loop
9400*c0909341SAndroid Build Coastguard Worker    RET
9401*c0909341SAndroid Build Coastguard Worker.w8:
9402*c0909341SAndroid Build Coastguard Worker    mova                 m3, [maskq+16]
9403*c0909341SAndroid Build Coastguard Worker    ; 8 mask blend is provided for 16 pixels
9404*c0909341SAndroid Build Coastguard Worker.w8_loop:
9405*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+dsq*0] ; a
9406*c0909341SAndroid Build Coastguard Worker    movhps               m1, [dstq+dsq*1]
9407*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]; b
9408*c0909341SAndroid Build Coastguard Worker    BLEND_64M            m1, m2, m3, m3
9409*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
9410*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
9411*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
9412*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9413*c0909341SAndroid Build Coastguard Worker    sub                  hd, 2
9414*c0909341SAndroid Build Coastguard Worker    jg .w8_loop
9415*c0909341SAndroid Build Coastguard Worker    RET
9416*c0909341SAndroid Build Coastguard Worker.w16:
9417*c0909341SAndroid Build Coastguard Worker    ; 16 mask blend is provided for 32 pixels
9418*c0909341SAndroid Build Coastguard Worker    mova                  m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
9419*c0909341SAndroid Build Coastguard Worker    mova                  m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
9420*c0909341SAndroid Build Coastguard Worker.w16_loop:
9421*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq] ; a
9422*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq] ; b
9423*c0909341SAndroid Build Coastguard Worker    BLEND_64M            m1, m2, m3, m4
9424*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
9425*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
9426*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9427*c0909341SAndroid Build Coastguard Worker    dec                  hd
9428*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9429*c0909341SAndroid Build Coastguard Worker    RET
9430*c0909341SAndroid Build Coastguard Worker.w32:
9431*c0909341SAndroid Build Coastguard Worker%if WIN64
9432*c0909341SAndroid Build Coastguard Worker    mova            [rsp+8], xmm6
9433*c0909341SAndroid Build Coastguard Worker%endif
9434*c0909341SAndroid Build Coastguard Worker    mova                 m3, [maskq+64] ; obmc_masks_32[0] (64-m[0])
9435*c0909341SAndroid Build Coastguard Worker    mova                 m4, [maskq+80] ; obmc_masks_32[1] (64-m[1])
9436*c0909341SAndroid Build Coastguard Worker    mova                 m6, [maskq+96] ; obmc_masks_32[2] (64-m[2])
9437*c0909341SAndroid Build Coastguard Worker    ; 16 mask blend is provided for 64 pixels
9438*c0909341SAndroid Build Coastguard Worker.w32_loop:
9439*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq+16*0] ; a
9440*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq+16*0] ; b
9441*c0909341SAndroid Build Coastguard Worker    BLEND_64M            m1, m2, m3, m4
9442*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+16*1] ; a
9443*c0909341SAndroid Build Coastguard Worker    punpcklbw            m1, [tmpq+16*1] ; b
9444*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m1, m6
9445*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m1, m5
9446*c0909341SAndroid Build Coastguard Worker    packuswb             m1, m1
9447*c0909341SAndroid Build Coastguard Worker    mova        [dstq+16*0], m0
9448*c0909341SAndroid Build Coastguard Worker    movq        [dstq+16*1], m1
9449*c0909341SAndroid Build Coastguard Worker    add                tmpq, 32
9450*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq
9451*c0909341SAndroid Build Coastguard Worker    dec                  hd
9452*c0909341SAndroid Build Coastguard Worker    jg .w32_loop
9453*c0909341SAndroid Build Coastguard Worker%if WIN64
9454*c0909341SAndroid Build Coastguard Worker    mova               xmm6, [rsp+8]
9455*c0909341SAndroid Build Coastguard Worker%endif
9456*c0909341SAndroid Build Coastguard Worker    RET
9457*c0909341SAndroid Build Coastguard Worker
; blend_h_8bpc: blend tmp into dst in place, using one weight pair per row
; read from the obmc_masks table (the table offset depends on the height).
;   dst  - destination pixels (bytes); rows are ds bytes apart
;   tmp  - second source; rows are packed back to back (tmpq advances by
;          exactly w bytes per row on every path)
;   w    - block width; selects the code path via blend_h_ssse3_table[tzcnt(w)]
;   h    - block height; only the first h*3/4 rows are processed
;   mask - register name only: the pointer is computed below from obmc_masks,
;          it is never loaded from an argument
; cglobal (x86inc): 3 arguments preloaded, 7 GPRs and 6 XMM registers used.
; Register roles after setup:
;   m5    = [pw_512], multiplier for the pmulhrsw rounding step
;   maskq = end of this height's row weights; indexed as [maskq+hq*2]
;   hq    = -(h*3/4), counted up towards zero (2 bytes of weights per row)
; NOTE(review): BLEND_64M, obmc_masks, blend_shuf and pw_512 are defined
; outside this block; statements about them below are inferred from how they
; are used here - confirm against their definitions.
9458*c0909341SAndroid Build Coastguard Workercglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
; "base" lets [base+symbol] address data relative to the table pointer in t0.
9459*c0909341SAndroid Build Coastguard Worker%define base t0-blend_h_ssse3_table
9460*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9461*c0909341SAndroid Build Coastguard Worker    ; We need to keep the PIC pointer for w4, reload wd from stack instead
    ; (t0 = r6 here, so it survives the write to maskq (r5) further down)
9462*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 6
9463*c0909341SAndroid Build Coastguard Worker%else
    ; t0 = r5, the same register as maskq: the table pointer is only valid
    ; until maskq is written below. r6d keeps w for the .w16 path because wd
    ; is about to be replaced by the jump-table index.
9464*c0909341SAndroid Build Coastguard Worker    DECLARE_REG_TMP 5
9465*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wd
9466*c0909341SAndroid Build Coastguard Worker%endif
9467*c0909341SAndroid Build Coastguard Worker    LEA                  t0, blend_h_ssse3_table
9468*c0909341SAndroid Build Coastguard Worker    tzcnt                wd, wm ; table index = number of trailing zero bits of w
9469*c0909341SAndroid Build Coastguard Worker    mov                  hd, hm ; h is not among the 3 preloaded arguments
9470*c0909341SAndroid Build Coastguard Worker    movsxd               wq, dword [t0+wq*4] ; signed 32-bit offset, relative to the table
9471*c0909341SAndroid Build Coastguard Worker    mova                 m5, [base+pw_512]
9472*c0909341SAndroid Build Coastguard Worker    add                  wq, t0 ; wq = absolute address of the width-specific code
9473*c0909341SAndroid Build Coastguard Worker    lea               maskq, [base+obmc_masks+hq*2] ; weights for this height start at obmc_masks + h*2
9474*c0909341SAndroid Build Coastguard Worker    lea                  hd, [hq*3]
9475*c0909341SAndroid Build Coastguard Worker    shr                  hd, 2 ; hd = h * 3/4 = number of rows to blend
9476*c0909341SAndroid Build Coastguard Worker    lea               maskq, [maskq+hq*2] ; point past the last used weight pair
9477*c0909341SAndroid Build Coastguard Worker    neg                  hq ; negative row counter, loops run until it reaches 0
9478*c0909341SAndroid Build Coastguard Worker    jmp                  wq
; w == 2: two rows (2 pixels each) per iteration.
9479*c0909341SAndroid Build Coastguard Worker.w2:
9480*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+dsq*0]
9481*c0909341SAndroid Build Coastguard Worker    pinsrw               m0, [dstq+dsq*1], 1 ; low dword = row0 px0,px1, row1 px0,px1
9482*c0909341SAndroid Build Coastguard Worker    movd                 m2, [maskq+hq*2] ; weight pairs of both rows
9483*c0909341SAndroid Build Coastguard Worker    movd                 m1, [tmpq] ; 2 rows x 2 pixels of tmp
9484*c0909341SAndroid Build Coastguard Worker    punpcklwd            m2, m2 ; repeat each row's weight pair for its 2 pixels
9485*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; interleave bytes: dst, tmp, dst, tmp, ...
9486*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; per pixel: dst*weight_lo + tmp*weight_hi
9487*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5 ; rounding scale-down by the constant in m5
9488*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0 ; words -> unsigned saturated bytes
9489*c0909341SAndroid Build Coastguard Worker    movd                r3d, m0
9490*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*0], r3w ; low 2 bytes -> row 0
9491*c0909341SAndroid Build Coastguard Worker    shr                 r3d, 16
9492*c0909341SAndroid Build Coastguard Worker    mov        [dstq+dsq*1], r3w ; next 2 bytes -> row 1
9493*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9494*c0909341SAndroid Build Coastguard Worker    add                tmpq, 2*2
9495*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9496*c0909341SAndroid Build Coastguard Worker    jl .w2
9497*c0909341SAndroid Build Coastguard Worker    RET
; w == 4: two rows (4 pixels each) per iteration.
9498*c0909341SAndroid Build Coastguard Worker.w4:
    ; x86-32 still has the table pointer in t0 (r6) and addresses blend_shuf
    ; through it; on x86-64 t0 was overwritten by maskq, so the symbol is
    ; referenced directly.
9499*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9500*c0909341SAndroid Build Coastguard Worker    mova                 m3, [base+blend_shuf]
9501*c0909341SAndroid Build Coastguard Worker%else
9502*c0909341SAndroid Build Coastguard Worker    mova                 m3, [blend_shuf]
9503*c0909341SAndroid Build Coastguard Worker%endif
9504*c0909341SAndroid Build Coastguard Worker.w4_loop:
9505*c0909341SAndroid Build Coastguard Worker    movd                 m0, [dstq+dsq*0]
9506*c0909341SAndroid Build Coastguard Worker    movd                 m2, [dstq+dsq*1]
9507*c0909341SAndroid Build Coastguard Worker    punpckldq            m0, m2 ; a
9508*c0909341SAndroid Build Coastguard Worker    movq                 m1, [tmpq] ; b
9509*c0909341SAndroid Build Coastguard Worker    movq                 m2, [maskq+hq*2] ; m
    ; presumably blend_shuf spreads each row's weight pair over that row's
    ; 4 pixels - TODO confirm against the blend_shuf definition
9510*c0909341SAndroid Build Coastguard Worker    pshufb               m2, m3
9511*c0909341SAndroid Build Coastguard Worker    punpcklbw            m0, m1 ; interleave dst/tmp bytes
9512*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m0, m2 ; weighted sum per pixel
9513*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m0, m5
9514*c0909341SAndroid Build Coastguard Worker    packuswb             m0, m0
9515*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*0], m0 ; bytes 0-3 -> row 0
9516*c0909341SAndroid Build Coastguard Worker    psrlq                m0, 32
9517*c0909341SAndroid Build Coastguard Worker    movd       [dstq+dsq*1], m0 ; bytes 4-7 -> row 1
9518*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9519*c0909341SAndroid Build Coastguard Worker    add                tmpq, 4*2
9520*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9521*c0909341SAndroid Build Coastguard Worker    jl .w4_loop
9522*c0909341SAndroid Build Coastguard Worker    RET
; w == 8: two rows (8 pixels each) per iteration; row 0 sits in the low half
; of the vector and row 1 in the high half.
9523*c0909341SAndroid Build Coastguard Worker.w8:
9524*c0909341SAndroid Build Coastguard Worker    movd                 m4, [maskq+hq*2] ; weight pairs of both rows
9525*c0909341SAndroid Build Coastguard Worker    punpcklwd            m4, m4
9526*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m4, q0000 ; m3 = row 0 weight pair in every word
9527*c0909341SAndroid Build Coastguard Worker    pshufd               m4, m4, q1111 ; m4 = row 1 weight pair in every word
9528*c0909341SAndroid Build Coastguard Worker    movq                 m1, [dstq+dsq*0] ; a
9529*c0909341SAndroid Build Coastguard Worker    movhps               m1, [dstq+dsq*1]
9530*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq]
    ; BLEND_64M leaves its result in m0 (stored below); presumably the third
    ; operand weights the low 8 pixels and the fourth the high 8 - TODO confirm
9531*c0909341SAndroid Build Coastguard Worker    BLEND_64M            m1, m2, m3, m4
9532*c0909341SAndroid Build Coastguard Worker    movq       [dstq+dsq*0], m0
9533*c0909341SAndroid Build Coastguard Worker    movhps     [dstq+dsq*1], m0
9534*c0909341SAndroid Build Coastguard Worker    lea                dstq, [dstq+dsq*2]
9535*c0909341SAndroid Build Coastguard Worker    add                tmpq, 8*2
9536*c0909341SAndroid Build Coastguard Worker    add                  hq, 2
9537*c0909341SAndroid Build Coastguard Worker    jl .w8
9538*c0909341SAndroid Build Coastguard Worker    RET
9539*c0909341SAndroid Build Coastguard Worker; w16/w32/w64/w128
; One row per outer iteration, 16 pixels per inner iteration. r6 holds w.
9540*c0909341SAndroid Build Coastguard Worker.w16:
9541*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9542*c0909341SAndroid Build Coastguard Worker    mov                 r6d, wm ; the table pointer in r6 is not used on this path
9543*c0909341SAndroid Build Coastguard Worker%endif
9544*c0909341SAndroid Build Coastguard Worker    sub                 dsq, r6 ; dsq = stride - w: step from the end of a row to the next row
9545*c0909341SAndroid Build Coastguard Worker.w16_loop0:
9546*c0909341SAndroid Build Coastguard Worker    movd                 m3, [maskq+hq*2] ; only the low word (this row's weight pair) is used
9547*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q0000
9548*c0909341SAndroid Build Coastguard Worker    punpcklqdq           m3, m3 ; this row's weight pair in all 8 words
9549*c0909341SAndroid Build Coastguard Worker    mov                  wd, r6d ; remaining width of the current row
9550*c0909341SAndroid Build Coastguard Worker.w16_loop:
9551*c0909341SAndroid Build Coastguard Worker    mova                 m1, [dstq] ; a
9552*c0909341SAndroid Build Coastguard Worker    mova                 m2, [tmpq] ; b
9553*c0909341SAndroid Build Coastguard Worker    BLEND_64M            m1, m2, m3, m3
9554*c0909341SAndroid Build Coastguard Worker    mova             [dstq], m0
9555*c0909341SAndroid Build Coastguard Worker    add                dstq, 16
9556*c0909341SAndroid Build Coastguard Worker    add                tmpq, 16
9557*c0909341SAndroid Build Coastguard Worker    sub                  wd, 16
9558*c0909341SAndroid Build Coastguard Worker    jg .w16_loop
9559*c0909341SAndroid Build Coastguard Worker    add                dstq, dsq ; dstq already advanced by w; add the remainder of the stride
9560*c0909341SAndroid Build Coastguard Worker    inc                  hq
9561*c0909341SAndroid Build Coastguard Worker    jl .w16_loop0
9562*c0909341SAndroid Build Coastguard Worker    RET
9563*c0909341SAndroid Build Coastguard Worker
9564*c0909341SAndroid Build Coastguard Worker; emu_edge args:
9565*c0909341SAndroid Build Coastguard Worker; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
9566*c0909341SAndroid Build Coastguard Worker; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
9567*c0909341SAndroid Build Coastguard Worker; const pixel *ref, const ptrdiff_t ref_stride
9568*c0909341SAndroid Build Coastguard Worker;
9569*c0909341SAndroid Build Coastguard Worker; bw, bh total filled size
9570*c0909341SAndroid Build Coastguard Worker; iw, ih, copied block -> fill bottom, right
9571*c0909341SAndroid Build Coastguard Worker; x, y, offset in bw/bh -> fill top, left
9572*c0909341SAndroid Build Coastguard Workercglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \
9573*c0909341SAndroid Build Coastguard Worker                                  y, dst, dstride, src, sstride, \
9574*c0909341SAndroid Build Coastguard Worker                                  bottomext, rightext, blk
9575*c0909341SAndroid Build Coastguard Worker    ; we assume that the buffer (stride) is larger than width, so we can
9576*c0909341SAndroid Build Coastguard Worker    ; safely overwrite by a few bytes
9577*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
9578*c0909341SAndroid Build Coastguard Worker
9579*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9580*c0909341SAndroid Build Coastguard Worker %define reg_zero       r12q
9581*c0909341SAndroid Build Coastguard Worker %define reg_tmp        r10
9582*c0909341SAndroid Build Coastguard Worker %define reg_src        srcq
9583*c0909341SAndroid Build Coastguard Worker %define reg_bottomext  bottomextq
9584*c0909341SAndroid Build Coastguard Worker %define reg_rightext   rightextq
9585*c0909341SAndroid Build Coastguard Worker %define reg_blkm       r9m
9586*c0909341SAndroid Build Coastguard Worker%else
9587*c0909341SAndroid Build Coastguard Worker %define reg_zero       r6
9588*c0909341SAndroid Build Coastguard Worker %define reg_tmp        r0
9589*c0909341SAndroid Build Coastguard Worker %define reg_src        r1
9590*c0909341SAndroid Build Coastguard Worker %define reg_bottomext  r0
9591*c0909341SAndroid Build Coastguard Worker %define reg_rightext   r1
9592*c0909341SAndroid Build Coastguard Worker %define reg_blkm       r2m
9593*c0909341SAndroid Build Coastguard Worker%endif
9594*c0909341SAndroid Build Coastguard Worker    ;
9595*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
9596*c0909341SAndroid Build Coastguard Worker    xor            reg_zero, reg_zero
9597*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [ihq-1]
9598*c0909341SAndroid Build Coastguard Worker    cmp                  yq, ihq
9599*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, yq
9600*c0909341SAndroid Build Coastguard Worker    test                 yq, yq
9601*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, reg_zero
9602*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9603*c0909341SAndroid Build Coastguard Worker    imul            reg_tmp, sstrideq
9604*c0909341SAndroid Build Coastguard Worker    add                srcq, reg_tmp
9605*c0909341SAndroid Build Coastguard Worker%else
9606*c0909341SAndroid Build Coastguard Worker    imul            reg_tmp, sstridem
9607*c0909341SAndroid Build Coastguard Worker    mov             reg_src, srcm
9608*c0909341SAndroid Build Coastguard Worker    add             reg_src, reg_tmp
9609*c0909341SAndroid Build Coastguard Worker%endif
9610*c0909341SAndroid Build Coastguard Worker    ;
9611*c0909341SAndroid Build Coastguard Worker    ; ref += iclip(x, 0, iw - 1)
9612*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [iwq-1]
9613*c0909341SAndroid Build Coastguard Worker    cmp                  xq, iwq
9614*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, xq
9615*c0909341SAndroid Build Coastguard Worker    test                 xq, xq
9616*c0909341SAndroid Build Coastguard Worker    cmovs           reg_tmp, reg_zero
9617*c0909341SAndroid Build Coastguard Worker    add             reg_src, reg_tmp
9618*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9619*c0909341SAndroid Build Coastguard Worker    mov                srcm, reg_src
9620*c0909341SAndroid Build Coastguard Worker%endif
9621*c0909341SAndroid Build Coastguard Worker    ;
9622*c0909341SAndroid Build Coastguard Worker    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
9623*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9624*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m ; restore bh
9625*c0909341SAndroid Build Coastguard Worker%endif
9626*c0909341SAndroid Build Coastguard Worker    lea       reg_bottomext, [yq+bhq]
9627*c0909341SAndroid Build Coastguard Worker    sub       reg_bottomext, ihq
9628*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bhq-1]
9629*c0909341SAndroid Build Coastguard Worker    cmovs     reg_bottomext, reg_zero
9630*c0909341SAndroid Build Coastguard Worker    ;
9631*c0909341SAndroid Build Coastguard Worker
9632*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, x, \
9633*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9634*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9635*c0909341SAndroid Build Coastguard Worker
9636*c0909341SAndroid Build Coastguard Worker    ; top_ext = iclip(-y, 0, bh - 1)
9637*c0909341SAndroid Build Coastguard Worker    neg             topextq
9638*c0909341SAndroid Build Coastguard Worker    cmovs           topextq, reg_zero
9639*c0909341SAndroid Build Coastguard Worker    cmp       reg_bottomext, bhq
9640*c0909341SAndroid Build Coastguard Worker    cmovns    reg_bottomext, r3
9641*c0909341SAndroid Build Coastguard Worker    cmp             topextq, bhq
9642*c0909341SAndroid Build Coastguard Worker    cmovg           topextq, r3
9643*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
9644*c0909341SAndroid Build Coastguard Worker    mov                 r4m, reg_bottomext
9645*c0909341SAndroid Build Coastguard Worker    ;
9646*c0909341SAndroid Build Coastguard Worker    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
9647*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; restore bw
9648*c0909341SAndroid Build Coastguard Worker %endif
9649*c0909341SAndroid Build Coastguard Worker    lea        reg_rightext, [xq+bwq]
9650*c0909341SAndroid Build Coastguard Worker    sub        reg_rightext, iwq
9651*c0909341SAndroid Build Coastguard Worker    lea                  r2, [bwq-1]
9652*c0909341SAndroid Build Coastguard Worker    cmovs      reg_rightext, reg_zero
9653*c0909341SAndroid Build Coastguard Worker
9654*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, bh, iw, ih, leftext, \
9655*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9656*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9657*c0909341SAndroid Build Coastguard Worker
9658*c0909341SAndroid Build Coastguard Worker    ; left_ext = iclip(-x, 0, bw - 1)
9659*c0909341SAndroid Build Coastguard Worker    neg            leftextq
9660*c0909341SAndroid Build Coastguard Worker    cmovs          leftextq, reg_zero
9661*c0909341SAndroid Build Coastguard Worker    cmp        reg_rightext, bwq
9662*c0909341SAndroid Build Coastguard Worker    cmovns     reg_rightext, r2
9663*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_32
9664*c0909341SAndroid Build Coastguard Worker    mov                 r3m, r1
9665*c0909341SAndroid Build Coastguard Worker %endif
9666*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, bwq
9667*c0909341SAndroid Build Coastguard Worker    cmovns         leftextq, r2
9668*c0909341SAndroid Build Coastguard Worker
9669*c0909341SAndroid Build Coastguard Worker%undef reg_zero
9670*c0909341SAndroid Build Coastguard Worker%undef reg_tmp
9671*c0909341SAndroid Build Coastguard Worker%undef reg_src
9672*c0909341SAndroid Build Coastguard Worker%undef reg_bottomext
9673*c0909341SAndroid Build Coastguard Worker%undef reg_rightext
9674*c0909341SAndroid Build Coastguard Worker
9675*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
9676*c0909341SAndroid Build Coastguard Worker                topext, dst, dstride, src, sstride, \
9677*c0909341SAndroid Build Coastguard Worker                bottomext, rightext, blk
9678*c0909341SAndroid Build Coastguard Worker
9679*c0909341SAndroid Build Coastguard Worker    ; center_h = bh - top_ext - bottom_ext
9680*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9681*c0909341SAndroid Build Coastguard Worker    lea                  r3, [bottomextq+topextq]
9682*c0909341SAndroid Build Coastguard Worker    sub            centerhq, r3
9683*c0909341SAndroid Build Coastguard Worker%else
9684*c0909341SAndroid Build Coastguard Worker    mov                   r1, centerhm ; restore r1
9685*c0909341SAndroid Build Coastguard Worker    sub             centerhq, topextq
9686*c0909341SAndroid Build Coastguard Worker    sub             centerhq, r4m
9687*c0909341SAndroid Build Coastguard Worker    mov                  r1m, centerhq
9688*c0909341SAndroid Build Coastguard Worker%endif
9689*c0909341SAndroid Build Coastguard Worker    ;
9690*c0909341SAndroid Build Coastguard Worker    ; blk += top_ext * PXSTRIDE(dst_stride)
9691*c0909341SAndroid Build Coastguard Worker    mov                  r2, topextq
9692*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9693*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstrideq
9694*c0909341SAndroid Build Coastguard Worker%else
9695*c0909341SAndroid Build Coastguard Worker    mov                  r6, r6m ; restore dstq
9696*c0909341SAndroid Build Coastguard Worker    imul                 r2, dstridem
9697*c0909341SAndroid Build Coastguard Worker%endif
9698*c0909341SAndroid Build Coastguard Worker    add                dstq, r2
9699*c0909341SAndroid Build Coastguard Worker    mov            reg_blkm, dstq ; save pointer for ext
9700*c0909341SAndroid Build Coastguard Worker    ;
9701*c0909341SAndroid Build Coastguard Worker    ; center_w = bw - left_ext - right_ext
9702*c0909341SAndroid Build Coastguard Worker    mov            centerwq, bwq
9703*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9704*c0909341SAndroid Build Coastguard Worker    lea                  r3, [rightextq+leftextq]
9705*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3
9706*c0909341SAndroid Build Coastguard Worker%else
9707*c0909341SAndroid Build Coastguard Worker    sub            centerwq, r3m
9708*c0909341SAndroid Build Coastguard Worker    sub            centerwq, leftextq
9709*c0909341SAndroid Build Coastguard Worker%endif
9710*c0909341SAndroid Build Coastguard Worker
9711*c0909341SAndroid Build Coastguard Worker; v_loop macro: emit the row loop that fills the vertically "centered" rows
; For each of the centerh rows it optionally replicates the first source pixel
; over the left extension, copies centerw bytes from src, and optionally
; replicates the last copied source pixel over the right extension.
;   %1 need_left_ext  - nonzero: emit the left-extension fill
;   %2 need_right_ext - nonzero: emit the right-extension fill
;   %3 suffix         - appended to the local labels so that several
;                       instantiations can coexist in one function
; Expects m1 to be all zero (cleared by the pxor at function entry): pshufb
; with an all-zero control vector copies byte 0 into every byte.
; Every loop here runs at least once and stores mmsize bytes per iteration,
; so stores can run past the exact extent; this relies on the slack described
; in the comment at the top of the function.
; Clobbers r3, m0 and reg_tmp (r12 on x86-64, r0 on x86-32); x86-32 also uses
; r1 as scratch and keeps src and the row counter in their stack slots.
9712*c0909341SAndroid Build Coastguard Worker%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
9713*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9714*c0909341SAndroid Build Coastguard Worker    %define reg_tmp        r12
9715*c0909341SAndroid Build Coastguard Worker  %else
9716*c0909341SAndroid Build Coastguard Worker    %define reg_tmp        r0
9717*c0909341SAndroid Build Coastguard Worker  %endif
9718*c0909341SAndroid Build Coastguard Worker.v_loop_%3:
9719*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_32
    ; r0 and r1 are reused as scratch further down; reload them each row
9720*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m
9721*c0909341SAndroid Build Coastguard Worker    mov                  r1, r1m
9722*c0909341SAndroid Build Coastguard Worker  %endif
9723*c0909341SAndroid Build Coastguard Worker%if %1
9724*c0909341SAndroid Build Coastguard Worker    ; left extension
9725*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9726*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq]
9727*c0909341SAndroid Build Coastguard Worker  %else
9728*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9729*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r3]
9730*c0909341SAndroid Build Coastguard Worker  %endif
9731*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; broadcast the first source pixel of this row
9732*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3 ; r3 = byte offset within the left extension
9733*c0909341SAndroid Build Coastguard Worker.left_loop_%3:
9734*c0909341SAndroid Build Coastguard Worker    mova          [dstq+r3], m0 ; aligned store starting at the row start
9735*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize
9736*c0909341SAndroid Build Coastguard Worker    cmp                  r3, leftextq
9737*c0909341SAndroid Build Coastguard Worker    jl .left_loop_%3
9738*c0909341SAndroid Build Coastguard Worker    ; body
9739*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [dstq+leftextq] ; the copied pixels go right after the left extension
9740*c0909341SAndroid Build Coastguard Worker%endif
9741*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3 ; r3 = byte offset within the copied span
9742*c0909341SAndroid Build Coastguard Worker.body_loop_%3:
9743*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9744*c0909341SAndroid Build Coastguard Worker    movu                 m0, [srcq+r3]
9745*c0909341SAndroid Build Coastguard Worker  %else
9746*c0909341SAndroid Build Coastguard Worker    mov                  r1, srcm ; src lives in memory on x86-32
9747*c0909341SAndroid Build Coastguard Worker    movu                 m0, [r1+r3]
9748*c0909341SAndroid Build Coastguard Worker  %endif
9749*c0909341SAndroid Build Coastguard Worker%if %1
9750*c0909341SAndroid Build Coastguard Worker    movu       [reg_tmp+r3], m0
9751*c0909341SAndroid Build Coastguard Worker%else
9752*c0909341SAndroid Build Coastguard Worker    movu          [dstq+r3], m0
9753*c0909341SAndroid Build Coastguard Worker%endif
9754*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize
9755*c0909341SAndroid Build Coastguard Worker    cmp                  r3, centerwq
9756*c0909341SAndroid Build Coastguard Worker    jl .body_loop_%3
9757*c0909341SAndroid Build Coastguard Worker%if %2
9758*c0909341SAndroid Build Coastguard Worker    ; right extension
    ; reg_tmp = first destination byte after the copied span
9759*c0909341SAndroid Build Coastguard Worker%if %1
9760*c0909341SAndroid Build Coastguard Worker    add             reg_tmp, centerwq
9761*c0909341SAndroid Build Coastguard Worker%else
9762*c0909341SAndroid Build Coastguard Worker    lea             reg_tmp, [dstq+centerwq]
9763*c0909341SAndroid Build Coastguard Worker%endif
9764*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9765*c0909341SAndroid Build Coastguard Worker    movd                 m0, [srcq+centerwq-1]
9766*c0909341SAndroid Build Coastguard Worker  %else
9767*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9768*c0909341SAndroid Build Coastguard Worker    movd                 m0, [r3+centerwq-1]
9769*c0909341SAndroid Build Coastguard Worker  %endif
9770*c0909341SAndroid Build Coastguard Worker    pshufb               m0, m1 ; broadcast the last copied source pixel
9771*c0909341SAndroid Build Coastguard Worker    xor                  r3, r3 ; r3 = byte offset within the right extension
9772*c0909341SAndroid Build Coastguard Worker.right_loop_%3:
9773*c0909341SAndroid Build Coastguard Worker    movu       [reg_tmp+r3], m0
9774*c0909341SAndroid Build Coastguard Worker    add                  r3, mmsize
9775*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9776*c0909341SAndroid Build Coastguard Worker    cmp                  r3, rightextq
9777*c0909341SAndroid Build Coastguard Worker  %else
9778*c0909341SAndroid Build Coastguard Worker    cmp                  r3, r3m ; right_ext is kept in the r3m stack slot on x86-32
9779*c0909341SAndroid Build Coastguard Worker  %endif
9780*c0909341SAndroid Build Coastguard Worker    jl .right_loop_%3
9781*c0909341SAndroid Build Coastguard Worker%endif
    ; advance to the next row and loop while rows remain
9782*c0909341SAndroid Build Coastguard Worker  %if ARCH_X86_64
9783*c0909341SAndroid Build Coastguard Worker    add                dstq, dstrideq
9784*c0909341SAndroid Build Coastguard Worker    add                srcq, sstrideq
9785*c0909341SAndroid Build Coastguard Worker    dec            centerhq
9786*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
9787*c0909341SAndroid Build Coastguard Worker  %else
9788*c0909341SAndroid Build Coastguard Worker    add                dstq, dstridem
9789*c0909341SAndroid Build Coastguard Worker    mov                  r0, sstridem
9790*c0909341SAndroid Build Coastguard Worker    add                srcm, r0 ; src pointer is updated in memory
9791*c0909341SAndroid Build Coastguard Worker    sub       dword centerhm, 1 ; row counter is kept in memory as well
9792*c0909341SAndroid Build Coastguard Worker    jg .v_loop_%3
9793*c0909341SAndroid Build Coastguard Worker    mov                  r0, r0m ; restore r0
9794*c0909341SAndroid Build Coastguard Worker  %endif
9795*c0909341SAndroid Build Coastguard Worker%endmacro ; vloop MACRO
9796*c0909341SAndroid Build Coastguard Worker
9797*c0909341SAndroid Build Coastguard Worker    test           leftextq, leftextq
9798*c0909341SAndroid Build Coastguard Worker    jnz .need_left_ext
9799*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9800*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
9801*c0909341SAndroid Build Coastguard Worker    jnz .need_right_ext
9802*c0909341SAndroid Build Coastguard Worker %else
9803*c0909341SAndroid Build Coastguard Worker    cmp            leftextq, r3m ; leftextq == 0
9804*c0909341SAndroid Build Coastguard Worker    jne .need_right_ext
9805*c0909341SAndroid Build Coastguard Worker %endif
9806*c0909341SAndroid Build Coastguard Worker    v_loop                0, 0, 0
9807*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9808*c0909341SAndroid Build Coastguard Worker
9809*c0909341SAndroid Build Coastguard Worker    ;left right extensions
9810*c0909341SAndroid Build Coastguard Worker.need_left_ext:
9811*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9812*c0909341SAndroid Build Coastguard Worker    test          rightextq, rightextq
9813*c0909341SAndroid Build Coastguard Worker %else
9814*c0909341SAndroid Build Coastguard Worker    mov                  r3, r3m
9815*c0909341SAndroid Build Coastguard Worker    test                 r3, r3
9816*c0909341SAndroid Build Coastguard Worker %endif
9817*c0909341SAndroid Build Coastguard Worker    jnz .need_left_right_ext
9818*c0909341SAndroid Build Coastguard Worker    v_loop                1, 0, 1
9819*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9820*c0909341SAndroid Build Coastguard Worker
9821*c0909341SAndroid Build Coastguard Worker.need_left_right_ext:
9822*c0909341SAndroid Build Coastguard Worker    v_loop                1, 1, 2
9823*c0909341SAndroid Build Coastguard Worker    jmp .body_done
9824*c0909341SAndroid Build Coastguard Worker
9825*c0909341SAndroid Build Coastguard Worker.need_right_ext:
9826*c0909341SAndroid Build Coastguard Worker    v_loop                0, 1, 3
9827*c0909341SAndroid Build Coastguard Worker
9828*c0909341SAndroid Build Coastguard Worker.body_done:
9829*c0909341SAndroid Build Coastguard Worker; r0 ; bw
9830*c0909341SAndroid Build Coastguard Worker; r1 ;; x loop
9831*c0909341SAndroid Build Coastguard Worker; r4 ;; y loop
9832*c0909341SAndroid Build Coastguard Worker; r5 ; topextq
9833*c0909341SAndroid Build Coastguard Worker; r6 ;dstq
9834*c0909341SAndroid Build Coastguard Worker; r7 ;dstrideq
9835*c0909341SAndroid Build Coastguard Worker; r8 ; srcq
9836*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9837*c0909341SAndroid Build Coastguard Worker %define reg_dstride    dstrideq
9838*c0909341SAndroid Build Coastguard Worker%else
9839*c0909341SAndroid Build Coastguard Worker %define reg_dstride    r2
9840*c0909341SAndroid Build Coastguard Worker%endif
9841*c0909341SAndroid Build Coastguard Worker    ;
9842*c0909341SAndroid Build Coastguard Worker    ; bottom edge extension
9843*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9844*c0909341SAndroid Build Coastguard Worker    test         bottomextq, bottomextq
9845*c0909341SAndroid Build Coastguard Worker    jz .top
9846*c0909341SAndroid Build Coastguard Worker %else
9847*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
9848*c0909341SAndroid Build Coastguard Worker    cmp                  r1, r4m
9849*c0909341SAndroid Build Coastguard Worker    je .top
9850*c0909341SAndroid Build Coastguard Worker %endif
9851*c0909341SAndroid Build Coastguard Worker    ;
9852*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9853*c0909341SAndroid Build Coastguard Worker    mov                srcq, dstq
9854*c0909341SAndroid Build Coastguard Worker    sub                srcq, dstrideq
9855*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
9856*c0909341SAndroid Build Coastguard Worker %else
9857*c0909341SAndroid Build Coastguard Worker    mov                  r3, dstq
9858*c0909341SAndroid Build Coastguard Worker    mov         reg_dstride, dstridem
9859*c0909341SAndroid Build Coastguard Worker    sub                  r3, reg_dstride
9860*c0909341SAndroid Build Coastguard Worker    mov                srcm, r3
9861*c0909341SAndroid Build Coastguard Worker %endif
9862*c0909341SAndroid Build Coastguard Worker    ;
9863*c0909341SAndroid Build Coastguard Worker.bottom_x_loop:
9864*c0909341SAndroid Build Coastguard Worker %if ARCH_X86_64
9865*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1]
9866*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1]
9867*c0909341SAndroid Build Coastguard Worker    mov                  r4, bottomextq
9868*c0909341SAndroid Build Coastguard Worker %else
9869*c0909341SAndroid Build Coastguard Worker    mov                  r3, srcm
9870*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+r1]
9871*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1]
9872*c0909341SAndroid Build Coastguard Worker    mov                  r4, r4m
9873*c0909341SAndroid Build Coastguard Worker %endif
9874*c0909341SAndroid Build Coastguard Worker    ;
9875*c0909341SAndroid Build Coastguard Worker.bottom_y_loop:
9876*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
9877*c0909341SAndroid Build Coastguard Worker    add                  r3, reg_dstride
9878*c0909341SAndroid Build Coastguard Worker    dec                  r4
9879*c0909341SAndroid Build Coastguard Worker    jg .bottom_y_loop
9880*c0909341SAndroid Build Coastguard Worker    add                  r1, mmsize
9881*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
9882*c0909341SAndroid Build Coastguard Worker    jl .bottom_x_loop
9883*c0909341SAndroid Build Coastguard Worker
9884*c0909341SAndroid Build Coastguard Worker.top:
9885*c0909341SAndroid Build Coastguard Worker    ; top edge extension
9886*c0909341SAndroid Build Coastguard Worker    test            topextq, topextq
9887*c0909341SAndroid Build Coastguard Worker    jz .end
9888*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9889*c0909341SAndroid Build Coastguard Worker    mov                srcq, reg_blkm
9890*c0909341SAndroid Build Coastguard Worker%else
9891*c0909341SAndroid Build Coastguard Worker    mov                  r3, reg_blkm
9892*c0909341SAndroid Build Coastguard Worker    mov         reg_dstride, dstridem
9893*c0909341SAndroid Build Coastguard Worker%endif
9894*c0909341SAndroid Build Coastguard Worker    mov                dstq, dstm
9895*c0909341SAndroid Build Coastguard Worker    xor                  r1, r1
9896*c0909341SAndroid Build Coastguard Worker    ;
9897*c0909341SAndroid Build Coastguard Worker.top_x_loop:
9898*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9899*c0909341SAndroid Build Coastguard Worker    mova                 m0, [srcq+r1]
9900*c0909341SAndroid Build Coastguard Worker%else
9901*c0909341SAndroid Build Coastguard Worker    mov                  r3, reg_blkm
9902*c0909341SAndroid Build Coastguard Worker    mova                 m0, [r3+r1]
9903*c0909341SAndroid Build Coastguard Worker%endif
9904*c0909341SAndroid Build Coastguard Worker    lea                  r3, [dstq+r1]
9905*c0909341SAndroid Build Coastguard Worker    mov                  r4, topextq
9906*c0909341SAndroid Build Coastguard Worker    ;
9907*c0909341SAndroid Build Coastguard Worker.top_y_loop:
9908*c0909341SAndroid Build Coastguard Worker    mova               [r3], m0
9909*c0909341SAndroid Build Coastguard Worker    add                  r3, reg_dstride
9910*c0909341SAndroid Build Coastguard Worker    dec                  r4
9911*c0909341SAndroid Build Coastguard Worker    jg .top_y_loop
9912*c0909341SAndroid Build Coastguard Worker    add                  r1, mmsize
9913*c0909341SAndroid Build Coastguard Worker    cmp                  r1, bwq
9914*c0909341SAndroid Build Coastguard Worker    jl .top_x_loop
9915*c0909341SAndroid Build Coastguard Worker
9916*c0909341SAndroid Build Coastguard Worker.end:
9917*c0909341SAndroid Build Coastguard Worker    RET
9918*c0909341SAndroid Build Coastguard Worker
; Remove the register/memory aliases of the preceding function so they cannot
; leak into the code below. reg_dstride and reg_blkm are used by the code just
; above; the use of reg_tmp lies outside the part of the file visible here.
9919*c0909341SAndroid Build Coastguard Worker%undef reg_dstride
9920*c0909341SAndroid Build Coastguard Worker%undef reg_blkm
9921*c0909341SAndroid Build Coastguard Worker%undef reg_tmp
9922*c0909341SAndroid Build Coastguard Worker
; resize_filter is not defined in this part of the file; it is declared as an
; external symbol here. resize_8bpc below reads it as a table of 8-byte rows
; ([base+resize_filter+idx*8]).
9923*c0909341SAndroid Build Coastguard Workercextern resize_filter
9924*c0909341SAndroid Build Coastguard Worker
; SCRATCH src, dst, slot
;   Makes the value currently held in m<src> reachable under the name m<dst>.
;     %1 = number of the register holding the value
;     %2 = register number the value should be known as afterwards
;     %3 = index of the mmsize-sized stack slot used on x86-32
;   x86-32: stores m<src> to [rsp+slot*mmsize] and %defines m<dst> as that
;           memory operand. The %define stays in effect after the macro, and
;           the caller must have reserved the stack area (resize_8bpc below
;           requests 3*16 bytes for slots 0..2).
;   else:   hands the register over with x86inc's SWAP instead of spilling.
9925*c0909341SAndroid Build Coastguard Worker%macro SCRATCH 3
9926*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_32
9927*c0909341SAndroid Build Coastguard Worker    mova [rsp+%3*mmsize], m%1
9928*c0909341SAndroid Build Coastguard Worker%define m%2 [rsp+%3*mmsize]
9929*c0909341SAndroid Build Coastguard Worker%else
9930*c0909341SAndroid Build Coastguard Worker    SWAP             %1, %2
9931*c0909341SAndroid Build Coastguard Worker%endif
9932*c0909341SAndroid Build Coastguard Worker%endmacro
9933*c0909341SAndroid Build Coastguard Worker
;-----------------------------------------------------------------------
; resize_8bpc(dst, dst_stride, src, src_stride, dst_w, h, src_w, dx, mx0)
;
; Horizontally resamples h rows of 8-bit pixels from src into dst,
; producing 4 output pixels per .loop_x iteration.
;
; Positions are fixed point with 14 fractional bits. Output pixel x of
; every row uses pos = mx0 - (4<<14) + x*dx:
;   - 8 source bytes are read at src + (clamp(pos, 0, (src_w-8)<<14) >> 14),
;     so the reads stay inside bytes [0, src_w) of the row
;   - if pos had to be clamped, the 8 bytes are rearranged with a pshufb
;     pattern from resize_shuf, selected by the clamp distance in pixels
;   - the bytes are multiplied with the 8-byte row
;     resize_filter[(pos >> 8) & 63], summed, rounded with pmulhrsw and
;     saturated to 8 bits
;
; cglobal operands (x86inc): no arguments are auto-loaded; 12 GPRs and
; 14 XMM registers on x86-64; 7 or 6 GPRs, 8 XMM registers and 3*16 bytes
; of stack (the three SCRATCH slots) on x86-32.
;
; mx0 and src_w are modified in place in their argument slots.
; A row is written in groups of 4 pixels: when dst_w is not a multiple of
; 4 the last 4-byte store of the row extends past dst_w.
; NOTE(review): presumably callers pad dst accordingly - confirm.
;-----------------------------------------------------------------------
9934*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9935*c0909341SAndroid Build Coastguard Workercglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \
9936*c0909341SAndroid Build Coastguard Worker                                dst_w, h, src_w, dx, mx0
9937*c0909341SAndroid Build Coastguard Worker%elif STACK_ALIGNMENT >= 16
9938*c0909341SAndroid Build Coastguard Workercglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
9939*c0909341SAndroid Build Coastguard Worker                                      dst_w, h, src_w, dx, mx0
9940*c0909341SAndroid Build Coastguard Worker%else
9941*c0909341SAndroid Build Coastguard Workercglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
9942*c0909341SAndroid Build Coastguard Worker                                      dst_w, h, src_w, dx, mx0
9943*c0909341SAndroid Build Coastguard Worker%endif
    ; Load only the arguments that are kept in registers for the whole
    ; function (x86inc's movifnidn emits a mov only when the operands differ).
9944*c0909341SAndroid Build Coastguard Worker    movifnidn          dstq, dstmp
9945*c0909341SAndroid Build Coastguard Worker    movifnidn          srcq, srcmp
    ; dst_w gets a register only when STACK_ALIGNMENT >= 16; otherwise the
    ; loop condition at the end of .loop_x compares against dst_wm.
9946*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
9947*c0909341SAndroid Build Coastguard Worker    movifnidn        dst_wd, dst_wm
9948*c0909341SAndroid Build Coastguard Worker%endif
    ; h is a register on x86-64 only; on x86-32 hd is redefined below as the
    ; memory operand r5m.
9949*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9950*c0909341SAndroid Build Coastguard Worker    movifnidn            hd, hm
9951*c0909341SAndroid Build Coastguard Worker%endif
    ; Bias the start position by -4 pixels (14 fractional bits) and reduce
    ; src_w by 8, the number of bytes fetched per output pixel; both are
    ; updated directly in their argument slots.
9952*c0909341SAndroid Build Coastguard Worker    sub          dword mx0m, 4<<14
9953*c0909341SAndroid Build Coastguard Worker    sub        dword src_wm, 8
    ; Broadcast dx, the biased mx0 and src_w-8 to all four dword lanes.
9954*c0909341SAndroid Build Coastguard Worker    movd                 m7, dxm
9955*c0909341SAndroid Build Coastguard Worker    movd                 m6, mx0m
9956*c0909341SAndroid Build Coastguard Worker    movd                 m5, src_wm
9957*c0909341SAndroid Build Coastguard Worker    pshufd               m7, m7, q0000
9958*c0909341SAndroid Build Coastguard Worker    pshufd               m6, m6, q0000
9959*c0909341SAndroid Build Coastguard Worker    pshufd               m5, m5, q0000
9960*c0909341SAndroid Build Coastguard Worker
    ; Rename the GPRs for the main loop (x is the column counter) and set up
    ; `base`, a register holding the address of $$ (section start) through
    ; which all constant tables are addressed.
9961*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9962*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
9963*c0909341SAndroid Build Coastguard Worker    LEA                  r7, $$
9964*c0909341SAndroid Build Coastguard Worker%define base r7-$$
9965*c0909341SAndroid Build Coastguard Worker%else
9966*c0909341SAndroid Build Coastguard Worker    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
    ; x takes the sixth register name here, so h is accessed in memory.
9967*c0909341SAndroid Build Coastguard Worker%define hd dword r5m
9968*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
9969*c0909341SAndroid Build Coastguard Worker    LEA                  r6, $$
9970*c0909341SAndroid Build Coastguard Worker%define base r6-$$
9971*c0909341SAndroid Build Coastguard Worker%else
    ; r4 carries the dst_w name, but dst_w is not loaded into a register in
    ; this configuration (see above), so r4 is free to hold the base pointer.
9972*c0909341SAndroid Build Coastguard Worker    LEA                  r4, $$
9973*c0909341SAndroid Build Coastguard Worker%define base r4-$$
9974*c0909341SAndroid Build Coastguard Worker%endif
9975*c0909341SAndroid Build Coastguard Worker%endif
9976*c0909341SAndroid Build Coastguard Worker
    ; Loop constants: held in m8-m10 on x86-64; on x86-32 the names m8-m10
    ; are defined as memory operands instead.
9977*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
9978*c0909341SAndroid Build Coastguard Worker    mova                m10, [base+pw_m256]
9979*c0909341SAndroid Build Coastguard Worker    mova                 m9, [base+pd_63]
9980*c0909341SAndroid Build Coastguard Worker    mova                 m8, [base+pb_8x0_8x8]
9981*c0909341SAndroid Build Coastguard Worker%else
9982*c0909341SAndroid Build Coastguard Worker%define m10 [base+pw_m256]
9983*c0909341SAndroid Build Coastguard Worker%define m9  [base+pd_63]
9984*c0909341SAndroid Build Coastguard Worker%define m8  [base+pb_8x0_8x8]
9985*c0909341SAndroid Build Coastguard Worker%endif
9986*c0909341SAndroid Build Coastguard Worker    pmaddwd              m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
9987*c0909341SAndroid Build Coastguard Worker    pslld                m7, 2                      ; dx*4
9988*c0909341SAndroid Build Coastguard Worker    pslld                m5, 14                     ; (src_w-8)<<14, same scale as mx
9989*c0909341SAndroid Build Coastguard Worker    paddd                m6, m4                     ; mx+[0..3]*dx
    ; Park the three loop invariants under the names m13/m12/m11 (stack
    ; slots 0..2 on x86-32, register renames otherwise), which frees m5-m7
    ; for use as temporaries inside the loop.
9990*c0909341SAndroid Build Coastguard Worker    SCRATCH               7, 13, 0
9991*c0909341SAndroid Build Coastguard Worker    SCRATCH               6, 12, 1
9992*c0909341SAndroid Build Coastguard Worker    SCRATCH               5, 11, 2
9993*c0909341SAndroid Build Coastguard Worker
9994*c0909341SAndroid Build Coastguard Worker    ; m10 = pmulhrsw constant for x=(x+64)>>7
9995*c0909341SAndroid Build Coastguard Worker    ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = (src_w-8)<<14, m9 = 0x3f, m8=0,8
9996*c0909341SAndroid Build Coastguard Worker
    ; One iteration per output row: restart at column 0 with the row-start
    ; positions.
9997*c0909341SAndroid Build Coastguard Worker.loop_y:
9998*c0909341SAndroid Build Coastguard Worker    xor                  xd, xd
9999*c0909341SAndroid Build Coastguard Worker    mova                 m0, m12                    ; per-line working version of mx
10000*c0909341SAndroid Build Coastguard Worker
    ; One iteration per 4 output pixels; m0 holds their 4 positions.
10001*c0909341SAndroid Build Coastguard Worker.loop_x:
    ; m1 = max(m0, 0): the mask of negative lanes clears those lanes.
10002*c0909341SAndroid Build Coastguard Worker    pxor                 m1, m1
10003*c0909341SAndroid Build Coastguard Worker    pcmpgtd              m1, m0
10004*c0909341SAndroid Build Coastguard Worker    pandn                m1, m0
10005*c0909341SAndroid Build Coastguard Worker    psrad                m2, m0, 8                  ; filter offset (unmasked)
    ; m1 = min(m1, m11): lanes where m11 > m1 keep m1, the others take m11.
10006*c0909341SAndroid Build Coastguard Worker    pcmpgtd              m3, m11, m1
10007*c0909341SAndroid Build Coastguard Worker    pand                 m1, m3
10008*c0909341SAndroid Build Coastguard Worker    pandn                m3, m11
10009*c0909341SAndroid Build Coastguard Worker    por                  m1, m3
    ; m3 = unclamped - clamped position: zero for in-range lanes, negative
    ; left of the valid range, positive right of it.
10010*c0909341SAndroid Build Coastguard Worker    psubd                m3, m0, m1                 ; pshufb offset
10011*c0909341SAndroid Build Coastguard Worker    psrad                m1, 14                     ; clipped src_x offset
10012*c0909341SAndroid Build Coastguard Worker    psrad                m3, 14                     ; pshufb edge_emu offset
10013*c0909341SAndroid Build Coastguard Worker    pand                 m2, m9                     ; filter offset (masked)
10014*c0909341SAndroid Build Coastguard Worker
    ; Gather 8 source bytes per output pixel:
    ;   m4 = [pixel 0 | pixel 1], m5 = [pixel 2 | pixel 3]
    ; m1 is non-negative after the clamp, so on x86-64 the zero-extension
    ; implied by the 32-bit movd destinations yields valid 64-bit indices.
    ; x86-32 has no spare GPRs: r3/r1 (named src_stride/dst_stride by
    ; DEFINE_ARGS) serve as scratch, and the lanes are extracted in the
    ; order 0, 2, 1, 3.
10015*c0909341SAndroid Build Coastguard Worker    ; load source pixels
10016*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
10017*c0909341SAndroid Build Coastguard Worker    movd                r8d, m1
10018*c0909341SAndroid Build Coastguard Worker    pshuflw              m1, m1, q3232
10019*c0909341SAndroid Build Coastguard Worker    movd                r9d, m1
10020*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1, m1
10021*c0909341SAndroid Build Coastguard Worker    movd               r10d, m1
10022*c0909341SAndroid Build Coastguard Worker    psrlq                m1, 32
10023*c0909341SAndroid Build Coastguard Worker    movd               r11d, m1
10024*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+r8]
10025*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+r10]
10026*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+r9]
10027*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+r11]
10028*c0909341SAndroid Build Coastguard Worker%else
10029*c0909341SAndroid Build Coastguard Worker    movd                r3d,  m1
10030*c0909341SAndroid Build Coastguard Worker    pshufd               m1,  m1, q3312
10031*c0909341SAndroid Build Coastguard Worker    movd                r1d,  m1
10032*c0909341SAndroid Build Coastguard Worker    pshuflw              m1,  m1, q3232
10033*c0909341SAndroid Build Coastguard Worker    movq                 m4, [srcq+r3]
10034*c0909341SAndroid Build Coastguard Worker    movq                 m5, [srcq+r1]
10035*c0909341SAndroid Build Coastguard Worker    movd                r3d,  m1
10036*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m1,  m1
10037*c0909341SAndroid Build Coastguard Worker    movd                r1d,  m1
10038*c0909341SAndroid Build Coastguard Worker    movhps               m4, [srcq+r3]
10039*c0909341SAndroid Build Coastguard Worker    movhps               m5, [srcq+r1]
10040*c0909341SAndroid Build Coastguard Worker%endif
10041*c0909341SAndroid Build Coastguard Worker
10042*c0909341SAndroid Build Coastguard Worker    ; if no emulation is required, we don't need to shuffle or emulate edges
10043*c0909341SAndroid Build Coastguard Worker    ; this also saves 2 quasi-vpgatherdqs
    ; The byte mask is 0xffff exactly when all 16 bytes of m3 are zero,
    ; i.e. none of the four positions was clamped.
10044*c0909341SAndroid Build Coastguard Worker    pxor                 m6, m6
10045*c0909341SAndroid Build Coastguard Worker    pcmpeqb              m6, m3
10046*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
10047*c0909341SAndroid Build Coastguard Worker    pmovmskb            r8d, m6
10048*c0909341SAndroid Build Coastguard Worker    cmp                 r8d, 0xffff
10049*c0909341SAndroid Build Coastguard Worker%else
10050*c0909341SAndroid Build Coastguard Worker    pmovmskb            r3d, m6
10051*c0909341SAndroid Build Coastguard Worker    cmp                 r3d, 0xffff
10052*c0909341SAndroid Build Coastguard Worker%endif
10053*c0909341SAndroid Build Coastguard Worker    je .filter
10054*c0909341SAndroid Build Coastguard Worker
    ; Edge emulation: fetch one 8-byte pshufb pattern per pixel from
    ; resize_shuf+4, indexed by the signed clamp distance. The distance can
    ; be negative, hence the explicit sign extension on x86-64.
    ;   m6 = patterns for pixels 0/1, m7 = patterns for pixels 2/3
10055*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
10056*c0909341SAndroid Build Coastguard Worker    movd                r8d, m3
10057*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q3232
10058*c0909341SAndroid Build Coastguard Worker    movd                r9d, m3
10059*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
10060*c0909341SAndroid Build Coastguard Worker    movd               r10d, m3
10061*c0909341SAndroid Build Coastguard Worker    psrlq                m3, 32
10062*c0909341SAndroid Build Coastguard Worker    movd               r11d, m3
10063*c0909341SAndroid Build Coastguard Worker    movsxd               r8, r8d
10064*c0909341SAndroid Build Coastguard Worker    movsxd               r9, r9d
10065*c0909341SAndroid Build Coastguard Worker    movsxd              r10, r10d
10066*c0909341SAndroid Build Coastguard Worker    movsxd              r11, r11d
10067*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+resize_shuf+4+r8]
10068*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base+resize_shuf+4+r10]
10069*c0909341SAndroid Build Coastguard Worker    movhps               m6, [base+resize_shuf+4+r9]
10070*c0909341SAndroid Build Coastguard Worker    movhps               m7, [base+resize_shuf+4+r11]
10071*c0909341SAndroid Build Coastguard Worker%else
10072*c0909341SAndroid Build Coastguard Worker    movd                r3d, m3
10073*c0909341SAndroid Build Coastguard Worker    pshufd               m3, m3, q3312
10074*c0909341SAndroid Build Coastguard Worker    movd                r1d, m3
10075*c0909341SAndroid Build Coastguard Worker    pshuflw              m3, m3, q3232
10076*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+resize_shuf+4+r3]
10077*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base+resize_shuf+4+r1]
10078*c0909341SAndroid Build Coastguard Worker    movd                r3d, m3
10079*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m3, m3
10080*c0909341SAndroid Build Coastguard Worker    movd                r1d, m3
10081*c0909341SAndroid Build Coastguard Worker    movhps               m6, [base+resize_shuf+4+r3]
10082*c0909341SAndroid Build Coastguard Worker    movhps               m7, [base+resize_shuf+4+r1]
10083*c0909341SAndroid Build Coastguard Worker%endif
10084*c0909341SAndroid Build Coastguard Worker
    ; Add m8 to the patterns, then rearrange the gathered bytes.
    ; NOTE(review): pb_8x0_8x8 is presumably eight 0 bytes followed by eight
    ; 8 bytes (per its name and the "m8=0,8" note), which would make the
    ; upper pattern address the upper 8 bytes of the register - confirm.
10085*c0909341SAndroid Build Coastguard Worker    paddb                m6, m8
10086*c0909341SAndroid Build Coastguard Worker    paddb                m7, m8
10087*c0909341SAndroid Build Coastguard Worker    pshufb               m4, m6
10088*c0909341SAndroid Build Coastguard Worker    pshufb               m5, m7
10089*c0909341SAndroid Build Coastguard Worker
    ; Fetch the 8-byte coefficient row of each pixel from resize_filter using
    ; the masked filter offsets in m2, with the same lane layout as the pixels:
    ;   m6 = rows for pixels 0/1, m7 = rows for pixels 2/3
10090*c0909341SAndroid Build Coastguard Worker.filter:
10091*c0909341SAndroid Build Coastguard Worker%if ARCH_X86_64
10092*c0909341SAndroid Build Coastguard Worker    movd                r8d, m2
10093*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, m2, q3232
10094*c0909341SAndroid Build Coastguard Worker    movd                r9d, m2
10095*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2
10096*c0909341SAndroid Build Coastguard Worker    movd               r10d, m2
10097*c0909341SAndroid Build Coastguard Worker    psrlq                m2, 32
10098*c0909341SAndroid Build Coastguard Worker    movd               r11d, m2
10099*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+resize_filter+r8*8]
10100*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base+resize_filter+r10*8]
10101*c0909341SAndroid Build Coastguard Worker    movhps               m6, [base+resize_filter+r9*8]
10102*c0909341SAndroid Build Coastguard Worker    movhps               m7, [base+resize_filter+r11*8]
10103*c0909341SAndroid Build Coastguard Worker%else
10104*c0909341SAndroid Build Coastguard Worker    movd                r3d, m2
10105*c0909341SAndroid Build Coastguard Worker    pshufd               m2, m2, q3312
10106*c0909341SAndroid Build Coastguard Worker    movd                r1d, m2
10107*c0909341SAndroid Build Coastguard Worker    pshuflw              m2, m2, q3232
10108*c0909341SAndroid Build Coastguard Worker    movq                 m6, [base+resize_filter+r3*8]
10109*c0909341SAndroid Build Coastguard Worker    movq                 m7, [base+resize_filter+r1*8]
10110*c0909341SAndroid Build Coastguard Worker    movd                r3d, m2
10111*c0909341SAndroid Build Coastguard Worker    punpckhqdq           m2, m2
10112*c0909341SAndroid Build Coastguard Worker    movd                r1d, m2
10113*c0909341SAndroid Build Coastguard Worker    movhps               m6, [base+resize_filter+r3*8]
10114*c0909341SAndroid Build Coastguard Worker    movhps               m7, [base+resize_filter+r1*8]
10115*c0909341SAndroid Build Coastguard Worker%endif
10116*c0909341SAndroid Build Coastguard Worker
    ; 8-tap dot product per pixel: pmaddubsw multiplies the unsigned pixel
    ; bytes with the signed coefficient bytes and adds adjacent pairs; the
    ; two horizontal adds reduce the four partial sums of each pixel to one
    ; word, leaving pixels 0..3 in the low four words of m4.
    ; NOTE(review): pw_m256 presumably holds words of -256, in which case
    ; pmulhrsw yields (64 - sum) >> 7, i.e. the sum is negated as well as
    ; rounded - confirm against the sign convention of resize_filter.
10117*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m4, m6
10118*c0909341SAndroid Build Coastguard Worker    pmaddubsw            m5, m7
10119*c0909341SAndroid Build Coastguard Worker    phaddw               m4, m5
10120*c0909341SAndroid Build Coastguard Worker    phaddsw              m4, m4
10121*c0909341SAndroid Build Coastguard Worker    pmulhrsw             m4, m10                    ; x=(x+64)>>7
    ; Saturate to unsigned bytes and store the 4 output pixels.
10122*c0909341SAndroid Build Coastguard Worker    packuswb             m4, m4
10123*c0909341SAndroid Build Coastguard Worker    movd          [dstq+xq], m4
10124*c0909341SAndroid Build Coastguard Worker
    ; Advance the four positions by 4*dx and the column counter by 4.
10125*c0909341SAndroid Build Coastguard Worker    paddd                m0, m13
10126*c0909341SAndroid Build Coastguard Worker    add                  xd, 4
10127*c0909341SAndroid Build Coastguard Worker%if STACK_ALIGNMENT >= 16
10128*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wd
10129*c0909341SAndroid Build Coastguard Worker%else
10130*c0909341SAndroid Build Coastguard Worker    cmp                  xd, dst_wm
10131*c0909341SAndroid Build Coastguard Worker%endif
10132*c0909341SAndroid Build Coastguard Worker    jl .loop_x
10133*c0909341SAndroid Build Coastguard Worker
    ; Next row. The strides are taken through their *mp operands because the
    ; x86-32 path above uses r1/r3 as scratch registers.
10134*c0909341SAndroid Build Coastguard Worker    add                dstq, dst_stridemp
10135*c0909341SAndroid Build Coastguard Worker    add                srcq, src_stridemp
10136*c0909341SAndroid Build Coastguard Worker    dec                  hd
10137*c0909341SAndroid Build Coastguard Worker    jg .loop_y
10138*c0909341SAndroid Build Coastguard Worker    RET
10139*c0909341SAndroid Build Coastguard Worker
; Expand the WARP_AFFINE_8X8 macro twice, once after INIT_XMM ssse3 and once
; after INIT_XMM sse4, so its code is emitted for both instruction-set targets.
; The macro itself is defined outside the part of the file visible here.
10140*c0909341SAndroid Build Coastguard WorkerINIT_XMM ssse3
10141*c0909341SAndroid Build Coastguard WorkerWARP_AFFINE_8X8
10142*c0909341SAndroid Build Coastguard Worker
10143*c0909341SAndroid Build Coastguard WorkerINIT_XMM sse4
10144*c0909341SAndroid Build Coastguard WorkerWARP_AFFINE_8X8
10145