/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                             128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
                              8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 filters, added before the final shift.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
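
// Reading the kMadd*/kRound34 pairs: pmaddubsw multiplies each unsigned
// source byte by its signed weight and sums adjacent products into 16-bit
// words, so a byte pair (a, b) under weights (3, 1) yields 3*a + b.  Adding
// kRound34 and shifting right by 2 then gives the rounded filter tap:
//   dst = (3 * a + b + 2) >> 2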

static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};
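
// The kScale* tables are 16-bit fixed point reciprocals (65536 / n), so a
// box sum can be divided with pmulhuw, which keeps the high 16 bits of the
// unsigned product:
//   average = (sum * (65536 / n)) >> 16, approximately sum / n.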

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // 16 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "psrlw       $0x8,%%xmm1                   \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1");
}
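
// Scalar sketch of the kernel above: psrlw $0x8 keeps the odd byte of each
// 16-bit pair, matching ScaleRowDown2_C in scale_common.cc:
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];
//   }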

void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
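
// xmm4 is 0x01 in every byte, so pmaddubsw sums each horizontal byte pair
// into a word, and pavgw against zero rounds the halving.  Scalar form:
//   dst_ptr[x] = (src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1;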

void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "pxor        %%xmm5,%%xmm5                 \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "psrlw       $0x1,%%xmm0                   \n"
      "psrlw       $0x1,%%xmm1                   \n"
      "pavgw       %%xmm5,%%xmm0                 \n"
      "pavgw       %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
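
// The pmaddubsw pair sums, paddw across the two rows, psrlw $0x1 and the
// rounding pavgw combine into a rounded 2x2 box average:
//   dst_ptr[x] = (a + b + c + d + 2) >> 2
// where a,b are adjacent top-row pixels and c,d the pair one stride below.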

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1");
}
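
// vpackuswb packs within each 128-bit lane, which interleaves the two
// halves of the result; vpermq with immediate 0xd8 (qword order 0,2,1,3)
// restores linear pixel order before the 32-byte store.  The other AVX2
// kernels in this file use the same fix-up.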

void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}

void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
      "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"
      "vpsrlw      $0x1,%%ymm1,%%ymm1            \n"
      "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpavgw      %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%ymm0,(%1)                   \n"
      "lea         0x20(%1),%1                   \n"
      "sub         $0x20,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width)              // %2
      : "r"((intptr_t)(src_stride))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "pcmpeqb     %%xmm5,%%xmm5                 \n"
      "psrld       $0x18,%%xmm5                  \n"
      "pslld       $0x10,%%xmm5                  \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pand        %%xmm5,%%xmm0                 \n"
      "pand        %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm0                 \n"
      "psrlw       $0x8,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}
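
// xmm5 masks 0x00FF0000 in each dword, i.e. byte 2 of every 4 source
// bytes; the pack/shift sequence then compacts those bytes, matching
// ScaleRowDown4_C:
//   dst_ptr[x] = src_ptr[x * 4 + 2];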

void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile(
      "pcmpeqb     %%xmm4,%%xmm4                 \n"
      "psrlw       $0xf,%%xmm4                   \n"
      "movdqa      %%xmm4,%%xmm5                 \n"
      "packuswb    %%xmm4,%%xmm4                 \n"
      "psllw       $0x3,%%xmm5                   \n"
      "lea         0x00(%4,%4,2),%3              \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "movdqu      0x00(%0,%4,1),%%xmm2          \n"
      "movdqu      0x10(%0,%4,1),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm0                 \n"
      "pmaddubsw   %%xmm4,%%xmm1                 \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%4,2),%%xmm2          \n"
      "movdqu      0x10(%0,%4,2),%%xmm3          \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,1),%%xmm2          \n"
      "movdqu      0x10(%0,%3,1),%%xmm3          \n"
      "lea         0x20(%0),%0                   \n"
      "pmaddubsw   %%xmm4,%%xmm2                 \n"
      "pmaddubsw   %%xmm4,%%xmm3                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"
      "phaddw      %%xmm1,%%xmm0                 \n"
      "paddw       %%xmm5,%%xmm0                 \n"
      "psrlw       $0x4,%%xmm0                   \n"
      "packuswb    %%xmm0,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "lea         0x8(%1),%1                    \n"
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),               // %0
        "+r"(dst_ptr),               // %1
        "+r"(dst_width),             // %2
        "=&r"(stridex3)              // %3
      : "r"((intptr_t)(src_stride))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
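
// xmm5 holds 8 in every word (1 << 3), so once phaddw folds the row sums
// the store path computes a rounded 4x4 box average:
//   dst_ptr[x] = (sum of 16 source pixels + 8) >> 4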

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "vpcmpeqb    %%ymm5,%%ymm5,%%ymm5          \n"
      "vpsrld      $0x18,%%ymm5,%%ymm5           \n"
      "vpslld      $0x10,%%ymm5,%%ymm5           \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "lea         0x40(%0),%0                   \n"
      "vpand       %%ymm5,%%ymm0,%%ymm0          \n"
      "vpand       %%ymm5,%%ymm1,%%ymm1          \n"
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
        ::"memory",
        "cc", "xmm0", "xmm1", "xmm5");
}

void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  asm volatile(
      "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
      "vpsllw      $0x3,%%ymm4,%%ymm5            \n"
      "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%ymm0                   \n"
      "vmovdqu     0x20(%0),%%ymm1               \n"
      "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,1),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm4,%%ymm1,%%ymm1          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%3,2),%%ymm2          \n"
      "vmovdqu     0x20(%0,%3,2),%%ymm3          \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vmovdqu     0x00(%0,%4,1),%%ymm2          \n"
      "vmovdqu     0x20(%0,%4,1),%%ymm3          \n"
      "lea         0x40(%0),%0                   \n"
      "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm4,%%ymm3,%%ymm3          \n"
      "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"
      "vpaddw      %%ymm3,%%ymm1,%%ymm1          \n"
      "vphaddw     %%ymm1,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vpaddw      %%ymm5,%%ymm0,%%ymm0          \n"
      "vpsrlw      $0x4,%%ymm0,%%ymm0            \n"
      "vpackuswb   %%ymm0,%%ymm0,%%ymm0          \n"
      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
      "vmovdqu     %%xmm0,(%1)                   \n"
      "lea         0x10(%1),%1                   \n"
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                   // %0
        "+r"(dst_ptr),                   // %1
        "+r"(dst_width)                  // %2
      : "r"((intptr_t)(src_stride)),     // %3
        "r"((intptr_t)(src_stride * 3))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %0,%%xmm3                     \n"
      "movdqa      %1,%%xmm4                     \n"
      "movdqa      %2,%%xmm5                     \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm2               \n"
      "lea         0x20(%0),%0                   \n"
      "movdqa      %%xmm2,%%xmm1                 \n"
      "palignr     $0x8,%%xmm0,%%xmm1            \n"
      "pshufb      %%xmm3,%%xmm0                 \n"
      "pshufb      %%xmm4,%%xmm1                 \n"
      "pshufb      %%xmm5,%%xmm2                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movq        %%xmm1,0x8(%1)                \n"
      "movq        %%xmm2,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),   // %0
                 "+r"(dst_ptr),   // %1
                 "+r"(dst_width)  // %2
                 ::"memory",
                 "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
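
// The three pshufb masks drop every 4th source byte across a 32-byte read,
// writing 24 output pixels; per group of 4 the pattern matches
// ScaleRowDown34_C:
//   dst_ptr[0] = src_ptr[0];
//   dst_ptr[1] = src_ptr[1];
//   dst_ptr[2] = src_ptr[3];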

void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}
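
// Each 8-pixel output group above is one pshufb (pair up neighboring
// bytes), one pmaddubsw against the 3/1/2/2 weights, a paddsw with
// kRound34 and a psrlw $0x2, i.e. per output pixel:
//   dst = (w0 * a + w1 * b + 2) >> 2, (w0, w1) one of (3,1), (2,2), (1,3)
// The leading pavgb blends the two source rows 1:1 first.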

void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"  // kShuf01
      "movdqa      %1,%%xmm3                     \n"  // kShuf11
      "movdqa      %2,%%xmm4                     \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
  );
  asm volatile(
      "movdqa      %0,%%xmm5                     \n"  // kMadd01
      "movdqa      %1,%%xmm0                     \n"  // kMadd11
      "movdqa      %2,%%xmm1                     \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
  );

  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm6                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm7          \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "pmaddubsw   %%xmm5,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,(%1)                   \n"
      "movdqu      0x8(%0),%%xmm6                \n"
      "movdqu      0x8(%0,%3,1),%%xmm7           \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "pmaddubsw   %%xmm0,%%xmm6                 \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x8(%1)                \n"
      "movdqu      0x10(%0),%%xmm6               \n"
      "movdqu      0x10(%0,%3,1),%%xmm7          \n"
      "lea         0x20(%0),%0                   \n"
      "pavgb       %%xmm6,%%xmm7                 \n"
      "pavgb       %%xmm7,%%xmm6                 \n"
      "pshufb      %%xmm4,%%xmm6                 \n"
      "pmaddubsw   %4,%%xmm6                     \n"
      "paddsw      %%xmm1,%%xmm6                 \n"
      "psrlw       $0x2,%%xmm6                   \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movq        %%xmm6,0x10(%1)               \n"
      "lea         0x18(%1),%1                   \n"
      "sub         $0x18,%2                      \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),                // %0
                 "+r"(dst_ptr),                // %1
                 "+r"(dst_width)               // %2
               : "r"((intptr_t)(src_stride)),  // %3
                 "m"(kMadd21)                  // %4
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}
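
// Unlike the _1_ variant, this kernel wants a 3:1 vertical blend toward
// the nearer row; the two chained pavgb ops approximate it with rounding:
//   avg(a, avg(a, b)) ~= (3 * a + b + 2) >> 2   (within one lsb)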

void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa      %3,%%xmm4                     \n"
      "movdqa      %4,%%xmm5                     \n"

      LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x10(%0),%%xmm1               \n"
      "lea         0x20(%0),%0                   \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "pshufb      %%xmm5,%%xmm1                 \n"
      "paddusb     %%xmm1,%%xmm0                 \n"
      "movq        %%xmm0,(%1)                   \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movd        %%xmm1,0x8(%1)                \n"
      "lea         0xc(%1),%1                    \n"
      "sub         $0xc,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "m"(kShuf38a),   // %3
        "m"(kShuf38b)    // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
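
// kShuf38a/kShuf38b together select source bytes 0,3,6,8,11,14 of each 16,
// so per group of 8 source pixels the pattern matches ScaleRowDown38_C:
//   dst_ptr[0] = src_ptr[0];
//   dst_ptr[1] = src_ptr[3];
//   dst_ptr[2] = src_ptr[6];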

void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "movdqa      %3,%%xmm5                     \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm1          \n"
      "lea         0x10(%0),%0                   \n"
      "pavgb       %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm0,%%xmm1                 \n"
      "pshufb      %%xmm2,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm3,%%xmm6                 \n"
      "paddusw     %%xmm6,%%xmm1                 \n"
      "pshufb      %%xmm4,%%xmm0                 \n"
      "paddusw     %%xmm0,%%xmm1                 \n"
      "pmulhuw     %%xmm5,%%xmm1                 \n"
      "packuswb    %%xmm1,%%xmm1                 \n"
      "movd        %%xmm1,(%1)                   \n"
      "psrlq       $0x10,%%xmm1                  \n"
      "movd        %%xmm1,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6");
}

void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa      %0,%%xmm2                     \n"
      "movdqa      %1,%%xmm3                     \n"
      "movdqa      %2,%%xmm4                     \n"
      "pxor        %%xmm5,%%xmm5                 \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
  );
  asm volatile(LABELALIGN
      "1:                                        \n"
      "movdqu      (%0),%%xmm0                   \n"
      "movdqu      0x00(%0,%3,1),%%xmm6          \n"
      "movhlps     %%xmm0,%%xmm1                 \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm0                 \n"
      "punpcklbw   %%xmm5,%%xmm1                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqu      0x00(%0,%3,2),%%xmm6          \n"
      "lea         0x10(%0),%0                   \n"
      "movhlps     %%xmm6,%%xmm7                 \n"
      "punpcklbw   %%xmm5,%%xmm6                 \n"
      "punpcklbw   %%xmm5,%%xmm7                 \n"
      "paddusw     %%xmm6,%%xmm0                 \n"
      "paddusw     %%xmm7,%%xmm1                 \n"
      "movdqa      %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "psrldq      $0x2,%%xmm0                   \n"
      "paddusw     %%xmm0,%%xmm6                 \n"
      "pshufb      %%xmm2,%%xmm6                 \n"
      "movdqa      %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "psrldq      $0x2,%%xmm1                   \n"
      "paddusw     %%xmm1,%%xmm7                 \n"
      "pshufb      %%xmm3,%%xmm7                 \n"
      "paddusw     %%xmm7,%%xmm6                 \n"
      "pmulhuw     %%xmm4,%%xmm6                 \n"
      "packuswb    %%xmm6,%%xmm6                 \n"
      "movd        %%xmm6,(%1)                   \n"
      "psrlq       $0x10,%%xmm6                  \n"
      "movd        %%xmm6,0x2(%1)                \n"
      "lea         0x6(%1),%1                    \n"
      "sub         $0x6,%2                       \n"
      "jg          1b                            \n"
               : "+r"(src_ptr),               // %0
                 "+r"(dst_ptr),               // %1
                 "+r"(dst_width)              // %2
               : "r"((intptr_t)(src_stride))  // %3
               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
                 "xmm6", "xmm7");
}
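
// Once the 3x3 (or 2x3 for the last column) sums sit in 16-bit words,
// pmulhuw against kScaleAc33 performs the divide by 9 (or 6) in fixed
// point: (sum * (65536 / 9)) >> 16.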

static const uvec8 kLinearShuffleFar = {2,  3,  0, 1, 6,  7,  4,  5,
                                        10, 11, 8, 9, 14, 15, 12, 13};

static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
                                    3, 1, 1, 3, 3, 1, 1, 3};
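
// With pmaddubsw, the repeating (3,1)/(1,3) weights of kLinearMadd31 form
// the 2x linear upsample taps in one step; these constants appear intended
// for the SSSE3 up-sampling kernels later in this file, each output pixel
// being (3 * near + far + 2) >> 2 after rounding.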

#ifdef HAS_SCALEROWUP2_LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
  asm volatile(
      "pxor        %%xmm0,%%xmm0                 \n"  // 0
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $1,%%xmm6                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm1                   \n"  // 01234567
      "movq        1(%0),%%xmm2                  \n"  // 12345678
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
      "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
      "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
      "movdqa      %%xmm1,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
      "movdqa      %%xmm2,%%xmm5                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
      "paddw       %%xmm5,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "paddw       %%xmm6,%%xmm4                 \n"
      "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
      "paddw       %%xmm5,%%xmm5                 \n"
      "paddw       %%xmm4,%%xmm5                 \n"  // 3*near+far+2 (lo)
      "psrlw       $2,%%xmm5                     \n"  // 3/4*near+1/4*far (lo)

      "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
      "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
      "paddw       %%xmm2,%%xmm1                 \n"
      "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
      "paddw       %%xmm6,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm3                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
      "psrlw       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)

      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 8 sample to 16 sample
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
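
// Scalar sketch of the 2x linear filter above: every source pixel expands
// to two output pixels, each a rounded 3:1 blend of the nearest source
// sample ("near") and its neighbor on that output's side ("far"):
//   dst = (3 * near + far + 2) >> 2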
835*4e366538SXin Li 
836*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_SSE2
ScaleRowUp2_Bilinear_SSE2(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)837*4e366538SXin Li void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
838*4e366538SXin Li                                ptrdiff_t src_stride,
839*4e366538SXin Li                                uint8_t* dst_ptr,
840*4e366538SXin Li                                ptrdiff_t dst_stride,
841*4e366538SXin Li                                int dst_width) {
842*4e366538SXin Li   asm volatile(
843*4e366538SXin Li       LABELALIGN
844*4e366538SXin Li       "1:                                        \n"
845*4e366538SXin Li       "pxor        %%xmm0,%%xmm0                 \n"  // 0
846*4e366538SXin Li       // above line
847*4e366538SXin Li       "movq        (%0),%%xmm1                   \n"  // 01234567
848*4e366538SXin Li       "movq        1(%0),%%xmm2                  \n"  // 12345678
849*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
850*4e366538SXin Li       "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
851*4e366538SXin Li       "punpcklbw   %%xmm1,%%xmm1                 \n"  // 0011223344556677
852*4e366538SXin Li       "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
853*4e366538SXin Li 
854*4e366538SXin Li       "movdqa      %%xmm1,%%xmm4                 \n"
855*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm4                 \n"  // 00112233 (16)
856*4e366538SXin Li       "movdqa      %%xmm2,%%xmm5                 \n"
857*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm5                 \n"  // 11223344 (16)
858*4e366538SXin Li       "paddw       %%xmm5,%%xmm4                 \n"  // near+far
859*4e366538SXin Li       "movdqa      %%xmm3,%%xmm5                 \n"
860*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm5                 \n"  // 01122334 (16)
861*4e366538SXin Li       "paddw       %%xmm5,%%xmm5                 \n"  // 2*near
862*4e366538SXin Li       "paddw       %%xmm5,%%xmm4                 \n"  // 3*near+far (1, lo)
863*4e366538SXin Li 
864*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm1                 \n"  // 44556677 (16)
865*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
866*4e366538SXin Li       "paddw       %%xmm2,%%xmm1                 \n"
867*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
868*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
869*4e366538SXin Li       "paddw       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
870*4e366538SXin Li 
871*4e366538SXin Li       // below line
872*4e366538SXin Li       "movq        (%0,%3),%%xmm6                \n"  // 01234567
873*4e366538SXin Li       "movq        1(%0,%3),%%xmm2               \n"  // 12345678
874*4e366538SXin Li       "movdqa      %%xmm6,%%xmm3                 \n"
875*4e366538SXin Li       "punpcklbw   %%xmm2,%%xmm3                 \n"  // 0112233445566778
876*4e366538SXin Li       "punpcklbw   %%xmm6,%%xmm6                 \n"  // 0011223344556677
877*4e366538SXin Li       "punpcklbw   %%xmm2,%%xmm2                 \n"  // 1122334455667788
878*4e366538SXin Li 
879*4e366538SXin Li       "movdqa      %%xmm6,%%xmm5                 \n"
880*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm5                 \n"  // 00112233 (16)
881*4e366538SXin Li       "movdqa      %%xmm2,%%xmm7                 \n"
882*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm7                 \n"  // 11223344 (16)
883*4e366538SXin Li       "paddw       %%xmm7,%%xmm5                 \n"  // near+far
884*4e366538SXin Li       "movdqa      %%xmm3,%%xmm7                 \n"
885*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm7                 \n"  // 01122334 (16)
886*4e366538SXin Li       "paddw       %%xmm7,%%xmm7                 \n"  // 2*near
887*4e366538SXin Li       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far (2, lo)
888*4e366538SXin Li 
889*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm6                 \n"  // 44556677 (16)
890*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm2                 \n"  // 55667788 (16)
891*4e366538SXin Li       "paddw       %%xmm6,%%xmm2                 \n"  // near+far
892*4e366538SXin Li       "punpckhbw   %%xmm0,%%xmm3                 \n"  // 45566778 (16)
893*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
894*4e366538SXin Li       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (2, hi)
895*4e366538SXin Li 
896*4e366538SXin Li       // above row: xmm4 (lo), xmm1 (hi)
897*4e366538SXin Li       // below row: xmm5 (lo), xmm2 (hi)
898*4e366538SXin Li       "pcmpeqw     %%xmm0,%%xmm0                 \n"
899*4e366538SXin Li       "psrlw       $15,%%xmm0                    \n"
900*4e366538SXin Li       "psllw       $3,%%xmm0                     \n"  // all 8
901*4e366538SXin Li 
902*4e366538SXin Li       "movdqa      %%xmm4,%%xmm3                 \n"
903*4e366538SXin Li       "movdqa      %%xmm5,%%xmm6                 \n"
904*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (1, lo)
905*4e366538SXin Li       "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, lo)
906*4e366538SXin Li       "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (1, lo)
907*4e366538SXin Li       "paddw       %%xmm6,%%xmm3                 \n"  // 9 3 3 1 + 8 (1, lo)
908*4e366538SXin Li       "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
909*4e366538SXin Li 
910*4e366538SXin Li       "movdqa      %%xmm1,%%xmm7                 \n"
911*4e366538SXin Li       "movdqa      %%xmm2,%%xmm6                 \n"
912*4e366538SXin Li       "paddw       %%xmm7,%%xmm7                 \n"  // 6*near+2*far (1, hi)
913*4e366538SXin Li       "paddw       %%xmm0,%%xmm6                 \n"  // 3*near+far+8 (2, hi)
914*4e366538SXin Li       "paddw       %%xmm1,%%xmm7                 \n"  // 9*near+3*far (1, hi)
915*4e366538SXin Li       "paddw       %%xmm6,%%xmm7                 \n"  // 9 3 3 1 + 8 (1, hi)
916*4e366538SXin Li       "psrlw       $4,%%xmm7                     \n"  // ^ div by 16
917*4e366538SXin Li 
918*4e366538SXin Li       "packuswb    %%xmm7,%%xmm3                 \n"
919*4e366538SXin Li       "movdqu      %%xmm3,(%1)                   \n"  // save above line
920*4e366538SXin Li 
921*4e366538SXin Li       "movdqa      %%xmm5,%%xmm3                 \n"
922*4e366538SXin Li       "paddw       %%xmm0,%%xmm4                 \n"  // 3*near+far+8 (1, lo)
923*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, lo)
924*4e366538SXin Li       "paddw       %%xmm3,%%xmm5                 \n"  // 9*near+3*far (2, lo)
925*4e366538SXin Li       "paddw       %%xmm4,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
926*4e366538SXin Li       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16
927*4e366538SXin Li 
928*4e366538SXin Li       "movdqa      %%xmm2,%%xmm3                 \n"
929*4e366538SXin Li       "paddw       %%xmm0,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
930*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 6*near+2*far (2, hi)
931*4e366538SXin Li       "paddw       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
932*4e366538SXin Li       "paddw       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
933*4e366538SXin Li       "psrlw       $4,%%xmm2                     \n"  // ^ div by 16
934*4e366538SXin Li 
935*4e366538SXin Li       "packuswb    %%xmm2,%%xmm5                 \n"
936*4e366538SXin Li       "movdqu      %%xmm5,(%1,%4)                \n"  // save below line
937*4e366538SXin Li 
938*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
939*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 8 samples to 16 samples
940*4e366538SXin Li       "sub         $0x10,%2                      \n"
941*4e366538SXin Li       "jg          1b                            \n"
942*4e366538SXin Li       : "+r"(src_ptr),                // %0
943*4e366538SXin Li         "+r"(dst_ptr),                // %1
944*4e366538SXin Li         "+r"(dst_width)               // %2
945*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
946*4e366538SXin Li         "r"((intptr_t)(dst_stride))   // %4
947*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
948*4e366538SXin Li         "xmm7");
949*4e366538SXin Li }
950*4e366538SXin Li #endif
951*4e366538SXin Li 
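// Editorial aside: a scalar sketch (not part of libyuv) of the bilinear
// kernel above, with the src_stride/dst_stride addressing replaced by
// explicit row pointers for clarity. A horizontal 3:1 blend per source row
// feeds a vertical 3:1 blend, which expands to the 9:3:3:1 kernel with +8
// rounding before the >>4.
static inline void ScaleRowUp2_Bilinear_Sketch(const uint8_t* src_above,
                                               const uint8_t* src_below,
                                               uint8_t* dst_above,
                                               uint8_t* dst_below,
                                               int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 2) {
    int i = x / 2;
    int a0 = 3 * src_above[i] + src_above[i + 1];  // 3*near+far (above row)
    int a1 = src_above[i] + 3 * src_above[i + 1];
    int b0 = 3 * src_below[i] + src_below[i + 1];  // 3*near+far (below row)
    int b1 = src_below[i] + 3 * src_below[i + 1];
    dst_above[x + 0] = (uint8_t)((3 * a0 + b0 + 8) >> 4);
    dst_above[x + 1] = (uint8_t)((3 * a1 + b1 + 8) >> 4);
    dst_below[x + 0] = (uint8_t)((a0 + 3 * b0 + 8) >> 4);
    dst_below[x + 1] = (uint8_t)((a1 + 3 * b1 + 8) >> 4);
  }
}
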
952*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_12_SSSE3
953*4e366538SXin Li void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
954*4e366538SXin Li                                  uint16_t* dst_ptr,
955*4e366538SXin Li                                  int dst_width) {
956*4e366538SXin Li   asm volatile(
957*4e366538SXin Li       "movdqa      %3,%%xmm5                     \n"
958*4e366538SXin Li       "pcmpeqw     %%xmm4,%%xmm4                 \n"
959*4e366538SXin Li       "psrlw       $15,%%xmm4                    \n"
960*4e366538SXin Li       "psllw       $1,%%xmm4                     \n"  // all 2
961*4e366538SXin Li 
962*4e366538SXin Li       LABELALIGN
963*4e366538SXin Li       "1:                                        \n"
964*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
965*4e366538SXin Li       "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
966*4e366538SXin Li 
967*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
968*4e366538SXin Li       "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
969*4e366538SXin Li       "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
970*4e366538SXin Li 
971*4e366538SXin Li       "movdqa      %%xmm2,%%xmm3                 \n"
972*4e366538SXin Li       "movdqa      %%xmm0,%%xmm1                 \n"
973*4e366538SXin Li       "pshufb      %%xmm5,%%xmm3                 \n"  // 54657687 (far)
974*4e366538SXin Li       "pshufb      %%xmm5,%%xmm1                 \n"  // 10213243 (far)
975*4e366538SXin Li 
976*4e366538SXin Li       "paddw       %%xmm4,%%xmm1                 \n"  // far+2
977*4e366538SXin Li       "paddw       %%xmm4,%%xmm3                 \n"  // far+2
978*4e366538SXin Li       "paddw       %%xmm0,%%xmm1                 \n"  // near+far+2
979*4e366538SXin Li       "paddw       %%xmm2,%%xmm3                 \n"  // near+far+2
980*4e366538SXin Li       "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
981*4e366538SXin Li       "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
982*4e366538SXin Li       "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far+2 (lo)
983*4e366538SXin Li       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far+2 (hi)
984*4e366538SXin Li 
985*4e366538SXin Li       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far
986*4e366538SXin Li       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far
987*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
988*4e366538SXin Li       "movdqu      %%xmm2,16(%1)                 \n"
989*4e366538SXin Li 
990*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
991*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 8 samples to 16 samples
992*4e366538SXin Li       "sub         $0x10,%2                      \n"
993*4e366538SXin Li       "jg          1b                            \n"
994*4e366538SXin Li       : "+r"(src_ptr),          // %0
995*4e366538SXin Li         "+r"(dst_ptr),          // %1
996*4e366538SXin Li         "+r"(dst_width)         // %2
997*4e366538SXin Li       : "m"(kLinearShuffleFar)  // %3
998*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
999*4e366538SXin Li }
1000*4e366538SXin Li #endif
1001*4e366538SXin Li 
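// Editorial aside: the "_12_" variants can stay entirely in 16-bit lanes
// because the largest intermediate of the linear kernel is
// 3*4095 + 4095 + 2 = 16382, comfortably below 65535; full 16-bit input
// would overflow here, which is why the "_16_" variants further below widen
// to 32-bit lanes instead.
static_assert(3 * 4095 + 4095 + 2 <= 65535,
              "12-bit linear kernel fits in 16-bit lanes");
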
1002*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_12_SSSE3
1003*4e366538SXin Li void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
1004*4e366538SXin Li                                    ptrdiff_t src_stride,
1005*4e366538SXin Li                                    uint16_t* dst_ptr,
1006*4e366538SXin Li                                    ptrdiff_t dst_stride,
1007*4e366538SXin Li                                    int dst_width) {
1008*4e366538SXin Li   asm volatile(
1009*4e366538SXin Li       "pcmpeqw     %%xmm7,%%xmm7                 \n"
1010*4e366538SXin Li       "psrlw       $15,%%xmm7                    \n"
1011*4e366538SXin Li       "psllw       $3,%%xmm7                     \n"  // all 8
1012*4e366538SXin Li       "movdqa      %5,%%xmm6                     \n"
1013*4e366538SXin Li 
1014*4e366538SXin Li       LABELALIGN
1015*4e366538SXin Li       "1:                                        \n"
1016*4e366538SXin Li       // above line
1017*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"  // 01234567 (16)
1018*4e366538SXin Li       "movdqu      2(%0),%%xmm1                  \n"  // 12345678 (16)
1019*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1020*4e366538SXin Li       "punpckhwd   %%xmm1,%%xmm2                 \n"  // 45566778 (16)
1021*4e366538SXin Li       "punpcklwd   %%xmm1,%%xmm0                 \n"  // 01122334 (16)
1022*4e366538SXin Li       "movdqa      %%xmm2,%%xmm3                 \n"
1023*4e366538SXin Li       "movdqa      %%xmm0,%%xmm1                 \n"
1024*4e366538SXin Li       "pshufb      %%xmm6,%%xmm3                 \n"  // 54657687 (far)
1025*4e366538SXin Li       "pshufb      %%xmm6,%%xmm1                 \n"  // 10213243 (far)
1026*4e366538SXin Li       "paddw       %%xmm0,%%xmm1                 \n"  // near+far
1027*4e366538SXin Li       "paddw       %%xmm2,%%xmm3                 \n"  // near+far
1028*4e366538SXin Li       "paddw       %%xmm0,%%xmm0                 \n"  // 2*near
1029*4e366538SXin Li       "paddw       %%xmm2,%%xmm2                 \n"  // 2*near
1030*4e366538SXin Li       "paddw       %%xmm1,%%xmm0                 \n"  // 3*near+far (1, lo)
1031*4e366538SXin Li       "paddw       %%xmm3,%%xmm2                 \n"  // 3*near+far (1, hi)
1032*4e366538SXin Li 
1033*4e366538SXin Li       // below line
1034*4e366538SXin Li       "movdqu      (%0,%3,2),%%xmm1              \n"  // 01234567 (16)
1035*4e366538SXin Li       "movdqu      2(%0,%3,2),%%xmm4             \n"  // 12345678 (16)
1036*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
1037*4e366538SXin Li       "punpckhwd   %%xmm4,%%xmm3                 \n"  // 45566778 (16)
1038*4e366538SXin Li       "punpcklwd   %%xmm4,%%xmm1                 \n"  // 01122334 (16)
1039*4e366538SXin Li       "movdqa      %%xmm3,%%xmm5                 \n"
1040*4e366538SXin Li       "movdqa      %%xmm1,%%xmm4                 \n"
1041*4e366538SXin Li       "pshufb      %%xmm6,%%xmm5                 \n"  // 54657687 (far)
1042*4e366538SXin Li       "pshufb      %%xmm6,%%xmm4                 \n"  // 10213243 (far)
1043*4e366538SXin Li       "paddw       %%xmm1,%%xmm4                 \n"  // near+far
1044*4e366538SXin Li       "paddw       %%xmm3,%%xmm5                 \n"  // near+far
1045*4e366538SXin Li       "paddw       %%xmm1,%%xmm1                 \n"  // 2*near
1046*4e366538SXin Li       "paddw       %%xmm3,%%xmm3                 \n"  // 2*near
1047*4e366538SXin Li       "paddw       %%xmm4,%%xmm1                 \n"  // 3*near+far (2, lo)
1048*4e366538SXin Li       "paddw       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1049*4e366538SXin Li 
1050*4e366538SXin Li       // above row: xmm0 (lo), xmm2 (hi)
1051*4e366538SXin Li       // below row: xmm1 (lo), xmm3 (hi)
1052*4e366538SXin Li 
1053*4e366538SXin Li       "movdqa      %%xmm0,%%xmm4                 \n"
1054*4e366538SXin Li       "movdqa      %%xmm1,%%xmm5                 \n"
1055*4e366538SXin Li       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1056*4e366538SXin Li       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1057*4e366538SXin Li       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1058*4e366538SXin Li       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1059*4e366538SXin Li       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1060*4e366538SXin Li       "movdqu      %%xmm4,(%1)                   \n"
1061*4e366538SXin Li 
1062*4e366538SXin Li       "movdqa      %%xmm2,%%xmm4                 \n"
1063*4e366538SXin Li       "movdqa      %%xmm3,%%xmm5                 \n"
1064*4e366538SXin Li       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (1, hi)
1065*4e366538SXin Li       "paddw       %%xmm7,%%xmm5                 \n"  // 3*near+far+8 (2, hi)
1066*4e366538SXin Li       "paddw       %%xmm2,%%xmm4                 \n"  // 9*near+3*far (1, hi)
1067*4e366538SXin Li       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, hi)
1068*4e366538SXin Li       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16
1069*4e366538SXin Li       "movdqu      %%xmm4,0x10(%1)               \n"
1070*4e366538SXin Li 
1071*4e366538SXin Li       "movdqa      %%xmm1,%%xmm4                 \n"
1072*4e366538SXin Li       "paddw       %%xmm7,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1073*4e366538SXin Li       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, lo)
1074*4e366538SXin Li       "paddw       %%xmm4,%%xmm1                 \n"  // 9*near+3*far (2, lo)
1075*4e366538SXin Li       "paddw       %%xmm0,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, lo)
1076*4e366538SXin Li       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16
1077*4e366538SXin Li       "movdqu      %%xmm1,(%1,%4,2)              \n"
1078*4e366538SXin Li 
1079*4e366538SXin Li       "movdqa      %%xmm3,%%xmm4                 \n"
1080*4e366538SXin Li       "paddw       %%xmm7,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1081*4e366538SXin Li       "paddw       %%xmm4,%%xmm4                 \n"  // 6*near+2*far (2, hi)
1082*4e366538SXin Li       "paddw       %%xmm4,%%xmm3                 \n"  // 9*near+3*far (2, hi)
1083*4e366538SXin Li       "paddw       %%xmm2,%%xmm3                 \n"  // 9 3 3 1 + 8 (2, hi)
1084*4e366538SXin Li       "psrlw       $4,%%xmm3                     \n"  // ^ div by 16
1085*4e366538SXin Li       "movdqu      %%xmm3,0x10(%1,%4,2)          \n"
1086*4e366538SXin Li 
1087*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1088*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 8 samples to 16 samples
1089*4e366538SXin Li       "sub         $0x10,%2                      \n"
1090*4e366538SXin Li       "jg          1b                            \n"
1091*4e366538SXin Li       : "+r"(src_ptr),                // %0
1092*4e366538SXin Li         "+r"(dst_ptr),                // %1
1093*4e366538SXin Li         "+r"(dst_width)               // %2
1094*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1095*4e366538SXin Li         "r"((intptr_t)(dst_stride)),  // %4
1096*4e366538SXin Li         "m"(kLinearShuffleFar)        // %5
1097*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1098*4e366538SXin Li         "xmm7");
1099*4e366538SXin Li }
1100*4e366538SXin Li #endif
1101*4e366538SXin Li 
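// Editorial aside: the bilinear bound is tighter but still safe for 12-bit
// data: 9*4095 + 3*4095 + 3*4095 + 4095 + 8 = 16*4095 + 8 = 65528, which
// just fits in a 16-bit lane, so no widening is needed before the psrlw $4.
static_assert(16 * 4095 + 8 <= 65535,
              "12-bit bilinear kernel fits in 16-bit lanes");
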
1102*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_16_SSE2
1103*4e366538SXin Li void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
1104*4e366538SXin Li                                 uint16_t* dst_ptr,
1105*4e366538SXin Li                                 int dst_width) {
1106*4e366538SXin Li   asm volatile(
1107*4e366538SXin Li       "pxor        %%xmm5,%%xmm5                 \n"
1108*4e366538SXin Li       "pcmpeqd     %%xmm4,%%xmm4                 \n"
1109*4e366538SXin Li       "psrld       $31,%%xmm4                    \n"
1110*4e366538SXin Li       "pslld       $1,%%xmm4                     \n"  // all 2
1111*4e366538SXin Li 
1112*4e366538SXin Li       LABELALIGN
1113*4e366538SXin Li       "1:                                        \n"
1114*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1115*4e366538SXin Li       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1116*4e366538SXin Li 
1117*4e366538SXin Li       "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0123 (32b)
1118*4e366538SXin Li       "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1234 (32b)
1119*4e366538SXin Li 
1120*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1121*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
1122*4e366538SXin Li 
1123*4e366538SXin Li       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1124*4e366538SXin Li       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1125*4e366538SXin Li 
1126*4e366538SXin Li       "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
1127*4e366538SXin Li       "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
1128*4e366538SXin Li       "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
1129*4e366538SXin Li       "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
1130*4e366538SXin Li       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1131*4e366538SXin Li       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1132*4e366538SXin Li       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
1133*4e366538SXin Li       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)
1134*4e366538SXin Li 
1135*4e366538SXin Li       "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1136*4e366538SXin Li       "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
1137*4e366538SXin Li       "packssdw    %%xmm1,%%xmm0                 \n"
1138*4e366538SXin Li       "pshufd      $0b11011000,%%xmm0,%%xmm0     \n"
1139*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
1140*4e366538SXin Li 
1141*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
1142*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 4 pixels to 8 pixels
1143*4e366538SXin Li       "sub         $0x8,%2                       \n"
1144*4e366538SXin Li       "jg          1b                            \n"
1145*4e366538SXin Li       : "+r"(src_ptr),   // %0
1146*4e366538SXin Li         "+r"(dst_ptr),   // %1
1147*4e366538SXin Li         "+r"(dst_width)  // %2
1148*4e366538SXin Li       :
1149*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1150*4e366538SXin Li }
1151*4e366538SXin Li #endif
1152*4e366538SXin Li 
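// Editorial aside: with full 16-bit samples the linear intermediate
// 3*65535 + 65535 + 2 = 262142 no longer fits in 16 bits, hence the
// punpcklwd widening to dwords and the paddd/psrld arithmetic above. The
// bilinear worst case, 16*65535 + 8 = 1048568, likewise fits easily in a
// 32-bit lane (see the variant below).
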
1153*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_16_SSE2
1154*4e366538SXin Li void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
1155*4e366538SXin Li                                   ptrdiff_t src_stride,
1156*4e366538SXin Li                                   uint16_t* dst_ptr,
1157*4e366538SXin Li                                   ptrdiff_t dst_stride,
1158*4e366538SXin Li                                   int dst_width) {
1159*4e366538SXin Li   asm volatile(
1160*4e366538SXin Li       "pxor        %%xmm7,%%xmm7                 \n"
1161*4e366538SXin Li       "pcmpeqd     %%xmm6,%%xmm6                 \n"
1162*4e366538SXin Li       "psrld       $31,%%xmm6                    \n"
1163*4e366538SXin Li       "pslld       $3,%%xmm6                     \n"  // all 8
1164*4e366538SXin Li 
1165*4e366538SXin Li       LABELALIGN
1166*4e366538SXin Li       "1:                                        \n"
1167*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
1168*4e366538SXin Li       "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
1169*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
1170*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
1171*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1172*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
1173*4e366538SXin Li       "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
1174*4e366538SXin Li       "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
1175*4e366538SXin Li       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
1176*4e366538SXin Li       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
1177*4e366538SXin Li       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
1178*4e366538SXin Li       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
1179*4e366538SXin Li       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1180*4e366538SXin Li       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1181*4e366538SXin Li 
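      // Editorial note: the block above appears to be a leftover from the
      // interleaved (UV) variant -- its "1u1v" comments do not match this
      // planar function, and xmm0-xmm3 are reloaded and recomputed below
      // before any of these results are stored.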
1182*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 0123 (16b)
1183*4e366538SXin Li       "movq        2(%0),%%xmm1                  \n"  // 1234 (16b)
1184*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0123 (32b)
1185*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1234 (32b)
1186*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1187*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
1188*4e366538SXin Li       "pshufd      $0b10110001,%%xmm2,%%xmm2     \n"  // 1032 (even, far)
1189*4e366538SXin Li       "pshufd      $0b10110001,%%xmm3,%%xmm3     \n"  // 2143 (odd, far)
1190*4e366538SXin Li       "paddd       %%xmm0,%%xmm2                 \n"  // near+far (lo)
1191*4e366538SXin Li       "paddd       %%xmm1,%%xmm3                 \n"  // near+far (hi)
1192*4e366538SXin Li       "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
1193*4e366538SXin Li       "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
1194*4e366538SXin Li       "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
1195*4e366538SXin Li       "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)
1196*4e366538SXin Li 
1197*4e366538SXin Li       "movq        (%0,%3,2),%%xmm2              \n"
1198*4e366538SXin Li       "movq        2(%0,%3,2),%%xmm3             \n"
1199*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm2                 \n"  // 0123 (32b)
1200*4e366538SXin Li       "punpcklwd   %%xmm7,%%xmm3                 \n"  // 1234 (32b)
1201*4e366538SXin Li       "movdqa      %%xmm2,%%xmm4                 \n"
1202*4e366538SXin Li       "movdqa      %%xmm3,%%xmm5                 \n"
1203*4e366538SXin Li       "pshufd      $0b10110001,%%xmm4,%%xmm4     \n"  // 1032 (even, far)
1204*4e366538SXin Li       "pshufd      $0b10110001,%%xmm5,%%xmm5     \n"  // 2143 (odd, far)
1205*4e366538SXin Li       "paddd       %%xmm2,%%xmm4                 \n"  // near+far (lo)
1206*4e366538SXin Li       "paddd       %%xmm3,%%xmm5                 \n"  // near+far (hi)
1207*4e366538SXin Li       "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (lo)
1208*4e366538SXin Li       "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (hi)
1209*4e366538SXin Li       "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
1210*4e366538SXin Li       "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)
1211*4e366538SXin Li 
1212*4e366538SXin Li       "movdqa      %%xmm0,%%xmm4                 \n"
1213*4e366538SXin Li       "movdqa      %%xmm2,%%xmm5                 \n"
1214*4e366538SXin Li       "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1215*4e366538SXin Li       "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1216*4e366538SXin Li       "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1217*4e366538SXin Li       "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1218*4e366538SXin Li       "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1219*4e366538SXin Li 
1220*4e366538SXin Li       "movdqa      %%xmm2,%%xmm5                 \n"
1221*4e366538SXin Li       "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1222*4e366538SXin Li       "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1223*4e366538SXin Li       "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1224*4e366538SXin Li       "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1225*4e366538SXin Li       "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1226*4e366538SXin Li 
1227*4e366538SXin Li       "movdqa      %%xmm1,%%xmm0                 \n"
1228*4e366538SXin Li       "movdqa      %%xmm3,%%xmm2                 \n"
1229*4e366538SXin Li       "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1230*4e366538SXin Li       "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
1231*4e366538SXin Li       "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1232*4e366538SXin Li       "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1233*4e366538SXin Li       "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1234*4e366538SXin Li 
1235*4e366538SXin Li       "movdqa      %%xmm3,%%xmm2                 \n"
1236*4e366538SXin Li       "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
1237*4e366538SXin Li       "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
1238*4e366538SXin Li       "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
1239*4e366538SXin Li       "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
1240*4e366538SXin Li       "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)
1241*4e366538SXin Li 
1242*4e366538SXin Li       "packssdw    %%xmm0,%%xmm4                 \n"
1243*4e366538SXin Li       "pshufd      $0b11011000,%%xmm4,%%xmm4     \n"
1244*4e366538SXin Li       "movdqu      %%xmm4,(%1)                   \n"  // store above
1245*4e366538SXin Li       "packssdw    %%xmm2,%%xmm5                 \n"
1246*4e366538SXin Li       "pshufd      $0b11011000,%%xmm5,%%xmm5     \n"
1247*4e366538SXin Li       "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below
1248*4e366538SXin Li 
1249*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
1250*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 4 pixels to 8 pixels
1251*4e366538SXin Li       "sub         $0x8,%2                       \n"
1252*4e366538SXin Li       "jg          1b                            \n"
1253*4e366538SXin Li       : "+r"(src_ptr),                // %0
1254*4e366538SXin Li         "+r"(dst_ptr),                // %1
1255*4e366538SXin Li         "+r"(dst_width)               // %2
1256*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1257*4e366538SXin Li         "r"((intptr_t)(dst_stride))   // %4
1258*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1259*4e366538SXin Li         "xmm7");
1260*4e366538SXin Li }
1261*4e366538SXin Li #endif
1262*4e366538SXin Li 
1263*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_SSSE3
1264*4e366538SXin Li void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
1265*4e366538SXin Li                               uint8_t* dst_ptr,
1266*4e366538SXin Li                               int dst_width) {
1267*4e366538SXin Li   asm volatile(
1268*4e366538SXin Li       "pcmpeqw     %%xmm4,%%xmm4                 \n"
1269*4e366538SXin Li       "psrlw       $15,%%xmm4                    \n"
1270*4e366538SXin Li       "psllw       $1,%%xmm4                     \n"  // all 2
1271*4e366538SXin Li       "movdqa      %3,%%xmm3                     \n"
1272*4e366538SXin Li 
1273*4e366538SXin Li       LABELALIGN
1274*4e366538SXin Li       "1:                                        \n"
1275*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 01234567
1276*4e366538SXin Li       "movq        1(%0),%%xmm1                  \n"  // 12345678
1277*4e366538SXin Li       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1278*4e366538SXin Li       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1279*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1280*4e366538SXin Li       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1281*4e366538SXin Li       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1282*4e366538SXin Li       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (hi)
1283*4e366538SXin Li       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (lo)
1284*4e366538SXin Li       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
1285*4e366538SXin Li       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
1286*4e366538SXin Li       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
1287*4e366538SXin Li       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
1288*4e366538SXin Li       "packuswb    %%xmm2,%%xmm0                 \n"
1289*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
1290*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
1291*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 8 samples to 16 samples
1292*4e366538SXin Li       "sub         $0x10,%2                      \n"
1293*4e366538SXin Li       "jg          1b                            \n"
1294*4e366538SXin Li       : "+r"(src_ptr),      // %0
1295*4e366538SXin Li         "+r"(dst_ptr),      // %1
1296*4e366538SXin Li         "+r"(dst_width)     // %2
1297*4e366538SXin Li       : "m"(kLinearMadd31)  // %3
1298*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1299*4e366538SXin Li }
1300*4e366538SXin Li #endif
1301*4e366538SXin Li 
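// Editorial aside: assuming kLinearMadd31 (defined earlier in this file)
// holds the repeating byte pairs {3,1, 1,3, 3,1, 1,3, ...}, each pmaddubsw
// above produces one output sample per 16-bit lane:
//   lane 2i:   3*src[i] + 1*src[i+1]
//   lane 2i+1: 1*src[i] + 3*src[i+1]
// so a single multiply-add replaces the unpack/shift/add chain of the SSE2
// path.
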
1302*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_SSSE3
1303*4e366538SXin Li void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
1304*4e366538SXin Li                                 ptrdiff_t src_stride,
1305*4e366538SXin Li                                 uint8_t* dst_ptr,
1306*4e366538SXin Li                                 ptrdiff_t dst_stride,
1307*4e366538SXin Li                                 int dst_width) {
1308*4e366538SXin Li   asm volatile(
1309*4e366538SXin Li       "pcmpeqw     %%xmm6,%%xmm6                 \n"
1310*4e366538SXin Li       "psrlw       $15,%%xmm6                    \n"
1311*4e366538SXin Li       "psllw       $3,%%xmm6                     \n"  // all 8
1312*4e366538SXin Li       "movdqa      %5,%%xmm7                     \n"
1313*4e366538SXin Li 
1314*4e366538SXin Li       LABELALIGN
1315*4e366538SXin Li       "1:                                        \n"
1316*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 01234567
1317*4e366538SXin Li       "movq        1(%0),%%xmm1                  \n"  // 12345678
1318*4e366538SXin Li       "punpcklwd   %%xmm0,%%xmm0                 \n"  // 0101232345456767
1319*4e366538SXin Li       "punpcklwd   %%xmm1,%%xmm1                 \n"  // 1212343456567878
1320*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1321*4e366538SXin Li       "punpckhdq   %%xmm1,%%xmm2                 \n"  // 4545565667677878
1322*4e366538SXin Li       "punpckldq   %%xmm1,%%xmm0                 \n"  // 0101121223233434
1323*4e366538SXin Li       "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1, hi)
1324*4e366538SXin Li       "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1, lo)
1325*4e366538SXin Li 
1326*4e366538SXin Li       "movq        (%0,%3),%%xmm1                \n"
1327*4e366538SXin Li       "movq        1(%0,%3),%%xmm4               \n"
1328*4e366538SXin Li       "punpcklwd   %%xmm1,%%xmm1                 \n"
1329*4e366538SXin Li       "punpcklwd   %%xmm4,%%xmm4                 \n"
1330*4e366538SXin Li       "movdqa      %%xmm1,%%xmm3                 \n"
1331*4e366538SXin Li       "punpckhdq   %%xmm4,%%xmm3                 \n"
1332*4e366538SXin Li       "punpckldq   %%xmm4,%%xmm1                 \n"
1333*4e366538SXin Li       "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
1334*4e366538SXin Li       "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)
1335*4e366538SXin Li 
1336*4e366538SXin Li       // above row: xmm0 (lo), xmm2 (hi)
1337*4e366538SXin Li       // below row: xmm1 (lo), xmm3 (hi)
1338*4e366538SXin Li 
1339*4e366538SXin Li       "movdqa      %%xmm0,%%xmm4                 \n"
1340*4e366538SXin Li       "movdqa      %%xmm1,%%xmm5                 \n"
1341*4e366538SXin Li       "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
1342*4e366538SXin Li       "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
1343*4e366538SXin Li       "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
1344*4e366538SXin Li       "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
1345*4e366538SXin Li       "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)
1346*4e366538SXin Li 
1347*4e366538SXin Li       "movdqa      %%xmm1,%%xmm5                 \n"
1348*4e366538SXin Li       "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
1349*4e366538SXin Li       "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
1350*4e366538SXin Li       "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
1351*4e366538SXin Li       "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
1352*4e366538SXin Li       "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)
1353*4e366538SXin Li 
1354*4e366538SXin Li       "movdqa      %%xmm2,%%xmm0                 \n"
1355*4e366538SXin Li       "movdqa      %%xmm3,%%xmm1                 \n"
1356*4e366538SXin Li       "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
1357*4e366538SXin Li       "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
1358*4e366538SXin Li       "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
1359*4e366538SXin Li       "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
1360*4e366538SXin Li       "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)
1361*4e366538SXin Li 
1362*4e366538SXin Li       "movdqa      %%xmm3,%%xmm1                 \n"
1363*4e366538SXin Li       "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
1364*4e366538SXin Li       "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
1365*4e366538SXin Li       "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
1366*4e366538SXin Li       "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
1367*4e366538SXin Li       "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)
1368*4e366538SXin Li 
1369*4e366538SXin Li       "packuswb    %%xmm0,%%xmm4                 \n"
1370*4e366538SXin Li       "movdqu      %%xmm4,(%1)                   \n"  // store above
1371*4e366538SXin Li       "packuswb    %%xmm1,%%xmm5                 \n"
1372*4e366538SXin Li       "movdqu      %%xmm5,(%1,%4)                \n"  // store below
1373*4e366538SXin Li 
1374*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
1375*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 8 samples to 16 samples
1376*4e366538SXin Li       "sub         $0x10,%2                      \n"
1377*4e366538SXin Li       "jg          1b                            \n"
1378*4e366538SXin Li       : "+r"(src_ptr),                // %0
1379*4e366538SXin Li         "+r"(dst_ptr),                // %1
1380*4e366538SXin Li         "+r"(dst_width)               // %2
1381*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1382*4e366538SXin Li         "r"((intptr_t)(dst_stride)),  // %4
1383*4e366538SXin Li         "m"(kLinearMadd31)            // %5
1384*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1385*4e366538SXin Li         "xmm7");
1386*4e366538SXin Li }
1387*4e366538SXin Li #endif
1388*4e366538SXin Li 
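// Editorial aside: the two stores above fall out of reusing the per-row
// horizontal results r1 = 3*n1+f1 (above) and r2 = 3*n2+f2 (below):
//   above = (3*r1 + r2 + 8) >> 4 = (9*n1 + 3*f1 + 3*n2 + 1*f2 + 8) >> 4
//   below = (r1 + 3*r2 + 8) >> 4 = (3*n1 + 1*f1 + 9*n2 + 3*f2 + 8) >> 4
// i.e. the 9:3:3:1 bilinear kernel with round-to-nearest; the worst case
// 16*255 + 8 = 4088 never overflows a 16-bit word.
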
1389*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_AVX2
1390*4e366538SXin Li void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
1391*4e366538SXin Li                              uint8_t* dst_ptr,
1392*4e366538SXin Li                              int dst_width) {
1393*4e366538SXin Li   asm volatile(
1394*4e366538SXin Li       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1395*4e366538SXin Li       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1396*4e366538SXin Li       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1397*4e366538SXin Li       "vbroadcastf128 %3,%%ymm3                  \n"
1398*4e366538SXin Li 
1399*4e366538SXin Li       LABELALIGN
1400*4e366538SXin Li       "1:                                        \n"
1401*4e366538SXin Li       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1402*4e366538SXin Li       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1403*4e366538SXin Li       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1404*4e366538SXin Li       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1405*4e366538SXin Li       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1406*4e366538SXin Li       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1407*4e366538SXin Li       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1408*4e366538SXin Li       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1409*4e366538SXin Li       "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
1410*4e366538SXin Li       "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
1411*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
1412*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
1413*4e366538SXin Li       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1414*4e366538SXin Li       "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1415*4e366538SXin Li       "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
1416*4e366538SXin Li       "vmovdqu     %%ymm0,(%1)                   \n"
1417*4e366538SXin Li 
1418*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1419*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 16 samples to 32 samples
1420*4e366538SXin Li       "sub         $0x20,%2                      \n"
1421*4e366538SXin Li       "jg          1b                            \n"
1422*4e366538SXin Li       "vzeroupper                                \n"
1423*4e366538SXin Li       : "+r"(src_ptr),      // %0
1424*4e366538SXin Li         "+r"(dst_ptr),      // %1
1425*4e366538SXin Li         "+r"(dst_width)     // %2
1426*4e366538SXin Li       : "m"(kLinearMadd31)  // %3
1427*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1428*4e366538SXin Li }
1429*4e366538SXin Li #endif
1430*4e366538SXin Li 
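// Editorial aside: AVX2 unpacks operate per 128-bit lane, so the
// vpermq $0b11011000 above (64-bit quad order 0,2,1,3) first moves source
// bytes 8..15 into the upper lane; the in-lane vpunpckl/hdq shuffles then
// yield contiguous near/far pairs across the whole 32-byte output row.
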
1431*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
1432*4e366538SXin Li void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
1433*4e366538SXin Li                                ptrdiff_t src_stride,
1434*4e366538SXin Li                                uint8_t* dst_ptr,
1435*4e366538SXin Li                                ptrdiff_t dst_stride,
1436*4e366538SXin Li                                int dst_width) {
1437*4e366538SXin Li   asm volatile(
1438*4e366538SXin Li       "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
1439*4e366538SXin Li       "vpsrlw      $15,%%ymm6,%%ymm6             \n"
1440*4e366538SXin Li       "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
1441*4e366538SXin Li       "vbroadcastf128 %5,%%ymm7                  \n"
1442*4e366538SXin Li 
1443*4e366538SXin Li       LABELALIGN
1444*4e366538SXin Li       "1:                                        \n"
1445*4e366538SXin Li       "vmovdqu     (%0),%%xmm0                   \n"  // 0123456789ABCDEF
1446*4e366538SXin Li       "vmovdqu     1(%0),%%xmm1                  \n"  // 123456789ABCDEF0
1447*4e366538SXin Li       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
1448*4e366538SXin Li       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
1449*4e366538SXin Li       "vpunpcklwd  %%ymm0,%%ymm0,%%ymm0          \n"
1450*4e366538SXin Li       "vpunpcklwd  %%ymm1,%%ymm1,%%ymm1          \n"
1451*4e366538SXin Li       "vpunpckhdq  %%ymm1,%%ymm0,%%ymm2          \n"
1452*4e366538SXin Li       "vpunpckldq  %%ymm1,%%ymm0,%%ymm0          \n"
1453*4e366538SXin Li       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
1454*4e366538SXin Li       "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)
1455*4e366538SXin Li 
1456*4e366538SXin Li       "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
1457*4e366538SXin Li       "vmovdqu     1(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
1458*4e366538SXin Li       "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
1459*4e366538SXin Li       "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
1460*4e366538SXin Li       "vpunpcklwd  %%ymm2,%%ymm2,%%ymm2          \n"
1461*4e366538SXin Li       "vpunpcklwd  %%ymm3,%%ymm3,%%ymm3          \n"
1462*4e366538SXin Li       "vpunpckhdq  %%ymm3,%%ymm2,%%ymm4          \n"
1463*4e366538SXin Li       "vpunpckldq  %%ymm3,%%ymm2,%%ymm2          \n"
1464*4e366538SXin Li       "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
1465*4e366538SXin Li       "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)
1466*4e366538SXin Li 
1467*4e366538SXin Li       // above row: ymm0 (lo), ymm1 (hi)
1468*4e366538SXin Li       // below row: ymm2 (lo), ymm3 (hi)
1469*4e366538SXin Li 
1470*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1471*4e366538SXin Li       "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1472*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1473*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1474*4e366538SXin Li       "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1475*4e366538SXin Li 
1476*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1477*4e366538SXin Li       "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1478*4e366538SXin Li       "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1479*4e366538SXin Li       "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1480*4e366538SXin Li       "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1481*4e366538SXin Li 
1482*4e366538SXin Li       "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1483*4e366538SXin Li       "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1484*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1485*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1486*4e366538SXin Li       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1487*4e366538SXin Li 
1488*4e366538SXin Li       "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1489*4e366538SXin Li       "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1490*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1491*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1492*4e366538SXin Li       "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1493*4e366538SXin Li 
1494*4e366538SXin Li       "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
1495*4e366538SXin Li       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1496*4e366538SXin Li       "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
1497*4e366538SXin Li       "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below
1498*4e366538SXin Li 
1499*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1500*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 16 samples to 32 samples
1501*4e366538SXin Li       "sub         $0x20,%2                      \n"
1502*4e366538SXin Li       "jg          1b                            \n"
1503*4e366538SXin Li       "vzeroupper                                \n"
1504*4e366538SXin Li       : "+r"(src_ptr),                // %0
1505*4e366538SXin Li         "+r"(dst_ptr),                // %1
1506*4e366538SXin Li         "+r"(dst_width)               // %2
1507*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1508*4e366538SXin Li         "r"((intptr_t)(dst_stride)),  // %4
1509*4e366538SXin Li         "m"(kLinearMadd31)            // %5
1510*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1511*4e366538SXin Li         "xmm7");
1512*4e366538SXin Li }
1513*4e366538SXin Li #endif
1514*4e366538SXin Li 
1515*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
1516*4e366538SXin Li void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
1517*4e366538SXin Li                                 uint16_t* dst_ptr,
1518*4e366538SXin Li                                 int dst_width) {
1519*4e366538SXin Li   asm volatile(
1520*4e366538SXin Li       "vbroadcastf128 %3,%%ymm5                  \n"
1521*4e366538SXin Li       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1522*4e366538SXin Li       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1523*4e366538SXin Li       "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
1524*4e366538SXin Li 
1525*4e366538SXin Li       LABELALIGN
1526*4e366538SXin Li       "1:                                        \n"
1527*4e366538SXin Li       "vmovdqu     (%0),%%ymm0                   \n"  // 0123456789ABCDEF (16b)
1528*4e366538SXin Li       "vmovdqu     2(%0),%%ymm1                  \n"  // 123456789ABCDEF0 (16b)
1529*4e366538SXin Li 
1530*4e366538SXin Li       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 012389AB4567CDEF
1531*4e366538SXin Li       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 12349ABC5678DEF0
1532*4e366538SXin Li 
1533*4e366538SXin Li       "vpunpckhwd  %%ymm1,%%ymm0,%%ymm2          \n"  // 899AABBCCDDEEFF0 (near)
1534*4e366538SXin Li       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1535*4e366538SXin Li       "vpshufb     %%ymm5,%%ymm2,%%ymm3          \n"  // 98A9BACBDCEDFE0F (far)
1536*4e366538SXin Li       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1537*4e366538SXin Li 
1538*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // far+2
1539*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2
1540*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far+2
1541*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm3,%%ymm3          \n"  // near+far+2
1542*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1543*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near
1544*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 3*near+far+2
1545*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 3*near+far+2
1546*4e366538SXin Li 
1547*4e366538SXin Li       "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far
1548*4e366538SXin Li       "vpsrlw      $2,%%ymm2,%%ymm2              \n"  // 3/4*near+1/4*far
1549*4e366538SXin Li       "vmovdqu     %%ymm0,(%1)                   \n"
1550*4e366538SXin Li       "vmovdqu     %%ymm2,32(%1)                 \n"
1551*4e366538SXin Li 
1552*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
1553*4e366538SXin Li       "lea         0x40(%1),%1                   \n"  // 16 samples to 32 samples
1554*4e366538SXin Li       "sub         $0x20,%2                      \n"
1555*4e366538SXin Li       "jg          1b                            \n"
1556*4e366538SXin Li       "vzeroupper                                \n"
1557*4e366538SXin Li       : "+r"(src_ptr),          // %0
1558*4e366538SXin Li         "+r"(dst_ptr),          // %1
1559*4e366538SXin Li         "+r"(dst_width)         // %2
1560*4e366538SXin Li       : "m"(kLinearShuffleFar)  // %3
1561*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1562*4e366538SXin Li }
1563*4e366538SXin Li #endif
1564*4e366538SXin Li 
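// Editorial aside: kLinearShuffleFar (defined earlier in this file) is
// assumed to be the byte pattern {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15,
// 12,13}, which swaps adjacent 16-bit words so each "far" sample lines up
// against its "near" partner; vbroadcastf128 replicates the 16-byte table
// into both lanes so vpshufb applies the same swap per lane.
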
1565*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
1566*4e366538SXin Li void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
1567*4e366538SXin Li                                   ptrdiff_t src_stride,
1568*4e366538SXin Li                                   uint16_t* dst_ptr,
1569*4e366538SXin Li                                   ptrdiff_t dst_stride,
1570*4e366538SXin Li                                   int dst_width) {
1571*4e366538SXin Li   asm volatile(
1572*4e366538SXin Li       "vbroadcastf128 %5,%%ymm5                  \n"
1573*4e366538SXin Li       "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
1574*4e366538SXin Li       "vpsrlw      $15,%%ymm4,%%ymm4             \n"
1575*4e366538SXin Li       "vpsllw      $3,%%ymm4,%%ymm4              \n"  // all 8
1576*4e366538SXin Li 
1577*4e366538SXin Li       LABELALIGN
1578*4e366538SXin Li       "1:                                        \n"
1579*4e366538SXin Li 
1580*4e366538SXin Li       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b)
1581*4e366538SXin Li       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b)
1582*4e366538SXin Li       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1583*4e366538SXin Li       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1584*4e366538SXin Li       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1585*4e366538SXin Li       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1586*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1587*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1588*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm2          \n"  // 3*near+far (1)
1589*4e366538SXin Li 
1590*4e366538SXin Li       "vmovdqu     (%0,%3,2),%%xmm0              \n"  // 01234567 (16b)
1591*4e366538SXin Li       "vmovdqu     2(%0,%3,2),%%xmm1             \n"  // 12345678 (16b)
1592*4e366538SXin Li       "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"  // 0123000045670000
1593*4e366538SXin Li       "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"  // 1234000056780000
1594*4e366538SXin Li       "vpunpcklwd  %%ymm1,%%ymm0,%%ymm0          \n"  // 0112233445566778 (near)
1595*4e366538SXin Li       "vpshufb     %%ymm5,%%ymm0,%%ymm1          \n"  // 1021324354657687 (far)
1596*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm1          \n"  // near+far
1597*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near
1598*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm3          \n"  // 3*near+far (2)
1599*4e366538SXin Li 
1600*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm2,%%ymm0          \n"  // 6*near+2*far (1)
1601*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm3,%%ymm1          \n"  // 3*near+far+8 (2)
1602*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9*near+3*far (1)
1603*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (1)
1604*4e366538SXin Li       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1605*4e366538SXin Li       "vmovdqu     %%ymm0,(%1)                   \n"  // store above
1606*4e366538SXin Li 
1607*4e366538SXin Li       "vpaddw      %%ymm3,%%ymm3,%%ymm0          \n"  // 6*near+2*far (2)
1608*4e366538SXin Li       "vpaddw      %%ymm4,%%ymm2,%%ymm1          \n"  // 3*near+far+8 (1)
1609*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm3,%%ymm0          \n"  // 9*near+3*far (2)
1610*4e366538SXin Li       "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9 3 3 1 + 8 (2)
1611*4e366538SXin Li       "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16
1612*4e366538SXin Li       "vmovdqu     %%ymm0,(%1,%4,2)              \n"  // store below
1613*4e366538SXin Li 
1614*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1615*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 8 samples to 16 samples
1616*4e366538SXin Li       "sub         $0x10,%2                      \n"
1617*4e366538SXin Li       "jg          1b                            \n"
1618*4e366538SXin Li       "vzeroupper                                \n"
1619*4e366538SXin Li       : "+r"(src_ptr),                // %0
1620*4e366538SXin Li         "+r"(dst_ptr),                // %1
1621*4e366538SXin Li         "+r"(dst_width)               // %2
1622*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1623*4e366538SXin Li         "r"((intptr_t)(dst_stride)),  // %4
1624*4e366538SXin Li         "m"(kLinearShuffleFar)        // %5
1625*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
1626*4e366538SXin Li }
1627*4e366538SXin Li #endif
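// The bilinear 2x upsampler above applies the separable (3,1)x(3,1) kernel:
// each output is (9*near + 3*far_h + 3*far_v + 1*far_diag + 8) >> 4, built
// from two per-row "3*near+far" terms. A scalar sketch of one column
// (illustrative only; s/t/dst_above/dst_below are not names from this file):
//   int r0 = 3 * s[i] + s[i + 1];               // 3*near+far, row 0
//   int r1 = 3 * t[i] + t[i + 1];               // 3*near+far, row 1
//   dst_above[2 * i] = (3 * r0 + r1 + 8) >> 4;  // 9,3,3,1 weights + 8,
//   dst_below[2 * i] = (r0 + 3 * r1 + 8) >> 4;  // then div by 16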
1628*4e366538SXin Li 
1629*4e366538SXin Li #ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
1630*4e366538SXin Li void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
1631*4e366538SXin Li                                 uint16_t* dst_ptr,
1632*4e366538SXin Li                                 int dst_width) {
1633*4e366538SXin Li   asm volatile(
1634*4e366538SXin Li       "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
1635*4e366538SXin Li       "vpsrld      $31,%%ymm4,%%ymm4             \n"
1636*4e366538SXin Li       "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2
1637*4e366538SXin Li 
1638*4e366538SXin Li       LABELALIGN
1639*4e366538SXin Li       "1:                                        \n"
1640*4e366538SXin Li       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1641*4e366538SXin Li       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1642*4e366538SXin Li 
1643*4e366538SXin Li       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1644*4e366538SXin Li       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1645*4e366538SXin Li 
1646*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1647*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1648*4e366538SXin Li 
1649*4e366538SXin Li       "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
1650*4e366538SXin Li       "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
1651*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
1652*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
1653*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1654*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1655*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
1656*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)
1657*4e366538SXin Li 
1658*4e366538SXin Li       "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
1659*4e366538SXin Li       "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
1660*4e366538SXin Li       "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
1661*4e366538SXin Li       "vpshufd     $0b11011000,%%ymm0,%%ymm0     \n"
1662*4e366538SXin Li       "vmovdqu     %%ymm0,(%1)                   \n"
1663*4e366538SXin Li 
1664*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1665*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 8 pixels to 16 pixels
1666*4e366538SXin Li       "sub         $0x10,%2                      \n"
1667*4e366538SXin Li       "jg          1b                            \n"
1668*4e366538SXin Li       "vzeroupper                                \n"
1669*4e366538SXin Li       : "+r"(src_ptr),   // %0
1670*4e366538SXin Li         "+r"(dst_ptr),   // %1
1671*4e366538SXin Li         "+r"(dst_width)  // %2
1672*4e366538SXin Li       :
1673*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
1674*4e366538SXin Li }
1675*4e366538SXin Li #endif
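// Scalar sketch of the horizontal 2x linear kernel computed above
// (illustrative; matches the "3*near+far+2, then >>2" comments):
//   dst[2 * i + 0] = (3 * src[i] + src[i + 1] + 2) >> 2;
//   dst[2 * i + 1] = (src[i] + 3 * src[i + 1] + 2) >> 2;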
1676*4e366538SXin Li 
1677*4e366538SXin Li #ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
1678*4e366538SXin Li void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
1679*4e366538SXin Li                                   ptrdiff_t src_stride,
1680*4e366538SXin Li                                   uint16_t* dst_ptr,
1681*4e366538SXin Li                                   ptrdiff_t dst_stride,
1682*4e366538SXin Li                                   int dst_width) {
1683*4e366538SXin Li   asm volatile(
1684*4e366538SXin Li       "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
1685*4e366538SXin Li       "vpsrld      $31,%%ymm6,%%ymm6             \n"
1686*4e366538SXin Li       "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8
1687*4e366538SXin Li 
1688*4e366538SXin Li       LABELALIGN
1689*4e366538SXin Li       "1:                                        \n"
1690*4e366538SXin Li 
1691*4e366538SXin Li       "vmovdqu     (%0),%%xmm0                   \n"  // 01234567 (16b, 1u1v)
1692*4e366538SXin Li       "vmovdqu     2(%0),%%xmm1                  \n"  // 12345678 (16b, 1u1v)
1693*4e366538SXin Li       "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
1694*4e366538SXin Li       "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
1695*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm0,%%ymm2     \n"  // 10325476 (lo, far)
1696*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm1,%%ymm3     \n"  // 21436587 (hi, far)
1697*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
1698*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
1699*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
1700*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
1701*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (1, lo)
1702*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (1, hi)
1703*4e366538SXin Li 
1704*4e366538SXin Li       "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 01234567 (16b, 1u1v)
1705*4e366538SXin Li       "vmovdqu     2(%0,%3,2),%%xmm3             \n"  // 12345678 (16b, 1u1v)
1706*4e366538SXin Li       "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
1707*4e366538SXin Li       "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
1708*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm2,%%ymm4     \n"  // 10325476 (lo, far)
1709*4e366538SXin Li       "vpshufd     $0b10110001,%%ymm3,%%ymm5     \n"  // 21436587 (hi, far)
1710*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
1711*4e366538SXin Li       "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
1712*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
1713*4e366538SXin Li       "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
1714*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (2, lo)
1715*4e366538SXin Li       "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (2, hi)
1716*4e366538SXin Li 
1717*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
1718*4e366538SXin Li       "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
1719*4e366538SXin Li       "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
1720*4e366538SXin Li       "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
1721*4e366538SXin Li       "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)
1722*4e366538SXin Li 
1723*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
1724*4e366538SXin Li       "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
1725*4e366538SXin Li       "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
1726*4e366538SXin Li       "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
1727*4e366538SXin Li       "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)
1728*4e366538SXin Li 
1729*4e366538SXin Li       "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
1730*4e366538SXin Li       "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
1731*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
1732*4e366538SXin Li       "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
1733*4e366538SXin Li       "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)
1734*4e366538SXin Li 
1735*4e366538SXin Li       "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
1736*4e366538SXin Li       "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
1737*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
1738*4e366538SXin Li       "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
1739*4e366538SXin Li       "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)
1740*4e366538SXin Li 
1741*4e366538SXin Li       "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
1742*4e366538SXin Li       "vpshufd     $0b11011000,%%ymm4,%%ymm4     \n"
1743*4e366538SXin Li       "vmovdqu     %%ymm4,(%1)                   \n"  // store above
1744*4e366538SXin Li       "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
1745*4e366538SXin Li       "vpshufd     $0b11011000,%%ymm5,%%ymm5     \n"
1746*4e366538SXin Li       "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below
1747*4e366538SXin Li 
1748*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
1749*4e366538SXin Li       "lea         0x20(%1),%1                   \n"  // 8 pixels to 16 pixels
1750*4e366538SXin Li       "sub         $0x10,%2                      \n"
1751*4e366538SXin Li       "jg          1b                            \n"
1752*4e366538SXin Li       "vzeroupper                                \n"
1753*4e366538SXin Li       : "+r"(src_ptr),                // %0
1754*4e366538SXin Li         "+r"(dst_ptr),                // %1
1755*4e366538SXin Li         "+r"(dst_width)               // %2
1756*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
1757*4e366538SXin Li         "r"((intptr_t)(dst_stride))   // %4
1758*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
1759*4e366538SXin Li }
1760*4e366538SXin Li #endif
1761*4e366538SXin Li 
1762*4e366538SXin Li // Reads 16xN bytes and produces 16 shorts at a time.
1763*4e366538SXin Li void ScaleAddRow_SSE2(const uint8_t* src_ptr,
1764*4e366538SXin Li                       uint16_t* dst_ptr,
1765*4e366538SXin Li                       int src_width) {
1766*4e366538SXin Li       asm volatile("pxor        %%xmm5,%%xmm5                 \n"
1767*4e366538SXin Li 
1768*4e366538SXin Li                // 16 pixel loop.
1769*4e366538SXin Li                LABELALIGN
1770*4e366538SXin Li       "1:                                        \n"
1771*4e366538SXin Li       "movdqu      (%0),%%xmm3                   \n"
1772*4e366538SXin Li       "lea         0x10(%0),%0                   \n"  // src_ptr += 16
1773*4e366538SXin Li       "movdqu      (%1),%%xmm0                   \n"
1774*4e366538SXin Li       "movdqu      0x10(%1),%%xmm1               \n"
1775*4e366538SXin Li       "movdqa      %%xmm3,%%xmm2                 \n"
1776*4e366538SXin Li       "punpcklbw   %%xmm5,%%xmm2                 \n"
1777*4e366538SXin Li       "punpckhbw   %%xmm5,%%xmm3                 \n"
1778*4e366538SXin Li       "paddusw     %%xmm2,%%xmm0                 \n"
1779*4e366538SXin Li       "paddusw     %%xmm3,%%xmm1                 \n"
1780*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
1781*4e366538SXin Li       "movdqu      %%xmm1,0x10(%1)               \n"
1782*4e366538SXin Li       "lea         0x20(%1),%1                   \n"
1783*4e366538SXin Li       "sub         $0x10,%2                      \n"
1784*4e366538SXin Li       "jg          1b                            \n"
1785*4e366538SXin Li                : "+r"(src_ptr),   // %0
1786*4e366538SXin Li                  "+r"(dst_ptr),   // %1
1787*4e366538SXin Li                  "+r"(src_width)  // %2
1788*4e366538SXin Li                :
1789*4e366538SXin Li                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1790*4e366538SXin Li }
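// Scalar equivalent of the accumulation above (a sketch; note the asm uses
// paddusw, so sums saturate at 65535 instead of wrapping):
//   for (x = 0; x < src_width; ++x) {
//     dst_ptr[x] += src_ptr[x];  // widen 8-bit samples into 16-bit sums
//   }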
1791*4e366538SXin Li 
1792*4e366538SXin Li #ifdef HAS_SCALEADDROW_AVX2
1793*4e366538SXin Li // Reads 32 bytes and accumulates to 32 shorts at a time.
1794*4e366538SXin Li void ScaleAddRow_AVX2(const uint8_t* src_ptr,
1795*4e366538SXin Li                       uint16_t* dst_ptr,
1796*4e366538SXin Li                       int src_width) {
1797*4e366538SXin Li       asm volatile("vpxor       %%ymm5,%%ymm5,%%ymm5          \n"
1798*4e366538SXin Li 
1799*4e366538SXin Li                LABELALIGN
1800*4e366538SXin Li       "1:                                        \n"
1801*4e366538SXin Li       "vmovdqu     (%0),%%ymm3                   \n"
1802*4e366538SXin Li       "lea         0x20(%0),%0                   \n"  // src_ptr += 32
1803*4e366538SXin Li       "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
1804*4e366538SXin Li       "vpunpcklbw  %%ymm5,%%ymm3,%%ymm2          \n"
1805*4e366538SXin Li       "vpunpckhbw  %%ymm5,%%ymm3,%%ymm3          \n"
1806*4e366538SXin Li       "vpaddusw    (%1),%%ymm2,%%ymm0            \n"
1807*4e366538SXin Li       "vpaddusw    0x20(%1),%%ymm3,%%ymm1        \n"
1808*4e366538SXin Li       "vmovdqu     %%ymm0,(%1)                   \n"
1809*4e366538SXin Li       "vmovdqu     %%ymm1,0x20(%1)               \n"
1810*4e366538SXin Li       "lea         0x40(%1),%1                   \n"
1811*4e366538SXin Li       "sub         $0x20,%2                      \n"
1812*4e366538SXin Li       "jg          1b                            \n"
1813*4e366538SXin Li       "vzeroupper                                \n"
1814*4e366538SXin Li                : "+r"(src_ptr),   // %0
1815*4e366538SXin Li                  "+r"(dst_ptr),   // %1
1816*4e366538SXin Li                  "+r"(src_width)  // %2
1817*4e366538SXin Li                :
1818*4e366538SXin Li                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
1819*4e366538SXin Li }
1820*4e366538SXin Li #endif  // HAS_SCALEADDROW_AVX2
1821*4e366538SXin Li 
1822*4e366538SXin Li // Constant for making pixels signed to avoid pmaddubsw
1823*4e366538SXin Li // saturation.
1824*4e366538SXin Li static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
1825*4e366538SXin Li                               0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
1826*4e366538SXin Li 
1827*4e366538SXin Li // Constant for making pixels unsigned and adding .5 for rounding.
1828*4e366538SXin Li static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
1829*4e366538SXin Li                                0x4040, 0x4040, 0x4040, 0x4040};
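// Why these constants work: with weights f and (128 - f) summing to 128,
// biasing both pixels by -128 (kFsub80) before pmaddubsw gives
//   (128 - f)*(a - 128) + f*(b - 128) = (128 - f)*a + f*b - 16384,
// which stays inside the signed 16-bit range. Adding kFadd40
// (0x4040 = 16384 + 64) restores the bias and rounds before the final >> 7.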
1830*4e366538SXin Li 
1831*4e366538SXin Li // Bilinear column filtering. SSSE3 version.
1832*4e366538SXin Li void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
1833*4e366538SXin Li                            const uint8_t* src_ptr,
1834*4e366538SXin Li                            int dst_width,
1835*4e366538SXin Li                            int x,
1836*4e366538SXin Li                            int dx) {
1837*4e366538SXin Li   intptr_t x0, x1, temp_pixel;
1838*4e366538SXin Li   asm volatile(
1839*4e366538SXin Li       "movd        %6,%%xmm2                     \n"
1840*4e366538SXin Li       "movd        %7,%%xmm3                     \n"
1841*4e366538SXin Li       "movl        $0x04040000,%k2               \n"
1842*4e366538SXin Li       "movd        %k2,%%xmm5                    \n"
1843*4e366538SXin Li       "pcmpeqb     %%xmm6,%%xmm6                 \n"
1844*4e366538SXin Li       "psrlw       $0x9,%%xmm6                   \n"  // 0x007f007f
1845*4e366538SXin Li       "pcmpeqb     %%xmm7,%%xmm7                 \n"
1846*4e366538SXin Li       "psrlw       $15,%%xmm7                    \n"  // 0x00010001
1847*4e366538SXin Li 
1848*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k3               \n"
1849*4e366538SXin Li       "subl        $0x2,%5                       \n"
1850*4e366538SXin Li       "jl          29f                           \n"
1851*4e366538SXin Li       "movdqa      %%xmm2,%%xmm0                 \n"
1852*4e366538SXin Li       "paddd       %%xmm3,%%xmm0                 \n"
1853*4e366538SXin Li       "punpckldq   %%xmm0,%%xmm2                 \n"
1854*4e366538SXin Li       "punpckldq   %%xmm3,%%xmm3                 \n"
1855*4e366538SXin Li       "paddd       %%xmm3,%%xmm3                 \n"
1856*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k4               \n"
1857*4e366538SXin Li 
1858*4e366538SXin Li       LABELALIGN
1859*4e366538SXin Li       "2:                                        \n"
1860*4e366538SXin Li       "movdqa      %%xmm2,%%xmm1                 \n"
1861*4e366538SXin Li       "paddd       %%xmm3,%%xmm2                 \n"
1862*4e366538SXin Li       "movzwl      0x00(%1,%3,1),%k2             \n"
1863*4e366538SXin Li       "movd        %k2,%%xmm0                    \n"
1864*4e366538SXin Li       "psrlw       $0x9,%%xmm1                   \n"
1865*4e366538SXin Li       "movzwl      0x00(%1,%4,1),%k2             \n"
1866*4e366538SXin Li       "movd        %k2,%%xmm4                    \n"
1867*4e366538SXin Li       "pshufb      %%xmm5,%%xmm1                 \n"
1868*4e366538SXin Li       "punpcklwd   %%xmm4,%%xmm0                 \n"
1869*4e366538SXin Li       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1870*4e366538SXin Li       "pxor        %%xmm6,%%xmm1                 \n"  // 128 - f = (f ^ 127) + 1
1872*4e366538SXin Li       "paddusb     %%xmm7,%%xmm1                 \n"
1873*4e366538SXin Li       "pmaddubsw   %%xmm0,%%xmm1                 \n"
1874*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k3               \n"
1875*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k4               \n"
1876*4e366538SXin Li       "paddw       %9,%%xmm1                     \n"  // make pixels unsigned.
1877*4e366538SXin Li       "psrlw       $0x7,%%xmm1                   \n"
1878*4e366538SXin Li       "packuswb    %%xmm1,%%xmm1                 \n"
1879*4e366538SXin Li       "movd        %%xmm1,%k2                    \n"
1880*4e366538SXin Li       "mov         %w2,(%0)                      \n"
1881*4e366538SXin Li       "lea         0x2(%0),%0                    \n"
1882*4e366538SXin Li       "subl        $0x2,%5                       \n"
1883*4e366538SXin Li       "jge         2b                            \n"
1884*4e366538SXin Li 
1885*4e366538SXin Li       LABELALIGN
1886*4e366538SXin Li       "29:                                       \n"
1887*4e366538SXin Li       "addl        $0x1,%5                       \n"
1888*4e366538SXin Li       "jl          99f                           \n"
1889*4e366538SXin Li       "movzwl      0x00(%1,%3,1),%k2             \n"
1890*4e366538SXin Li       "movd        %k2,%%xmm0                    \n"
1891*4e366538SXin Li       "psrlw       $0x9,%%xmm2                   \n"
1892*4e366538SXin Li       "pshufb      %%xmm5,%%xmm2                 \n"
1893*4e366538SXin Li       "psubb       %8,%%xmm0                     \n"  // make pixels signed.
1894*4e366538SXin Li       "pxor        %%xmm6,%%xmm2                 \n"
1895*4e366538SXin Li       "paddusb     %%xmm7,%%xmm2                 \n"
1896*4e366538SXin Li       "pmaddubsw   %%xmm0,%%xmm2                 \n"
1897*4e366538SXin Li       "paddw       %9,%%xmm2                     \n"  // make pixels unsigned.
1898*4e366538SXin Li       "psrlw       $0x7,%%xmm2                   \n"
1899*4e366538SXin Li       "packuswb    %%xmm2,%%xmm2                 \n"
1900*4e366538SXin Li       "movd        %%xmm2,%k2                    \n"
1901*4e366538SXin Li       "mov         %b2,(%0)                      \n"
1902*4e366538SXin Li       "99:                                       \n"
1903*4e366538SXin Li       : "+r"(dst_ptr),      // %0
1904*4e366538SXin Li         "+r"(src_ptr),      // %1
1905*4e366538SXin Li         "=&a"(temp_pixel),  // %2
1906*4e366538SXin Li         "=&r"(x0),          // %3
1907*4e366538SXin Li         "=&r"(x1),          // %4
1908*4e366538SXin Li #if defined(__x86_64__)
1909*4e366538SXin Li         "+rm"(dst_width)  // %5
1910*4e366538SXin Li #else
1911*4e366538SXin Li         "+m"(dst_width)  // %5
1912*4e366538SXin Li #endif
1913*4e366538SXin Li       : "rm"(x),   // %6
1914*4e366538SXin Li         "rm"(dx),  // %7
1915*4e366538SXin Li #if defined(__x86_64__)
1916*4e366538SXin Li         "x"(kFsub80),  // %8
1917*4e366538SXin Li         "x"(kFadd40)   // %9
1918*4e366538SXin Li #else
1919*4e366538SXin Li         "m"(kFsub80),    // %8
1920*4e366538SXin Li         "m"(kFadd40)     // %9
1921*4e366538SXin Li #endif
1922*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
1923*4e366538SXin Li         "xmm7");
1924*4e366538SXin Li }
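// Scalar sketch of one output pixel of the column filter above
// (illustrative; f is the 7-bit fraction the asm extracts with psrlw $9):
//   int f = (x >> 9) & 0x7f;
//   int a = src_ptr[x >> 16];
//   int b = src_ptr[(x >> 16) + 1];
//   *dst_ptr++ = (uint8_t)((a * (128 - f) + b * f + 64) >> 7);
//   x += dx;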
1925*4e366538SXin Li 
1926*4e366538SXin Li // Reads 16 pixels, duplicates them and writes 32 pixels.
1927*4e366538SXin Li // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
1928*4e366538SXin Li void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
1929*4e366538SXin Li                        const uint8_t* src_ptr,
1930*4e366538SXin Li                        int dst_width,
1931*4e366538SXin Li                        int x,
1932*4e366538SXin Li                        int dx) {
1933*4e366538SXin Li   (void)x;
1934*4e366538SXin Li   (void)dx;
1935*4e366538SXin Li   asm volatile(LABELALIGN
1936*4e366538SXin Li       "1:                                        \n"
1937*4e366538SXin Li       "movdqu      (%1),%%xmm0                   \n"
1938*4e366538SXin Li       "lea         0x10(%1),%1                   \n"
1939*4e366538SXin Li       "movdqa      %%xmm0,%%xmm1                 \n"
1940*4e366538SXin Li       "punpcklbw   %%xmm0,%%xmm0                 \n"
1941*4e366538SXin Li       "punpckhbw   %%xmm1,%%xmm1                 \n"
1942*4e366538SXin Li       "movdqu      %%xmm0,(%0)                   \n"
1943*4e366538SXin Li       "movdqu      %%xmm1,0x10(%0)               \n"
1944*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
1945*4e366538SXin Li       "sub         $0x20,%2                      \n"
1946*4e366538SXin Li       "jg          1b                            \n"
1947*4e366538SXin Li 
1948*4e366538SXin Li                : "+r"(dst_ptr),   // %0
1949*4e366538SXin Li                  "+r"(src_ptr),   // %1
1950*4e366538SXin Li                  "+r"(dst_width)  // %2
1951*4e366538SXin Li                  ::"memory",
1952*4e366538SXin Li                  "cc", "xmm0", "xmm1");
1953*4e366538SXin Li }
1954*4e366538SXin Li 
1955*4e366538SXin Li void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
1956*4e366538SXin Li                             ptrdiff_t src_stride,
1957*4e366538SXin Li                             uint8_t* dst_argb,
1958*4e366538SXin Li                             int dst_width) {
1959*4e366538SXin Li   (void)src_stride;
1960*4e366538SXin Li   asm volatile(LABELALIGN
1961*4e366538SXin Li       "1:                                        \n"
1962*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"
1963*4e366538SXin Li       "movdqu      0x10(%0),%%xmm1               \n"
1964*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
1965*4e366538SXin Li       "shufps      $0xdd,%%xmm1,%%xmm0           \n"
1966*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
1967*4e366538SXin Li       "lea         0x10(%1),%1                   \n"
1968*4e366538SXin Li       "sub         $0x4,%2                       \n"
1969*4e366538SXin Li       "jg          1b                            \n"
1970*4e366538SXin Li                : "+r"(src_argb),  // %0
1971*4e366538SXin Li                  "+r"(dst_argb),  // %1
1972*4e366538SXin Li                  "+r"(dst_width)  // %2
1973*4e366538SXin Li                  ::"memory",
1974*4e366538SXin Li                  "cc", "xmm0", "xmm1");
1975*4e366538SXin Li }
1976*4e366538SXin Li 
1977*4e366538SXin Li void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
1978*4e366538SXin Li                                   ptrdiff_t src_stride,
1979*4e366538SXin Li                                   uint8_t* dst_argb,
1980*4e366538SXin Li                                   int dst_width) {
1981*4e366538SXin Li   (void)src_stride;
1982*4e366538SXin Li   asm volatile(LABELALIGN
1983*4e366538SXin Li       "1:                                        \n"
1984*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"
1985*4e366538SXin Li       "movdqu      0x10(%0),%%xmm1               \n"
1986*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
1987*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
1988*4e366538SXin Li       "shufps      $0x88,%%xmm1,%%xmm0           \n"
1989*4e366538SXin Li       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
1990*4e366538SXin Li       "pavgb       %%xmm2,%%xmm0                 \n"
1991*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
1992*4e366538SXin Li       "lea         0x10(%1),%1                   \n"
1993*4e366538SXin Li       "sub         $0x4,%2                       \n"
1994*4e366538SXin Li       "jg          1b                            \n"
1995*4e366538SXin Li                : "+r"(src_argb),  // %0
1996*4e366538SXin Li                  "+r"(dst_argb),  // %1
1997*4e366538SXin Li                  "+r"(dst_width)  // %2
1998*4e366538SXin Li                  ::"memory",
1999*4e366538SXin Li                  "cc", "xmm0", "xmm1", "xmm2");
2000*4e366538SXin Li }
2001*4e366538SXin Li 
2002*4e366538SXin Li void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
2003*4e366538SXin Li                                ptrdiff_t src_stride,
2004*4e366538SXin Li                                uint8_t* dst_argb,
2005*4e366538SXin Li                                int dst_width) {
2006*4e366538SXin Li   asm volatile(LABELALIGN
2007*4e366538SXin Li       "1:                                        \n"
2008*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"
2009*4e366538SXin Li       "movdqu      0x10(%0),%%xmm1               \n"
2010*4e366538SXin Li       "movdqu      0x00(%0,%3,1),%%xmm2          \n"
2011*4e366538SXin Li       "movdqu      0x10(%0,%3,1),%%xmm3          \n"
2012*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
2013*4e366538SXin Li       "pavgb       %%xmm2,%%xmm0                 \n"
2014*4e366538SXin Li       "pavgb       %%xmm3,%%xmm1                 \n"
2015*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
2016*4e366538SXin Li       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2017*4e366538SXin Li       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2018*4e366538SXin Li       "pavgb       %%xmm2,%%xmm0                 \n"
2019*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
2020*4e366538SXin Li       "lea         0x10(%1),%1                   \n"
2021*4e366538SXin Li       "sub         $0x4,%2                       \n"
2022*4e366538SXin Li       "jg          1b                            \n"
2023*4e366538SXin Li                : "+r"(src_argb),              // %0
2024*4e366538SXin Li                  "+r"(dst_argb),              // %1
2025*4e366538SXin Li                  "+r"(dst_width)              // %2
2026*4e366538SXin Li                : "r"((intptr_t)(src_stride))  // %3
2027*4e366538SXin Li                : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2028*4e366538SXin Li }
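// Note: pavgb computes (a + b + 1) >> 1, so the cascaded vertical-then-
// horizontal averages above can exceed the exact box filter
// (a + b + c + d + 2) >> 2 by at most 1 due to the double rounding.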
2029*4e366538SXin Li 
2030*4e366538SXin Li // Reads 4 pixels at a time.
2031*4e366538SXin Li // Alignment requirement: dst_argb 16 byte aligned.
2032*4e366538SXin Li void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
2033*4e366538SXin Li                                ptrdiff_t src_stride,
2034*4e366538SXin Li                                int src_stepx,
2035*4e366538SXin Li                                uint8_t* dst_argb,
2036*4e366538SXin Li                                int dst_width) {
2037*4e366538SXin Li   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2038*4e366538SXin Li   intptr_t src_stepx_x12;
2039*4e366538SXin Li   (void)src_stride;
2040*4e366538SXin Li   asm volatile(
2041*4e366538SXin Li       "lea         0x00(,%1,4),%1                \n"
2042*4e366538SXin Li       "lea         0x00(%1,%1,2),%4              \n"
2043*4e366538SXin Li 
2044*4e366538SXin Li       LABELALIGN
2045*4e366538SXin Li       "1:                                        \n"
2046*4e366538SXin Li       "movd        (%0),%%xmm0                   \n"
2047*4e366538SXin Li       "movd        0x00(%0,%1,1),%%xmm1          \n"
2048*4e366538SXin Li       "punpckldq   %%xmm1,%%xmm0                 \n"
2049*4e366538SXin Li       "movd        0x00(%0,%1,2),%%xmm2          \n"
2050*4e366538SXin Li       "movd        0x00(%0,%4,1),%%xmm3          \n"
2051*4e366538SXin Li       "lea         0x00(%0,%1,4),%0              \n"
2052*4e366538SXin Li       "punpckldq   %%xmm3,%%xmm2                 \n"
2053*4e366538SXin Li       "punpcklqdq  %%xmm2,%%xmm0                 \n"
2054*4e366538SXin Li       "movdqu      %%xmm0,(%2)                   \n"
2055*4e366538SXin Li       "lea         0x10(%2),%2                   \n"
2056*4e366538SXin Li       "sub         $0x4,%3                       \n"
2057*4e366538SXin Li       "jg          1b                            \n"
2058*4e366538SXin Li       : "+r"(src_argb),       // %0
2059*4e366538SXin Li         "+r"(src_stepx_x4),   // %1
2060*4e366538SXin Li         "+r"(dst_argb),       // %2
2061*4e366538SXin Li         "+r"(dst_width),      // %3
2062*4e366538SXin Li         "=&r"(src_stepx_x12)  // %4
2063*4e366538SXin Li         ::"memory",
2064*4e366538SXin Li         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2065*4e366538SXin Li }
2066*4e366538SXin Li 
2067*4e366538SXin Li // Blends four 2x2 pixel blocks down to 4x1.
2068*4e366538SXin Li // Alignment requirement: dst_argb 16 byte aligned.
2069*4e366538SXin Li void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
2070*4e366538SXin Li                                   ptrdiff_t src_stride,
2071*4e366538SXin Li                                   int src_stepx,
2072*4e366538SXin Li                                   uint8_t* dst_argb,
2073*4e366538SXin Li                                   int dst_width) {
2074*4e366538SXin Li   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
2075*4e366538SXin Li   intptr_t src_stepx_x12;
2076*4e366538SXin Li   intptr_t row1 = (intptr_t)(src_stride);
2077*4e366538SXin Li   asm volatile(
2078*4e366538SXin Li       "lea         0x00(,%1,4),%1                \n"
2079*4e366538SXin Li       "lea         0x00(%1,%1,2),%4              \n"
2080*4e366538SXin Li       "lea         0x00(%0,%5,1),%5              \n"
2081*4e366538SXin Li 
2082*4e366538SXin Li       LABELALIGN
2083*4e366538SXin Li       "1:                                        \n"
2084*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"
2085*4e366538SXin Li       "movhps      0x00(%0,%1,1),%%xmm0          \n"
2086*4e366538SXin Li       "movq        0x00(%0,%1,2),%%xmm1          \n"
2087*4e366538SXin Li       "movhps      0x00(%0,%4,1),%%xmm1          \n"
2088*4e366538SXin Li       "lea         0x00(%0,%1,4),%0              \n"
2089*4e366538SXin Li       "movq        (%5),%%xmm2                   \n"
2090*4e366538SXin Li       "movhps      0x00(%5,%1,1),%%xmm2          \n"
2091*4e366538SXin Li       "movq        0x00(%5,%1,2),%%xmm3          \n"
2092*4e366538SXin Li       "movhps      0x00(%5,%4,1),%%xmm3          \n"
2093*4e366538SXin Li       "lea         0x00(%5,%1,4),%5              \n"
2094*4e366538SXin Li       "pavgb       %%xmm2,%%xmm0                 \n"
2095*4e366538SXin Li       "pavgb       %%xmm3,%%xmm1                 \n"
2096*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
2097*4e366538SXin Li       "shufps      $0x88,%%xmm1,%%xmm0           \n"
2098*4e366538SXin Li       "shufps      $0xdd,%%xmm1,%%xmm2           \n"
2099*4e366538SXin Li       "pavgb       %%xmm2,%%xmm0                 \n"
2100*4e366538SXin Li       "movdqu      %%xmm0,(%2)                   \n"
2101*4e366538SXin Li       "lea         0x10(%2),%2                   \n"
2102*4e366538SXin Li       "sub         $0x4,%3                       \n"
2103*4e366538SXin Li       "jg          1b                            \n"
2104*4e366538SXin Li       : "+r"(src_argb),        // %0
2105*4e366538SXin Li         "+r"(src_stepx_x4),    // %1
2106*4e366538SXin Li         "+r"(dst_argb),        // %2
2107*4e366538SXin Li         "+rm"(dst_width),      // %3
2108*4e366538SXin Li         "=&r"(src_stepx_x12),  // %4
2109*4e366538SXin Li         "+r"(row1)             // %5
2110*4e366538SXin Li         ::"memory",
2111*4e366538SXin Li         "cc", "xmm0", "xmm1", "xmm2", "xmm3");
2112*4e366538SXin Li }
2113*4e366538SXin Li 
2114*4e366538SXin Li void ScaleARGBCols_SSE2(uint8_t* dst_argb,
2115*4e366538SXin Li                         const uint8_t* src_argb,
2116*4e366538SXin Li                         int dst_width,
2117*4e366538SXin Li                         int x,
2118*4e366538SXin Li                         int dx) {
2119*4e366538SXin Li   intptr_t x0, x1;
2120*4e366538SXin Li   asm volatile(
2121*4e366538SXin Li       "movd        %5,%%xmm2                     \n"
2122*4e366538SXin Li       "movd        %6,%%xmm3                     \n"
2123*4e366538SXin Li       "pshufd      $0x0,%%xmm2,%%xmm2            \n"
2124*4e366538SXin Li       "pshufd      $0x11,%%xmm3,%%xmm0           \n"
2125*4e366538SXin Li       "paddd       %%xmm0,%%xmm2                 \n"
2126*4e366538SXin Li       "paddd       %%xmm3,%%xmm3                 \n"
2127*4e366538SXin Li       "pshufd      $0x5,%%xmm3,%%xmm0            \n"
2128*4e366538SXin Li       "paddd       %%xmm0,%%xmm2                 \n"
2129*4e366538SXin Li       "paddd       %%xmm3,%%xmm3                 \n"
2130*4e366538SXin Li       "pshufd      $0x0,%%xmm3,%%xmm3            \n"
2131*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k0               \n"
2132*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k1               \n"
2133*4e366538SXin Li       "cmp         $0x0,%4                       \n"
2134*4e366538SXin Li       "jl          99f                           \n"
2135*4e366538SXin Li       "sub         $0x4,%4                       \n"
2136*4e366538SXin Li       "jl          49f                           \n"
2137*4e366538SXin Li 
2138*4e366538SXin Li       LABELALIGN
2139*4e366538SXin Li       "40:                                       \n"
2140*4e366538SXin Li       "movd        0x00(%3,%0,4),%%xmm0          \n"
2141*4e366538SXin Li       "movd        0x00(%3,%1,4),%%xmm1          \n"
2142*4e366538SXin Li       "pextrw      $0x5,%%xmm2,%k0               \n"
2143*4e366538SXin Li       "pextrw      $0x7,%%xmm2,%k1               \n"
2144*4e366538SXin Li       "paddd       %%xmm3,%%xmm2                 \n"
2145*4e366538SXin Li       "punpckldq   %%xmm1,%%xmm0                 \n"
2146*4e366538SXin Li       "movd        0x00(%3,%0,4),%%xmm1          \n"
2147*4e366538SXin Li       "movd        0x00(%3,%1,4),%%xmm4          \n"
2148*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k0               \n"
2149*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k1               \n"
2150*4e366538SXin Li       "punpckldq   %%xmm4,%%xmm1                 \n"
2151*4e366538SXin Li       "punpcklqdq  %%xmm1,%%xmm0                 \n"
2152*4e366538SXin Li       "movdqu      %%xmm0,(%2)                   \n"
2153*4e366538SXin Li       "lea         0x10(%2),%2                   \n"
2154*4e366538SXin Li       "sub         $0x4,%4                       \n"
2155*4e366538SXin Li       "jge         40b                           \n"
2156*4e366538SXin Li 
2157*4e366538SXin Li       "49:                                       \n"
2158*4e366538SXin Li       "test        $0x2,%4                       \n"
2159*4e366538SXin Li       "je          29f                           \n"
2160*4e366538SXin Li       "movd        0x00(%3,%0,4),%%xmm0          \n"
2161*4e366538SXin Li       "movd        0x00(%3,%1,4),%%xmm1          \n"
2162*4e366538SXin Li       "pextrw      $0x5,%%xmm2,%k0               \n"
2163*4e366538SXin Li       "punpckldq   %%xmm1,%%xmm0                 \n"
2164*4e366538SXin Li       "movq        %%xmm0,(%2)                   \n"
2165*4e366538SXin Li       "lea         0x8(%2),%2                    \n"
2166*4e366538SXin Li       "29:                                       \n"
2167*4e366538SXin Li       "test        $0x1,%4                       \n"
2168*4e366538SXin Li       "je          99f                           \n"
2169*4e366538SXin Li       "movd        0x00(%3,%0,4),%%xmm0          \n"
2170*4e366538SXin Li       "movd        %%xmm0,(%2)                   \n"
2171*4e366538SXin Li       "99:                                       \n"
2172*4e366538SXin Li       : "=&a"(x0),       // %0
2173*4e366538SXin Li         "=&d"(x1),       // %1
2174*4e366538SXin Li         "+r"(dst_argb),  // %2
2175*4e366538SXin Li         "+r"(src_argb),  // %3
2176*4e366538SXin Li         "+r"(dst_width)  // %4
2177*4e366538SXin Li       : "rm"(x),         // %5
2178*4e366538SXin Li         "rm"(dx)         // %6
2179*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
2180*4e366538SXin Li }
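// Scalar sketch of the nearest-pixel stepping above (illustrative; x and dx
// are 16.16 fixed point):
//   const uint32_t* src = (const uint32_t*)src_argb;
//   uint32_t* dst = (uint32_t*)dst_argb;
//   for (j = 0; j < dst_width; ++j) {
//     dst[j] = src[x >> 16];
//     x += dx;
//   }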
2181*4e366538SXin Li 
2182*4e366538SXin Li // Reads 4 pixels, duplicates them and writes 8 pixels.
2183*4e366538SXin Li // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
2184*4e366538SXin Li void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
2185*4e366538SXin Li                            const uint8_t* src_argb,
2186*4e366538SXin Li                            int dst_width,
2187*4e366538SXin Li                            int x,
2188*4e366538SXin Li                            int dx) {
2189*4e366538SXin Li   (void)x;
2190*4e366538SXin Li   (void)dx;
2191*4e366538SXin Li   asm volatile(LABELALIGN
2192*4e366538SXin Li       "1:                                        \n"
2193*4e366538SXin Li       "movdqu      (%1),%%xmm0                   \n"
2194*4e366538SXin Li       "lea         0x10(%1),%1                   \n"
2195*4e366538SXin Li       "movdqa      %%xmm0,%%xmm1                 \n"
2196*4e366538SXin Li       "punpckldq   %%xmm0,%%xmm0                 \n"
2197*4e366538SXin Li       "punpckhdq   %%xmm1,%%xmm1                 \n"
2198*4e366538SXin Li       "movdqu      %%xmm0,(%0)                   \n"
2199*4e366538SXin Li       "movdqu      %%xmm1,0x10(%0)               \n"
2200*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
2201*4e366538SXin Li       "sub         $0x8,%2                       \n"
2202*4e366538SXin Li       "jg          1b                            \n"
2203*4e366538SXin Li 
2204*4e366538SXin Li                : "+r"(dst_argb),  // %0
2205*4e366538SXin Li                  "+r"(src_argb),  // %1
2206*4e366538SXin Li                  "+r"(dst_width)  // %2
2207*4e366538SXin Li                  ::"memory",
2208*4e366538SXin Li                  "cc", "xmm0", "xmm1");
2209*4e366538SXin Li }
2210*4e366538SXin Li 
2211*4e366538SXin Li // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
2212*4e366538SXin Li static const uvec8 kShuffleColARGB = {
2213*4e366538SXin Li     0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
2214*4e366538SXin Li     8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
2215*4e366538SXin Li };
2216*4e366538SXin Li 
2217*4e366538SXin Li // Shuffle table for duplicating 2 fractions into 8 bytes each
2218*4e366538SXin Li static const uvec8 kShuffleFractions = {
2219*4e366538SXin Li     0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
2220*4e366538SXin Li };
2221*4e366538SXin Li 
2222*4e366538SXin Li // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
2223*4e366538SXin Li void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
2224*4e366538SXin Li                                const uint8_t* src_argb,
2225*4e366538SXin Li                                int dst_width,
2226*4e366538SXin Li                                int x,
2227*4e366538SXin Li                                int dx) {
2228*4e366538SXin Li   intptr_t x0, x1;
2229*4e366538SXin Li   asm volatile(
2230*4e366538SXin Li       "movdqa      %0,%%xmm4                     \n"
2231*4e366538SXin Li       "movdqa      %1,%%xmm5                     \n"
2232*4e366538SXin Li       :
2233*4e366538SXin Li       : "m"(kShuffleColARGB),   // %0
2234*4e366538SXin Li         "m"(kShuffleFractions)  // %1
2235*4e366538SXin Li   );
2236*4e366538SXin Li 
2237*4e366538SXin Li   asm volatile(
2238*4e366538SXin Li       "movd        %5,%%xmm2                     \n"
2239*4e366538SXin Li       "movd        %6,%%xmm3                     \n"
2240*4e366538SXin Li       "pcmpeqb     %%xmm6,%%xmm6                 \n"
2241*4e366538SXin Li       "psrlw       $0x9,%%xmm6                   \n"
2242*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k3               \n"
2243*4e366538SXin Li       "sub         $0x2,%2                       \n"
2244*4e366538SXin Li       "jl          29f                           \n"
2245*4e366538SXin Li       "movdqa      %%xmm2,%%xmm0                 \n"
2246*4e366538SXin Li       "paddd       %%xmm3,%%xmm0                 \n"
2247*4e366538SXin Li       "punpckldq   %%xmm0,%%xmm2                 \n"
2248*4e366538SXin Li       "punpckldq   %%xmm3,%%xmm3                 \n"
2249*4e366538SXin Li       "paddd       %%xmm3,%%xmm3                 \n"
2250*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k4               \n"
2251*4e366538SXin Li 
2252*4e366538SXin Li       LABELALIGN
2253*4e366538SXin Li       "2:                                        \n"
2254*4e366538SXin Li       "movdqa      %%xmm2,%%xmm1                 \n"
2255*4e366538SXin Li       "paddd       %%xmm3,%%xmm2                 \n"
2256*4e366538SXin Li       "movq        0x00(%1,%3,4),%%xmm0          \n"
2257*4e366538SXin Li       "psrlw       $0x9,%%xmm1                   \n"
2258*4e366538SXin Li       "movhps      0x00(%1,%4,4),%%xmm0          \n"
2259*4e366538SXin Li       "pshufb      %%xmm5,%%xmm1                 \n"
2260*4e366538SXin Li       "pshufb      %%xmm4,%%xmm0                 \n"
2261*4e366538SXin Li       "pxor        %%xmm6,%%xmm1                 \n"
2262*4e366538SXin Li       "pmaddubsw   %%xmm1,%%xmm0                 \n"
2263*4e366538SXin Li       "psrlw       $0x7,%%xmm0                   \n"
2264*4e366538SXin Li       "pextrw      $0x1,%%xmm2,%k3               \n"
2265*4e366538SXin Li       "pextrw      $0x3,%%xmm2,%k4               \n"
2266*4e366538SXin Li       "packuswb    %%xmm0,%%xmm0                 \n"
2267*4e366538SXin Li       "movq        %%xmm0,(%0)                   \n"
2268*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
2269*4e366538SXin Li       "sub         $0x2,%2                       \n"
2270*4e366538SXin Li       "jge         2b                            \n"
2271*4e366538SXin Li 
2272*4e366538SXin Li       LABELALIGN
2273*4e366538SXin Li       "29:                                       \n"
2274*4e366538SXin Li       "add         $0x1,%2                       \n"
2275*4e366538SXin Li       "jl          99f                           \n"
2276*4e366538SXin Li       "psrlw       $0x9,%%xmm2                   \n"
2277*4e366538SXin Li       "movq        0x00(%1,%3,4),%%xmm0          \n"
2278*4e366538SXin Li       "pshufb      %%xmm5,%%xmm2                 \n"
2279*4e366538SXin Li       "pshufb      %%xmm4,%%xmm0                 \n"
2280*4e366538SXin Li       "pxor        %%xmm6,%%xmm2                 \n"
2281*4e366538SXin Li       "pmaddubsw   %%xmm2,%%xmm0                 \n"
2282*4e366538SXin Li       "psrlw       $0x7,%%xmm0                   \n"
2283*4e366538SXin Li       "packuswb    %%xmm0,%%xmm0                 \n"
2284*4e366538SXin Li       "movd        %%xmm0,(%0)                   \n"
2285*4e366538SXin Li 
2286*4e366538SXin Li       LABELALIGN
2287*4e366538SXin Li       "99:                                       \n"  // clang-format error.
2288*4e366538SXin Li 
2289*4e366538SXin Li       : "+r"(dst_argb),    // %0
2290*4e366538SXin Li         "+r"(src_argb),    // %1
2291*4e366538SXin Li         "+rm"(dst_width),  // %2
2292*4e366538SXin Li         "=&r"(x0),         // %3
2293*4e366538SXin Li         "=&r"(x1)          // %4
2294*4e366538SXin Li       : "rm"(x),           // %5
2295*4e366538SXin Li         "rm"(dx)           // %6
2296*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2297*4e366538SXin Li }
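// Scalar sketch of the blend above (illustrative): each channel of the two
// source pixels is weighted with the 7-bit fraction f = (x >> 9) & 0x7f,
// approximately
//   dst_c = (a_c * (f ^ 0x7f) + b_c * f) >> 7;  // per channel B,G,R,A
// kShuffleColARGB interleaves the two pixels channel-by-channel for
// pmaddubsw, and kShuffleFractions broadcasts each pixel's fraction byte
// across its 8-byte half of the register.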
2298*4e366538SXin Li 
2299*4e366538SXin Li // Divide num by div and return as 16.16 fixed point result.
2300*4e366538SXin Li int FixedDiv_X86(int num, int div) {
2301*4e366538SXin Li   asm volatile(
2302*4e366538SXin Li       "cdq                                       \n"
2303*4e366538SXin Li       "shld        $0x10,%%eax,%%edx             \n"
2304*4e366538SXin Li       "shl         $0x10,%%eax                   \n"
2305*4e366538SXin Li       "idiv        %1                            \n"
2306*4e366538SXin Li       "mov         %0, %%eax                     \n"
2307*4e366538SXin Li       : "+a"(num)  // %0
2308*4e366538SXin Li       : "c"(div)   // %1
2309*4e366538SXin Li       : "memory", "cc", "edx");
2310*4e366538SXin Li   return num;
2311*4e366538SXin Li }
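// Equivalent portable C (a sketch; the asm builds the 64-bit dividend in
// edx:eax with shld/shl before the 32-bit idiv):
//   return (int)(((int64_t)(num) << 16) / div);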
2312*4e366538SXin Li 
2313*4e366538SXin Li // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
2314*4e366538SXin Li int FixedDiv1_X86(int num, int div) {
2315*4e366538SXin Li   asm volatile(
2316*4e366538SXin Li       "cdq                                       \n"
2317*4e366538SXin Li       "shld        $0x10,%%eax,%%edx             \n"
2318*4e366538SXin Li       "shl         $0x10,%%eax                   \n"
2319*4e366538SXin Li       "sub         $0x10001,%%eax                \n"
2320*4e366538SXin Li       "sbb         $0x0,%%edx                    \n"
2321*4e366538SXin Li       "sub         $0x1,%1                       \n"
2322*4e366538SXin Li       "idiv        %1                            \n"
2323*4e366538SXin Li       "mov         %0, %%eax                     \n"
2324*4e366538SXin Li       : "+a"(num)  // %0
2325*4e366538SXin Li       : "c"(div)   // %1
2326*4e366538SXin Li       : "memory", "cc", "edx");
2327*4e366538SXin Li   return num;
2328*4e366538SXin Li }
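// Equivalent portable C (a sketch; the sub/sbb pair subtracts 0x00010001
// from the 64-bit dividend before dividing by div - 1):
//   return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));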
2329*4e366538SXin Li 
2330*4e366538SXin Li #if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) || \
2331*4e366538SXin Li     defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
2332*4e366538SXin Li 
2333*4e366538SXin Li // Shuffle table for splitting UV into upper and lower part of register.
2334*4e366538SXin Li static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
2335*4e366538SXin Li                                       1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
2336*4e366538SXin Li static const uvec8 kShuffleMergeUV = {0u,   8u,   2u,   10u,  4u,   12u,
2337*4e366538SXin Li                                       6u,   14u,  0x80, 0x80, 0x80, 0x80,
2338*4e366538SXin Li                                       0x80, 0x80, 0x80, 0x80};
2339*4e366538SXin Li #endif
2340*4e366538SXin Li 
2341*4e366538SXin Li #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
2342*4e366538SXin Li 
2343*4e366538SXin Li void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
2344*4e366538SXin Li                               ptrdiff_t src_stride,
2345*4e366538SXin Li                               uint8_t* dst_ptr,
2346*4e366538SXin Li                               int dst_width) {
2347*4e366538SXin Li   asm volatile(
2348*4e366538SXin Li       "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
2349*4e366538SXin Li       "psrlw       $0xf,%%xmm4                   \n"
2350*4e366538SXin Li       "packuswb    %%xmm4,%%xmm4                 \n"
2351*4e366538SXin Li       "pxor        %%xmm5, %%xmm5                \n"  // zero
2352*4e366538SXin Li       "movdqa      %4,%%xmm1                     \n"  // split shuffler
2353*4e366538SXin Li       "movdqa      %5,%%xmm3                     \n"  // merge shuffler
2354*4e366538SXin Li 
2355*4e366538SXin Li       LABELALIGN
2356*4e366538SXin Li       "1:                                        \n"
2357*4e366538SXin Li       "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
2358*4e366538SXin Li       "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
2359*4e366538SXin Li       "lea         0x10(%0),%0                   \n"
2360*4e366538SXin Li       "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
2361*4e366538SXin Li       "pshufb      %%xmm1,%%xmm2                 \n"
2362*4e366538SXin Li       "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
2363*4e366538SXin Li       "pmaddubsw   %%xmm4,%%xmm2                 \n"
2364*4e366538SXin Li       "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
2365*4e366538SXin Li       "psrlw       $0x1,%%xmm0                   \n"  // round
2366*4e366538SXin Li       "pavgw       %%xmm5,%%xmm0                 \n"
2367*4e366538SXin Li       "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
2368*4e366538SXin Li       "movq        %%xmm0,(%1)                   \n"
2369*4e366538SXin Li       "lea         0x8(%1),%1                    \n"  // 4 UV
2370*4e366538SXin Li       "sub         $0x4,%2                       \n"
2371*4e366538SXin Li       "jg          1b                            \n"
2372*4e366538SXin Li       : "+r"(src_ptr),                // %0
2373*4e366538SXin Li         "+r"(dst_ptr),                // %1
2374*4e366538SXin Li         "+r"(dst_width)               // %2
2375*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
2376*4e366538SXin Li         "m"(kShuffleSplitUV),         // %4
2377*4e366538SXin Li         "m"(kShuffleMergeUV)          // %5
2378*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2379*4e366538SXin Li }
2380*4e366538SXin Li #endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
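// Scalar sketch of one output UV pair (illustrative; s is row 0, t is row 1;
// the psrlw/pavgw sequence rounds the same way as (sum + 2) >> 2):
//   dst_ptr[0] = (s[0] + s[2] + t[0] + t[2] + 2) >> 2;  // U
//   dst_ptr[1] = (s[1] + s[3] + t[1] + t[3] + 2) >> 2;  // V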
2381*4e366538SXin Li 
2382*4e366538SXin Li #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
2383*4e366538SXin Li void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
2384*4e366538SXin Li                              ptrdiff_t src_stride,
2385*4e366538SXin Li                              uint8_t* dst_ptr,
2386*4e366538SXin Li                              int dst_width) {
2387*4e366538SXin Li   asm volatile(
2388*4e366538SXin Li       "vpcmpeqb    %%ymm4,%%ymm4,%%ymm4          \n"  // 01010101
2389*4e366538SXin Li       "vpsrlw      $0xf,%%ymm4,%%ymm4            \n"
2390*4e366538SXin Li       "vpackuswb   %%ymm4,%%ymm4,%%ymm4          \n"
2391*4e366538SXin Li       "vpxor       %%ymm5,%%ymm5,%%ymm5          \n"  // zero
2392*4e366538SXin Li       "vbroadcastf128 %4,%%ymm1                  \n"  // split shuffler
2393*4e366538SXin Li       "vbroadcastf128 %5,%%ymm3                  \n"  // merge shuffler
2394*4e366538SXin Li 
2395*4e366538SXin Li       LABELALIGN
2396*4e366538SXin Li       "1:                                        \n"
2397*4e366538SXin Li       "vmovdqu     (%0),%%ymm0                   \n"  // 16 UV row 0
2398*4e366538SXin Li       "vmovdqu     0x00(%0,%3,1),%%ymm2          \n"  // 16 UV row 1
2399*4e366538SXin Li       "lea         0x20(%0),%0                   \n"
2400*4e366538SXin Li       "vpshufb     %%ymm1,%%ymm0,%%ymm0          \n"  // uuuuvvvv
2401*4e366538SXin Li       "vpshufb     %%ymm1,%%ymm2,%%ymm2          \n"
2402*4e366538SXin Li       "vpmaddubsw  %%ymm4,%%ymm0,%%ymm0          \n"  // horizontal add
2403*4e366538SXin Li       "vpmaddubsw  %%ymm4,%%ymm2,%%ymm2          \n"
2404*4e366538SXin Li       "vpaddw      %%ymm2,%%ymm0,%%ymm0          \n"  // vertical add
2405*4e366538SXin Li       "vpsrlw      $0x1,%%ymm0,%%ymm0            \n"  // round
2406*4e366538SXin Li       "vpavgw      %%ymm5,%%ymm0,%%ymm0          \n"
2407*4e366538SXin Li       "vpshufb     %%ymm3,%%ymm0,%%ymm0          \n"  // merge uv
2408*4e366538SXin Li       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // combine qwords
2409*4e366538SXin Li       "vmovdqu     %%xmm0,(%1)                   \n"
2410*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 8 UV
2411*4e366538SXin Li       "sub         $0x8,%2                       \n"
2412*4e366538SXin Li       "jg          1b                            \n"
2413*4e366538SXin Li       "vzeroupper                                \n"
2414*4e366538SXin Li       : "+r"(src_ptr),                // %0
2415*4e366538SXin Li         "+r"(dst_ptr),                // %1
2416*4e366538SXin Li         "+r"(dst_width)               // %2
2417*4e366538SXin Li       : "r"((intptr_t)(src_stride)),  // %3
2418*4e366538SXin Li         "m"(kShuffleSplitUV),         // %4
2419*4e366538SXin Li         "m"(kShuffleMergeUV)          // %5
2420*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
2421*4e366538SXin Li }
2422*4e366538SXin Li #endif  // HAS_SCALEUVROWDOWN2BOX_AVX2
2423*4e366538SXin Li 
2424*4e366538SXin Li static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
2425*4e366538SXin Li                                       3, 1, 3, 1, 1, 3, 1, 3};
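// kUVLinearMadd31 drives pmaddubsw on interleaved UV data: each (3,1) or
// (1,3) byte pair multiplies the near/far samples of one channel, so a
// single multiply-add yields the 3*near+far term for U and V at once.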
2426*4e366538SXin Li 
2427*4e366538SXin Li #ifdef HAS_SCALEUVROWUP2_LINEAR_SSSE3
2428*4e366538SXin Li void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
2429*4e366538SXin Li                                 uint8_t* dst_ptr,
2430*4e366538SXin Li                                 int dst_width) {
2431*4e366538SXin Li   asm volatile(
2432*4e366538SXin Li       "pcmpeqw     %%xmm4,%%xmm4                 \n"
2433*4e366538SXin Li       "psrlw       $15,%%xmm4                    \n"
2434*4e366538SXin Li       "psllw       $1,%%xmm4                     \n"  // all 2
2435*4e366538SXin Li       "movdqa      %3,%%xmm3                     \n"
2436*4e366538SXin Li 
2437*4e366538SXin Li       LABELALIGN
2438*4e366538SXin Li       "1:                                        \n"
2439*4e366538SXin Li       "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
2440*4e366538SXin Li       "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
2441*4e366538SXin Li       "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
2442*4e366538SXin Li       "movdqa      %%xmm0,%%xmm2                 \n"
2443*4e366538SXin Li       "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
2444*4e366538SXin Li       "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
2445*4e366538SXin Li       "pmaddubsw   %%xmm3,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
2446*4e366538SXin Li       "pmaddubsw   %%xmm3,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)
2447*4e366538SXin Li       "paddw       %%xmm4,%%xmm0                 \n"  // 3*near+far+2 (lo)
2448*4e366538SXin Li       "paddw       %%xmm4,%%xmm2                 \n"  // 3*near+far+2 (hi)
2449*4e366538SXin Li       "psrlw       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
2450*4e366538SXin Li       "psrlw       $2,%%xmm2                     \n"  // 3/4*near+1/4*far (hi)
2451*4e366538SXin Li       "packuswb    %%xmm2,%%xmm0                 \n"
2452*4e366538SXin Li       "movdqu      %%xmm0,(%1)                   \n"
2453*4e366538SXin Li 
2454*4e366538SXin Li       "lea         0x8(%0),%0                    \n"
2455*4e366538SXin Li       "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
2456*4e366538SXin Li       "sub         $0x8,%2                       \n"
2457*4e366538SXin Li       "jg          1b                            \n"
2458*4e366538SXin Li       : "+r"(src_ptr),        // %0
2459*4e366538SXin Li         "+r"(dst_ptr),        // %1
2460*4e366538SXin Li         "+r"(dst_width)       // %2
2461*4e366538SXin Li       : "m"(kUVLinearMadd31)  // %3
2462*4e366538SXin Li       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
2463*4e366538SXin Li }
2464*4e366538SXin Li #endif
2465*4e366538SXin Li 
2466*4e366538SXin Li #ifdef HAS_SCALEUVROWUP2_BILINEAR_SSSE3
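// 2x upsample in both dimensions of interleaved UV.  The 3*near+far
// horizontal pass runs on two source rows, which are then blended 3:1
// vertically, giving the separable bilinear weights 9, 3, 3, 1 (sum 16,
// rounded with +8 before the shift).  Roughly, per channel (illustrative
// sketch only; s0/s1 are the two source rows):
//   above = (9 * s0[near] + 3 * s0[far] + 3 * s1[near] + s1[far] + 8) >> 4;
//   below = (3 * s0[near] + s0[far] + 9 * s1[near] + 3 * s1[far] + 8) >> 4;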
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  asm volatile(
      "pcmpeqw     %%xmm6,%%xmm6                 \n"
      "psrlw       $15,%%xmm6                    \n"
      "psllw       $3,%%xmm6                     \n"  // all 8
      "movdqa      %5,%%xmm7                     \n"

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 00112233 (1u1v)
      "movq        2(%0),%%xmm1                  \n"  // 11223344 (1u1v)
      "punpcklbw   %%xmm1,%%xmm0                 \n"  // 0101121223233434 (2u2v)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "punpckhdq   %%xmm0,%%xmm2                 \n"  // 2323232334343434 (2u2v)
      "punpckldq   %%xmm0,%%xmm0                 \n"  // 0101010112121212 (2u2v)
      "pmaddubsw   %%xmm7,%%xmm2                 \n"  // 3*near+far (1u1v16, hi)
      "pmaddubsw   %%xmm7,%%xmm0                 \n"  // 3*near+far (1u1v16, lo)

      "movq        (%0,%3),%%xmm1                \n"
      "movq        2(%0,%3),%%xmm4               \n"
      "punpcklbw   %%xmm4,%%xmm1                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "punpckhdq   %%xmm1,%%xmm3                 \n"
      "punpckldq   %%xmm1,%%xmm1                 \n"
      "pmaddubsw   %%xmm7,%%xmm3                 \n"  // 3*near+far (2, hi)
      "pmaddubsw   %%xmm7,%%xmm1                 \n"  // 3*near+far (2, lo)

      // xmm0 xmm2
      // xmm1 xmm3

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddw       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddw       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddw       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrlw       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)

      "movdqa      %%xmm1,%%xmm5                 \n"
      "paddw       %%xmm1,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddw       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddw       %%xmm1,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddw       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrlw       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)

      "movdqa      %%xmm2,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm2,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddw       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (2, hi)
      "paddw       %%xmm2,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddw       %%xmm1,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrlw       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)

      "movdqa      %%xmm3,%%xmm1                 \n"
      "paddw       %%xmm3,%%xmm1                 \n"  // 6*near+2*far (2, hi)
      "paddw       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (1, hi)
      "paddw       %%xmm3,%%xmm1                 \n"  // 9*near+3*far (2, hi)
      "paddw       %%xmm2,%%xmm1                 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrlw       $4,%%xmm1                     \n"  // ^ div by 16 (2, hi)

      "packuswb    %%xmm0,%%xmm4                 \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packuswb    %%xmm1,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4)                \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
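// AVX2 version of ScaleUVRowUp2_Linear_SSSE3: the same (3*near + far + 2) >> 2
// filter, widening 8 source UV pairs to 16 output UV pairs per iteration.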
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrlw      $15,%%ymm4,%%ymm4             \n"
      "vpsllw      $1,%%ymm4,%%ymm4              \n"  // all 2
      "vbroadcastf128 %3,%%ymm3                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"
      "vmovdqu     2(%0),%%xmm1                  \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm3,%%ymm2,%%ymm1          \n"  // 3*near+far (hi)
      "vpmaddubsw  %%ymm3,%%ymm0,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddw      %%ymm4,%%ymm1,%%ymm1          \n"  // 3*near+far+2 (hi)
      "vpsrlw      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrlw      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),        // %0
        "+r"(dst_ptr),        // %1
        "+r"(dst_width)       // %2
      : "m"(kUVLinearMadd31)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
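// AVX2 version of ScaleUVRowUp2_Bilinear_SSSE3: the same 9, 3, 3, 1 blend of
// two source rows, writing two destination rows of 16 UV pairs per iteration.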
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  asm volatile(
      "vpcmpeqw    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrlw      $15,%%ymm6,%%ymm6             \n"
      "vpsllw      $3,%%ymm6,%%ymm6              \n"  // all 8
      "vbroadcastf128 %5,%%ymm7                  \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"
      "vmovdqu     2(%0),%%xmm1                  \n"
      "vpermq      $0b11011000,%%ymm0,%%ymm0     \n"
      "vpermq      $0b11011000,%%ymm1,%%ymm1     \n"
      "vpunpcklbw  %%ymm1,%%ymm0,%%ymm0          \n"
      "vpunpckhdq  %%ymm0,%%ymm0,%%ymm2          \n"
      "vpunpckldq  %%ymm0,%%ymm0,%%ymm0          \n"
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm1          \n"  // 3*near+far (1, hi)
      "vpmaddubsw  %%ymm7,%%ymm0,%%ymm0          \n"  // 3*near+far (1, lo)

      "vmovdqu     (%0,%3),%%xmm2                \n"  // 0123456789ABCDEF
      "vmovdqu     2(%0,%3),%%xmm3               \n"  // 123456789ABCDEF0
      "vpermq      $0b11011000,%%ymm2,%%ymm2     \n"
      "vpermq      $0b11011000,%%ymm3,%%ymm3     \n"
      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
      "vpunpckhdq  %%ymm2,%%ymm2,%%ymm4          \n"
      "vpunpckldq  %%ymm2,%%ymm2,%%ymm2          \n"
      "vpmaddubsw  %%ymm7,%%ymm4,%%ymm3          \n"  // 3*near+far (2, hi)
      "vpmaddubsw  %%ymm7,%%ymm2,%%ymm2          \n"  // 3*near+far (2, lo)

      // ymm0 ymm1
      // ymm2 ymm3

      "vpaddw      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddw      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddw      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddw      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrlw      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)

      "vpaddw      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddw      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddw      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddw      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrlw      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)

      "vpaddw      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddw      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddw      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddw      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrlw      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)

      "vpaddw      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddw      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddw      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddw      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrlw      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)

      "vpackuswb   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackuswb   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4)                \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 8 uv to 16 uv
      "sub         $0x10,%2                      \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride)),  // %4
        "m"(kUVLinearMadd31)          // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
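// 16-bit version of the 2x linear UV upsample.  Words are zero-extended to
// dwords so 3*near + far + 2 cannot overflow, and pshufd $0b01001110 swaps
// the two UV dword pairs of a register so each near sample lines up with its
// far neighbor.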
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width) {
  asm volatile(
      "pxor        %%xmm5,%%xmm5                 \n"
      "pcmpeqd     %%xmm4,%%xmm4                 \n"
      "psrld       $31,%%xmm4                    \n"
      "pslld       $1,%%xmm4                     \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)

      "punpcklwd   %%xmm5,%%xmm0                 \n"  // 0011 (32b, 1u1v)
      "punpcklwd   %%xmm5,%%xmm1                 \n"  // 1122 (32b, 1u1v)

      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"

      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (lo, far)
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (hi, far)

      "paddd       %%xmm4,%%xmm2                 \n"  // far+2 (lo)
      "paddd       %%xmm4,%%xmm3                 \n"  // far+2 (hi)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far+2 (lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far+2 (hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far+2 (lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far+2 (hi)

      "psrld       $2,%%xmm0                     \n"  // 3/4*near+1/4*far (lo)
      "psrld       $2,%%xmm1                     \n"  // 3/4*near+1/4*far (hi)
      "packusdw    %%xmm1,%%xmm0                 \n"
      "movdqu      %%xmm0,(%1)                   \n"

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
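// 16-bit version of the 2x bilinear UV upsample: the same 9, 3, 3, 1 kernel,
// computed in 32-bit lanes.  Note the (%0,%3,2) and (%1,%4,2) addressing,
// which scales the strides by 2 for uint16_t elements.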
void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width) {
  asm volatile(
      "pxor        %%xmm7,%%xmm7                 \n"
      "pcmpeqd     %%xmm6,%%xmm6                 \n"
      "psrld       $31,%%xmm6                    \n"
      "pslld       $3,%%xmm6                     \n"  // all 8

      LABELALIGN
      "1:                                        \n"
      "movq        (%0),%%xmm0                   \n"  // 0011 (16b, 1u1v)
      "movq        4(%0),%%xmm1                  \n"  // 1122 (16b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm0                 \n"  // 0011 (near) (32b, 1u1v)
      "punpcklwd   %%xmm7,%%xmm1                 \n"  // 1122 (near) (32b, 1u1v)
      "movdqa      %%xmm0,%%xmm2                 \n"
      "movdqa      %%xmm1,%%xmm3                 \n"
      "pshufd      $0b01001110,%%xmm2,%%xmm2     \n"  // 1100 (far) (1, lo)
      "pshufd      $0b01001110,%%xmm3,%%xmm3     \n"  // 2211 (far) (1, hi)
      "paddd       %%xmm0,%%xmm2                 \n"  // near+far (1, lo)
      "paddd       %%xmm1,%%xmm3                 \n"  // near+far (1, hi)
      "paddd       %%xmm0,%%xmm0                 \n"  // 2*near (1, lo)
      "paddd       %%xmm1,%%xmm1                 \n"  // 2*near (1, hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 3*near+far (1, lo)
      "paddd       %%xmm3,%%xmm1                 \n"  // 3*near+far (1, hi)

      "movq        (%0,%3,2),%%xmm2              \n"
      "movq        4(%0,%3,2),%%xmm3             \n"
      "punpcklwd   %%xmm7,%%xmm2                 \n"
      "punpcklwd   %%xmm7,%%xmm3                 \n"
      "movdqa      %%xmm2,%%xmm4                 \n"
      "movdqa      %%xmm3,%%xmm5                 \n"
      "pshufd      $0b01001110,%%xmm4,%%xmm4     \n"  // 1100 (far) (2, lo)
      "pshufd      $0b01001110,%%xmm5,%%xmm5     \n"  // 2211 (far) (2, hi)
      "paddd       %%xmm2,%%xmm4                 \n"  // near+far (2, lo)
      "paddd       %%xmm3,%%xmm5                 \n"  // near+far (2, hi)
      "paddd       %%xmm2,%%xmm2                 \n"  // 2*near (2, lo)
      "paddd       %%xmm3,%%xmm3                 \n"  // 2*near (2, hi)
      "paddd       %%xmm4,%%xmm2                 \n"  // 3*near+far (2, lo)
      "paddd       %%xmm5,%%xmm3                 \n"  // 3*near+far (2, hi)

      "movdqa      %%xmm0,%%xmm4                 \n"
      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm0,%%xmm4                 \n"  // 6*near+2*far (1, lo)
      "paddd       %%xmm6,%%xmm5                 \n"  // 3*near+far+8 (2, lo)
      "paddd       %%xmm0,%%xmm4                 \n"  // 9*near+3*far (1, lo)
      "paddd       %%xmm5,%%xmm4                 \n"  // 9 3 3 1 + 8 (1, lo)
      "psrld       $4,%%xmm4                     \n"  // ^ div by 16 (1, lo)

      "movdqa      %%xmm2,%%xmm5                 \n"
      "paddd       %%xmm2,%%xmm5                 \n"  // 6*near+2*far (2, lo)
      "paddd       %%xmm6,%%xmm0                 \n"  // 3*near+far+8 (1, lo)
      "paddd       %%xmm2,%%xmm5                 \n"  // 9*near+3*far (2, lo)
      "paddd       %%xmm0,%%xmm5                 \n"  // 9 3 3 1 + 8 (2, lo)
      "psrld       $4,%%xmm5                     \n"  // ^ div by 16 (2, lo)

      "movdqa      %%xmm1,%%xmm0                 \n"
      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm1,%%xmm0                 \n"  // 6*near+2*far (1, hi)
      "paddd       %%xmm6,%%xmm2                 \n"  // 3*near+far+8 (2, hi)
      "paddd       %%xmm1,%%xmm0                 \n"  // 9*near+3*far (1, hi)
      "paddd       %%xmm2,%%xmm0                 \n"  // 9 3 3 1 + 8 (1, hi)
      "psrld       $4,%%xmm0                     \n"  // ^ div by 16 (1, hi)

      "movdqa      %%xmm3,%%xmm2                 \n"
      "paddd       %%xmm3,%%xmm2                 \n"  // 6*near+2*far (2, hi)
      "paddd       %%xmm6,%%xmm1                 \n"  // 3*near+far+8 (1, hi)
      "paddd       %%xmm3,%%xmm2                 \n"  // 9*near+3*far (2, hi)
      "paddd       %%xmm1,%%xmm2                 \n"  // 9 3 3 1 + 8 (2, hi)
      "psrld       $4,%%xmm2                     \n"  // ^ div by 16 (2, hi)

      "packusdw    %%xmm0,%%xmm4                 \n"
      "movdqu      %%xmm4,(%1)                   \n"  // store above
      "packusdw    %%xmm2,%%xmm5                 \n"
      "movdqu      %%xmm5,(%1,%4,2)              \n"  // store below

      "lea         0x8(%0),%0                    \n"
      "lea         0x10(%1),%1                   \n"  // 2 uv to 4 uv
      "sub         $0x4,%2                       \n"
      "jg          1b                            \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
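// AVX2 version of ScaleUVRowUp2_Linear_16_SSE41: vpmovzxwd widens 8 words to
// 8 dwords at a time, doubling 4 source UV pairs to 8 output UV pairs per
// iteration.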
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm4,%%ymm4,%%ymm4          \n"
      "vpsrld      $31,%%ymm4,%%ymm4             \n"
      "vpslld      $1,%%ymm4,%%ymm4              \n"  // all 2

      LABELALIGN
      "1:                                        \n"
      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)

      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)

      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)

      "vpaddd      %%ymm4,%%ymm2,%%ymm2          \n"  // far+2 (lo)
      "vpaddd      %%ymm4,%%ymm3,%%ymm3          \n"  // far+2 (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far+2 (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far+2 (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far+2 (hi)

      "vpsrld      $2,%%ymm0,%%ymm0              \n"  // 3/4*near+1/4*far (lo)
      "vpsrld      $2,%%ymm1,%%ymm1              \n"  // 3/4*near+1/4*far (hi)
      "vpackusdw   %%ymm1,%%ymm0,%%ymm0          \n"
      "vmovdqu     %%ymm0,(%1)                   \n"

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
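// AVX2 version of ScaleUVRowUp2_Bilinear_16_SSE41: the 9, 3, 3, 1 kernel in
// 32-bit lanes, writing two destination rows of 8 UV pairs per iteration.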
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  asm volatile(
      "vpcmpeqd    %%ymm6,%%ymm6,%%ymm6          \n"
      "vpsrld      $31,%%ymm6,%%ymm6             \n"
      "vpslld      $3,%%ymm6,%%ymm6              \n"  // all 8

      LABELALIGN
      "1:                                        \n"

      "vmovdqu     (%0),%%xmm0                   \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0),%%xmm1                  \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm0,%%ymm0                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm1,%%ymm1                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm0,%%ymm2     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm1,%%ymm3     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm0,%%ymm2,%%ymm2          \n"  // near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm3          \n"  // near+far (hi)
      "vpaddd      %%ymm0,%%ymm0,%%ymm0          \n"  // 2*near (lo)
      "vpaddd      %%ymm1,%%ymm1,%%ymm1          \n"  // 2*near (hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm1,%%ymm3,%%ymm1          \n"  // 3*near+far (hi)

      "vmovdqu     (%0,%3,2),%%xmm2              \n"  // 00112233 (16b, 1u1v)
      "vmovdqu     4(%0,%3,2),%%xmm3             \n"  // 11223344 (16b, 1u1v)
      "vpmovzxwd   %%xmm2,%%ymm2                 \n"  // 01234567 (32b, 1u1v)
      "vpmovzxwd   %%xmm3,%%ymm3                 \n"  // 12345678 (32b, 1u1v)
      "vpshufd     $0b01001110,%%ymm2,%%ymm4     \n"  // 11003322 (lo, far)
      "vpshufd     $0b01001110,%%ymm3,%%ymm5     \n"  // 22114433 (hi, far)
      "vpaddd      %%ymm2,%%ymm4,%%ymm4          \n"  // near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm5          \n"  // near+far (hi)
      "vpaddd      %%ymm2,%%ymm2,%%ymm2          \n"  // 2*near (lo)
      "vpaddd      %%ymm3,%%ymm3,%%ymm3          \n"  // 2*near (hi)
      "vpaddd      %%ymm2,%%ymm4,%%ymm2          \n"  // 3*near+far (lo)
      "vpaddd      %%ymm3,%%ymm5,%%ymm3          \n"  // 3*near+far (hi)

      "vpaddd      %%ymm0,%%ymm0,%%ymm4          \n"  // 6*near+2*far (1, lo)
      "vpaddd      %%ymm6,%%ymm2,%%ymm5          \n"  // 3*near+far+8 (2, lo)
      "vpaddd      %%ymm4,%%ymm0,%%ymm4          \n"  // 9*near+3*far (1, lo)
      "vpaddd      %%ymm4,%%ymm5,%%ymm4          \n"  // 9 3 3 1 + 8 (1, lo)
      "vpsrld      $4,%%ymm4,%%ymm4              \n"  // ^ div by 16 (1, lo)

      "vpaddd      %%ymm2,%%ymm2,%%ymm5          \n"  // 6*near+2*far (2, lo)
      "vpaddd      %%ymm6,%%ymm0,%%ymm0          \n"  // 3*near+far+8 (1, lo)
      "vpaddd      %%ymm5,%%ymm2,%%ymm5          \n"  // 9*near+3*far (2, lo)
      "vpaddd      %%ymm5,%%ymm0,%%ymm5          \n"  // 9 3 3 1 + 8 (2, lo)
      "vpsrld      $4,%%ymm5,%%ymm5              \n"  // ^ div by 16 (2, lo)

      "vpaddd      %%ymm1,%%ymm1,%%ymm0          \n"  // 6*near+2*far (1, hi)
      "vpaddd      %%ymm6,%%ymm3,%%ymm2          \n"  // 3*near+far+8 (2, hi)
      "vpaddd      %%ymm0,%%ymm1,%%ymm0          \n"  // 9*near+3*far (1, hi)
      "vpaddd      %%ymm0,%%ymm2,%%ymm0          \n"  // 9 3 3 1 + 8 (1, hi)
      "vpsrld      $4,%%ymm0,%%ymm0              \n"  // ^ div by 16 (1, hi)

      "vpaddd      %%ymm3,%%ymm3,%%ymm2          \n"  // 6*near+2*far (2, hi)
      "vpaddd      %%ymm6,%%ymm1,%%ymm1          \n"  // 3*near+far+8 (1, hi)
      "vpaddd      %%ymm2,%%ymm3,%%ymm2          \n"  // 9*near+3*far (2, hi)
      "vpaddd      %%ymm2,%%ymm1,%%ymm2          \n"  // 9 3 3 1 + 8 (2, hi)
      "vpsrld      $4,%%ymm2,%%ymm2              \n"  // ^ div by 16 (2, hi)

      "vpackusdw   %%ymm0,%%ymm4,%%ymm4          \n"
      "vmovdqu     %%ymm4,(%1)                   \n"  // store above
      "vpackusdw   %%ymm2,%%ymm5,%%ymm5          \n"
      "vmovdqu     %%ymm5,(%1,%4,2)              \n"  // store below

      "lea         0x10(%0),%0                   \n"
      "lea         0x20(%1),%1                   \n"  // 4 uv to 8 uv
      "sub         $0x8,%2                       \n"
      "jg          1b                            \n"
      "vzeroupper                                \n"
      : "+r"(src_ptr),                // %0
        "+r"(dst_ptr),                // %1
        "+r"(dst_width)               // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif